diff --git a/NEWS b/NEWS index 78c44db4a6..0592e697bb 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,22 @@ systemd System and Service Manager CHANGES WITH 243 in spe: + * Previously, filters defined with SystemCallFilter= would have the + effect that an calling an offending system call would terminate the + calling thread. This behaviour never made much sense, since killing + individual threads of unexpecting processes is likely to create more + problems than it solves. With this release the default action changed + from killing the thread to killing the whole process. For this to + work correctly both a kernel version (>= 4.14) and a libseccomp + version (>= 2.4.0) supporting this new seccomp action is required. If + an older kernel or libseccomp is used the old behaviour continues to + be used. This change does not affect any services that have no system + call filters defined, or that use SystemCallErrorNumber= (and thus + see EPERM or another error instead of being killed when calling an + offending system call). Note that systemd documentation always + claimed that the whole process is killed. With this change behaviour + is thus adjusted to match the documentation. + * The "kernel.pid_max" sysctl is now bumped to 4194304 by default, i.e. the full 22bit range the kernel allows, up from the old 16bit range. This should improve security and robustness a bit, as PID diff --git a/src/core/execute.c b/src/core/execute.c index 9975de1ff5..e90c3ac4f3 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1439,7 +1439,7 @@ static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ if (skip_seccomp_unavailable(u, "SystemCallFilter=")) return 0; - negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno); + negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno); if (c->syscall_whitelist) { default_action = negative_action; diff --git a/src/nspawn/nspawn-oci.c b/src/nspawn/nspawn-oci.c index 97323f31dd..b00ff289a6 100644 --- a/src/nspawn/nspawn-oci.c +++ b/src/nspawn/nspawn-oci.c @@ -1656,13 +1656,19 @@ static int oci_seccomp_action_from_string(const char *name, uint32_t *ret) { const char *name; uint32_t action; } table[] = { - { "SCMP_ACT_ALLOW", SCMP_ACT_ALLOW }, - { "SCMP_ACT_ERRNO", SCMP_ACT_ERRNO(EPERM) }, /* the OCI spec doesn't document the error, but it appears EPERM is supposed to be used */ - { "SCMP_ACT_KILL", SCMP_ACT_KILL }, -#ifdef SCMP_ACT_LOG - { "SCMP_ACT_LOG", SCMP_ACT_LOG }, + { "SCMP_ACT_ALLOW", SCMP_ACT_ALLOW }, + { "SCMP_ACT_ERRNO", SCMP_ACT_ERRNO(EPERM) }, /* the OCI spec doesn't document the error, but it appears EPERM is supposed to be used */ + { "SCMP_ACT_KILL", SCMP_ACT_KILL }, +#ifdef SCMP_ACT_KILL_PROCESS + { "SCMP_ACT_KILL_PROCESS", SCMP_ACT_KILL_PROCESS }, #endif - { "SCMP_ACT_TRAP", SCMP_ACT_TRAP }, +#ifdef SCMP_ACT_KILL_THREAD + { "SCMP_ACT_KILL_THREAD", SCMP_ACT_KILL_THREAD }, +#endif +#ifdef SCMP_ACT_LOG + { "SCMP_ACT_LOG", SCMP_ACT_LOG }, +#endif + { "SCMP_ACT_TRAP", SCMP_ACT_TRAP }, /* We don't support SCMP_ACT_TRACE because that requires a tracer, and that doesn't really make sense * here */ diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 95e46a6aa4..72920ee7df 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -1964,3 +1964,18 @@ int seccomp_restrict_suid_sgid(void) { return 0; } + +uint32_t scmp_act_kill_process(void) { + + /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never + * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of + * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least + * for single-threaded apps does the right thing. */ + +#ifdef SCMP_ACT_KILL_PROCESS + if (seccomp_api_get() >= 3) + return SCMP_ACT_KILL_PROCESS; +#endif + + return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */ +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 2566d2d17f..1729dc1b6e 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -104,3 +104,5 @@ extern const uint32_t seccomp_local_archs[]; DEFINE_TRIVIAL_CLEANUP_FUNC(scmp_filter_ctx, seccomp_release); int parse_syscall_archs(char **l, Set **archs); + +uint32_t scmp_act_kill_process(void); diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 9f1cb0ca38..7f8ea2be98 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -33,6 +33,12 @@ static bool can_unshare; typedef void (*test_function_t)(Manager *m); +static int cld_dumped_to_killed(int code) { + /* Depending on the system, seccomp version, … some signals might result in dumping, others in plain + * killing. Let's ignore the difference here, and map both cases to CLD_KILLED */ + return code == CLD_DUMPED ? CLD_KILLED : code; +} + static void check(const char *func, Manager *m, Unit *unit, int status_expected, int code_expected) { Service *service = NULL; usec_t ts; @@ -62,18 +68,20 @@ static void check(const char *func, Manager *m, Unit *unit, int status_expected, } } exec_status_dump(&service->main_exec_status, stdout, "\t"); + + if (cld_dumped_to_killed(service->main_exec_status.code) != cld_dumped_to_killed(code_expected)) { + log_error("%s: %s: exit code %d, expected %d", + func, unit->id, + service->main_exec_status.code, code_expected); + abort(); + } + if (service->main_exec_status.status != status_expected) { log_error("%s: %s: exit status %d, expected %d", func, unit->id, service->main_exec_status.status, status_expected); abort(); } - if (service->main_exec_status.code != code_expected) { - log_error("%s: %s: exit code %d, expected %d", - func, unit->id, - service->main_exec_status.code, code_expected); - abort(); - } } static bool check_nobody_user_and_group(void) { diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c index 9b7307cf39..a906070f9a 100644 --- a/src/test/test-seccomp.c +++ b/src/test/test-seccomp.c @@ -635,7 +635,7 @@ static void test_load_syscall_filter_set_raw(void) { assert_se(access("/", F_OK) >= 0); assert_se(poll(NULL, 0, 0) == 0); - assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, SCMP_ACT_KILL, true) >= 0); + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, scmp_act_kill_process(), true) >= 0); assert_se(access("/", F_OK) >= 0); assert_se(poll(NULL, 0, 0) == 0); diff --git a/test/test-execute/exec-systemcallfilter-failing.service b/test/test-execute/exec-systemcallfilter-failing.service index bcebc99507..996f859217 100644 --- a/test/test-execute/exec-systemcallfilter-failing.service +++ b/test/test-execute/exec-systemcallfilter-failing.service @@ -4,6 +4,7 @@ Description=Test for SystemCallFilter [Service] ExecStart=/bin/sh -c 'echo "This should not be seen"' Type=oneshot +LimitCORE=0 SystemCallFilter=ioperm SystemCallFilter=~ioperm SystemCallFilter=ioperm diff --git a/test/test-execute/exec-systemcallfilter-failing2.service b/test/test-execute/exec-systemcallfilter-failing2.service index 2fdc0ed772..c74f42248b 100644 --- a/test/test-execute/exec-systemcallfilter-failing2.service +++ b/test/test-execute/exec-systemcallfilter-failing2.service @@ -4,4 +4,5 @@ Description=Test for SystemCallFilter [Service] ExecStart=/bin/sh -c 'echo "This should not be seen"' Type=oneshot +LimitCORE=0 SystemCallFilter=~write open execve exit_group close mmap munmap fstat DONOTEXIST diff --git a/units/system-update-cleanup.service b/units/system-update-cleanup.service index d5eca2546b..41abcd631c 100644 --- a/units/system-update-cleanup.service +++ b/units/system-update-cleanup.service @@ -8,7 +8,7 @@ # (at your option) any later version. [Unit] -Description=Remove the Offline System Updates symlink +Description=Remove the Offline System Updates Symlink Documentation=man:systemd.special(7) man:systemd.offline-updates(7) After=system-update.target DefaultDependencies=no