diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 5d3212dec7..c4db6a3ada 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -713,6 +713,23 @@ above). + + + + Alter the system call filter applied to containers. Takes a space-separated list of system call + names or group names (the latter prefixed with @, as listed by the + syscall-filter command of systemd-analyze1). Passed + system calls will be permitted. The list may optionally be prefixed by ~, in which case all + listed system calls are prohibited. If this command line option is used multiple times the configured lists are + combined. If both a positive and a negative list (that is one system call list without and one with the + ~ prefix) are configured, the positive list takes precedence over the negative list. Note + that systemd-nspawn always implements a system call blacklist (as opposed to a whitelist), + and this command line option hence adds or removes entries from the default blacklist, depending on the + ~ prefix. Note that the applied system call filter is also altered implicitly if additional + capabilities are passed using --capability=. + + diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml index 4f3f052911..58024a071d 100644 --- a/man/systemd.nspawn.xml +++ b/man/systemd.nspawn.xml @@ -274,11 +274,21 @@ NotifyReady= - Configures support for notifications from the container's init process. - This is equivalent to use command line switch, - and takes the same options. See systemd-nspawn1 - for details about the specific options supported. + Configures support for notifications from the container's init process. This is equivalent to + the --notify-ready= command line switch, and takes the same parameters. See + systemd-nspawn1 for details + about the specific options supported. + + + SystemCallFilter= + + Configures the system call filter applied to containers. This is equivalent to the + --system-call-filter= command line switch, and takes the same list parameter. 
See + systemd-nspawn1 for + details. + + diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index e5fdf63162..b61b347ee7 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -29,6 +29,7 @@ Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, Exec.PivotRoot, config_parse_pivot_root, 0, 0 Exec.PrivateUsers, config_parse_private_users, 0, 0 Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready) +Exec.SystemCallFilter, config_parse_syscall_filter, 0, 0 Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) Files.Bind, config_parse_bind, 0, 0 diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index 25851401f3..a6f7a7dabc 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -33,13 +33,16 @@ #include "seccomp-util.h" #endif #include "string-util.h" +#include "strv.h" #ifdef HAVE_SECCOMP static int seccomp_add_default_syscall_filter( scmp_filter_ctx ctx, uint32_t arch, - uint64_t cap_list_retain) { + uint64_t cap_list_retain, + char **syscall_whitelist, + char **syscall_blacklist) { static const struct { uint64_t capability; @@ -67,12 +70,13 @@ static int seccomp_add_default_syscall_filter( int r, c = 0; size_t i; + char **p; for (i = 0; i < ELEMENTSOF(blacklist); i++) { if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability))) continue; - r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM)); + r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist); if (r < 0) /* If the system call is not known on this architecture, then that's fine, let's ignore it */ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name); @@ -80,15 +84,23 @@ static int seccomp_add_default_syscall_filter( c++; } + STRV_FOREACH(p, 
syscall_blacklist) { + r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist); + if (r < 0) + log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p); + else + c++; + } + return c; } -int setup_seccomp(uint64_t cap_list_retain) { +int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) { uint32_t arch; int r; if (!is_seccomp_available()) { - log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter"); + log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filtering"); return 0; } @@ -102,7 +114,7 @@ int setup_seccomp(uint64_t cap_list_retain) { if (r < 0) return log_error_errno(r, "Failed to allocate seccomp object: %m"); - n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain); + n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); if (n < 0) return n; @@ -141,7 +153,7 @@ int setup_seccomp(uint64_t cap_list_retain) { #else -int setup_seccomp(uint64_t cap_list_retain) { +int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) { return 0; } diff --git a/src/nspawn/nspawn-seccomp.h b/src/nspawn/nspawn-seccomp.h index 5bde16faf9..5cf5ad1e14 100644 --- a/src/nspawn/nspawn-seccomp.h +++ b/src/nspawn/nspawn-seccomp.h @@ -21,4 +21,4 @@ #include -int setup_seccomp(uint64_t cap_list_retain); +int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist); diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 5217d10665..c02c1ea697 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -93,6 +93,8 @@ Settings* settings_free(Settings *s) { free(s->pivot_root_new); free(s->pivot_root_old); free(s->working_directory); + strv_free(s->syscall_whitelist); + strv_free(s->syscall_blacklist); strv_free(s->network_interfaces); 
strv_free(s->network_macvlan); @@ -568,3 +570,51 @@ int config_parse_private_users( return 0; } + +int config_parse_syscall_filter( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + bool negative; + const char *items; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + negative = rvalue[0] == '~'; + items = negative ? rvalue + 1 : rvalue; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&items, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse SystemCallFilter= parameter %s, ignoring: %m", rvalue); + return 0; + } + + if (negative) + r = strv_extend(&settings->syscall_blacklist, word); + else + r = strv_extend(&settings->syscall_whitelist, word); + if (r < 0) + return log_oom(); + } + + return 0; +} diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index 021403258f..75d68ce4cf 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -58,7 +58,8 @@ typedef enum SettingsMask { SETTING_USERNS = 1 << 13, SETTING_NOTIFY_READY = 1 << 14, SETTING_PIVOT_ROOT = 1 << 15, - _SETTINGS_MASK_ALL = (1 << 16) -1 + SETTING_SYSCALL_FILTER = 1 << 16, + _SETTINGS_MASK_ALL = (1 << 17) -1 } SettingsMask; typedef struct Settings { @@ -78,6 +79,8 @@ typedef struct Settings { UserNamespaceMode userns_mode; uid_t uid_shift, uid_range; bool notify_ready; + char **syscall_whitelist; + char **syscall_blacklist; /* [Image] */ int read_only; @@ -121,3 +124,4 @@ int config_parse_network_zone(const char *unit, const char *filename, unsigned l int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void 
*data, void *userdata); int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_syscall_filter(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 24a3da68ca..cf804ed1b3 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -208,6 +208,8 @@ static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO; static void *arg_root_hash = NULL; static size_t arg_root_hash_size = 0; +static char **arg_syscall_whitelist = NULL; +static char **arg_syscall_blacklist = NULL; static void help(void) { printf("%s [OPTIONS...] 
[PATH] [ARGUMENTS...]\n\n" @@ -267,6 +269,8 @@ static void help(void) { " --capability=CAP In addition to the default, retain specified\n" " capability\n" " --drop-capability=CAP Drop the specified capability from the default set\n" + " --system-call-filter=LIST|~LIST\n" + " Permit/prohibit specific system calls\n" " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n" " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n" " host, try-guest, try-host\n" @@ -431,6 +435,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_PRIVATE_USERS_CHOWN, ARG_NOTIFY_READY, ARG_ROOT_HASH, + ARG_SYSTEM_CALL_FILTER, }; static const struct option options[] = { @@ -482,6 +487,7 @@ static int parse_argv(int argc, char *argv[]) { { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT }, { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, { "root-hash", required_argument, NULL, ARG_ROOT_HASH }, + { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER }, {} }; @@ -1051,6 +1057,36 @@ static int parse_argv(int argc, char *argv[]) { break; } + case ARG_SYSTEM_CALL_FILTER: { + bool negative; + const char *items; + + negative = optarg[0] == '~'; + items = negative ? 
optarg + 1 : optarg; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&items, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to parse system call filter: %m"); + + if (negative) + r = strv_extend(&arg_syscall_blacklist, word); + else + r = strv_extend(&arg_syscall_whitelist, word); + if (r < 0) + return log_oom(); + } + + arg_settings_mask |= SETTING_SYSCALL_FILTER; + break; + } + case '?': return -EINVAL; @@ -2606,7 +2642,7 @@ static int outer_child( if (r < 0) return r; - r = setup_seccomp(arg_caps_retain); + r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist); if (r < 0) return r; @@ -3111,6 +3147,21 @@ static int load_settings(void) { if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0) arg_notify_ready = settings->notify_ready; + if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) { + + if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist)) /* an untrusted file must not widen the filter, so test the list it supplied */ + log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p); + else { + strv_free(arg_syscall_whitelist); + strv_free(arg_syscall_blacklist); + + arg_syscall_whitelist = settings->syscall_whitelist; + arg_syscall_blacklist = settings->syscall_blacklist; + + settings->syscall_whitelist = settings->syscall_blacklist = NULL; + } + } + return 0; } diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 1215f714f1..643dde6c4a 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -682,14 +682,17 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) { return NULL; } -static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); +static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude); -int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action) { 
+int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) { int r; assert(seccomp); assert(name); + if (strv_contains(exclude, name)) + return 0; + if (name[0] == '@') { const SyscallFilterSet *other; @@ -697,7 +700,7 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, if (!other) return -EINVAL; - r = seccomp_add_syscall_filter_set(seccomp, other, action); + r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude); if (r < 0) return r; } else { @@ -719,7 +722,8 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, static int seccomp_add_syscall_filter_set( scmp_filter_ctx seccomp, const SyscallFilterSet *set, - uint32_t action) { + uint32_t action, + char **exclude) { const char *sys; int r; @@ -728,7 +732,7 @@ static int seccomp_add_syscall_filter_set( assert(set); NULSTR_FOREACH(sys, set->value) { - r = seccomp_add_syscall_filter_item(seccomp, sys, action); + r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude); if (r < 0) return r; } @@ -754,7 +758,7 @@ int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilter if (r < 0) return r; - r = seccomp_add_syscall_filter_set(seccomp, set, action); + r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL); if (r < 0) { log_debug_errno(r, "Failed to add filter set, ignoring: %m"); continue; diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 894c53e6fd..c1612f5894 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -69,7 +69,7 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name); int seccomp_filter_set_add(Set *s, bool b, const SyscallFilterSet *set); -int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action); +int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action, char **exclude); int 
seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action); int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);