diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml
index 5d3212dec7..c4db6a3ada 100644
--- a/man/systemd-nspawn.xml
+++ b/man/systemd-nspawn.xml
@@ -713,6 +713,23 @@
above).
+
+
+
+ Alter the system call filter applied to containers. Takes a space-separated list of system call
+ names or group names (the latter prefixed with @, as listed by the
+ syscall-filter command of systemd-analyze(1)). Passed
+ system calls will be permitted. The list may optionally be prefixed by ~, in which case all
+ listed system calls are prohibited. If this command line option is used multiple times the configured lists are
+ combined. If both a positive and a negative list (that is one system call list without and one with the
+ ~ prefix) are configured, the positive list takes precedence over the negative list. Note
+ that systemd-nspawn always implements a system call blacklist (as opposed to a whitelist),
+ and this command line option hence adds or removes entries from the default blacklist, depending on the
+ ~ prefix. Note that the applied system call filter is also altered implicitly if additional
+ capabilities are passed using the --capability= switch.
+
+
diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml
index 4f3f052911..58024a071d 100644
--- a/man/systemd.nspawn.xml
+++ b/man/systemd.nspawn.xml
@@ -274,11 +274,21 @@
NotifyReady=
- Configures support for notifications from the container's init process.
- This is equivalent to use command line switch,
- and takes the same options. See systemd-nspawn1
- for details about the specific options supported.
+ Configures support for notifications from the container's init process. This is equivalent to
+ the --notify-ready= command line switch, and takes the same parameters. See
+ systemd-nspawn(1) for details
+ about the specific options supported.
+
+
+ SystemCallFilter=
+
+ Configures the system call filter applied to containers. This is equivalent to the
+ --system-call-filter= command line switch, and takes the same list parameter. See
+ systemd-nspawn(1) for
+ details.
+
+
diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf
index e5fdf63162..b61b347ee7 100644
--- a/src/nspawn/nspawn-gperf.gperf
+++ b/src/nspawn/nspawn-gperf.gperf
@@ -29,6 +29,7 @@ Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings,
Exec.PivotRoot, config_parse_pivot_root, 0, 0
Exec.PrivateUsers, config_parse_private_users, 0, 0
Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready)
+Exec.SystemCallFilter, config_parse_syscall_filter,0, 0,
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0
diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c
index 25851401f3..a6f7a7dabc 100644
--- a/src/nspawn/nspawn-seccomp.c
+++ b/src/nspawn/nspawn-seccomp.c
@@ -33,13 +33,16 @@
#include "seccomp-util.h"
#endif
#include "string-util.h"
+#include "strv.h"
#ifdef HAVE_SECCOMP
static int seccomp_add_default_syscall_filter(
scmp_filter_ctx ctx,
uint32_t arch,
- uint64_t cap_list_retain) {
+ uint64_t cap_list_retain,
+ char **syscall_whitelist,
+ char **syscall_blacklist) {
static const struct {
uint64_t capability;
@@ -67,12 +70,13 @@ static int seccomp_add_default_syscall_filter(
int r, c = 0;
size_t i;
+ char **p;
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
continue;
- r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM));
+ r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
if (r < 0)
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
@@ -80,15 +84,23 @@ static int seccomp_add_default_syscall_filter(
c++;
}
+ STRV_FOREACH(p, syscall_blacklist) {
+ r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
+ else
+ c++;
+ }
+
return c;
}
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
uint32_t arch;
int r;
if (!is_seccomp_available()) {
- log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
+ log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filtering"); /* bail out with 0 before any filter is built */
return 0;
}
@@ -102,7 +114,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
if (r < 0)
return log_error_errno(r, "Failed to allocate seccomp object: %m");
- n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain);
+ n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
if (n < 0)
return n;
@@ -141,7 +153,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
#else
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
return 0;
}
diff --git a/src/nspawn/nspawn-seccomp.h b/src/nspawn/nspawn-seccomp.h
index 5bde16faf9..5cf5ad1e14 100644
--- a/src/nspawn/nspawn-seccomp.h
+++ b/src/nspawn/nspawn-seccomp.h
@@ -21,4 +21,4 @@
#include
-int setup_seccomp(uint64_t cap_list_retain);
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist);
diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c
index 5217d10665..c02c1ea697 100644
--- a/src/nspawn/nspawn-settings.c
+++ b/src/nspawn/nspawn-settings.c
@@ -93,6 +93,8 @@ Settings* settings_free(Settings *s) {
free(s->pivot_root_new);
free(s->pivot_root_old);
free(s->working_directory);
+ strv_free(s->syscall_whitelist);
+ strv_free(s->syscall_blacklist);
strv_free(s->network_interfaces);
strv_free(s->network_macvlan);
@@ -568,3 +570,51 @@ int config_parse_private_users(
return 0;
}
+
+int config_parse_syscall_filter(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ /* Config parser for SystemCallFilter=: splits rvalue into words and appends them to the lists in Settings. */
+ Settings *settings = data;
+ bool negative;
+ const char *items;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ /* A leading '~' marks the whole list as negative (blacklist); skip the prefix before splitting into words. */
+ negative = rvalue[0] == '~';
+ items = negative ? rvalue + 1 : rvalue;
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&items, &word, NULL, 0);
+ if (r == 0) /* handled as "no more words": parsing is complete */
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) { /* any other failure is logged and swallowed rather than propagated */
+ log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse SystemCallFilter= parameter %s, ignoring: %m", rvalue);
+ return 0; /* NOTE(review): words appended before the failure stay in the list */
+ }
+ /* The lists are only ever extended here, so repeated assignments accumulate. */
+ if (negative)
+ r = strv_extend(&settings->syscall_blacklist, word);
+ else
+ r = strv_extend(&settings->syscall_whitelist, word);
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h
index 021403258f..75d68ce4cf 100644
--- a/src/nspawn/nspawn-settings.h
+++ b/src/nspawn/nspawn-settings.h
@@ -58,7 +58,8 @@ typedef enum SettingsMask {
SETTING_USERNS = 1 << 13,
SETTING_NOTIFY_READY = 1 << 14,
SETTING_PIVOT_ROOT = 1 << 15,
- _SETTINGS_MASK_ALL = (1 << 16) -1
+ SETTING_SYSCALL_FILTER = 1 << 16,
+ _SETTINGS_MASK_ALL = (1 << 17) -1
} SettingsMask;
typedef struct Settings {
@@ -78,6 +79,8 @@ typedef struct Settings {
UserNamespaceMode userns_mode;
uid_t uid_shift, uid_range;
bool notify_ready;
+ char **syscall_whitelist;
+ char **syscall_blacklist;
/* [Image] */
int read_only;
@@ -121,3 +124,4 @@ int config_parse_network_zone(const char *unit, const char *filename, unsigned l
int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
+int config_parse_syscall_filter(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 24a3da68ca..cf804ed1b3 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -208,6 +208,8 @@ static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
static void *arg_root_hash = NULL;
static size_t arg_root_hash_size = 0;
+static char **arg_syscall_whitelist = NULL;
+static char **arg_syscall_blacklist = NULL;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -267,6 +269,8 @@ static void help(void) {
" --capability=CAP In addition to the default, retain specified\n"
" capability\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
+ " --system-call-filter=LIST|~LIST\n"
+ " Permit/prohibit specific system calls\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
" host, try-guest, try-host\n"
@@ -431,6 +435,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_PRIVATE_USERS_CHOWN,
ARG_NOTIFY_READY,
ARG_ROOT_HASH,
+ ARG_SYSTEM_CALL_FILTER,
};
static const struct option options[] = {
@@ -482,6 +487,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
{ "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{ "root-hash", required_argument, NULL, ARG_ROOT_HASH },
+ { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
{}
};
@@ -1051,6 +1057,36 @@ static int parse_argv(int argc, char *argv[]) {
break;
}
+ case ARG_SYSTEM_CALL_FILTER: {
+ bool negative;
+ const char *items;
+ /* A leading '~' turns the argument into a negative list; strip it before splitting into words. */
+ negative = optarg[0] == '~';
+ items = negative ? optarg + 1 : optarg;
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&items, &word, NULL, 0);
+ if (r == 0) /* handled as "no more words" */
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) /* unlike config_parse_syscall_filter(), the error is returned to the caller */
+ return log_error_errno(r, "Failed to parse system call filter: %m");
+ /* The lists are only appended to, so repeating the option accumulates entries. */
+ if (negative)
+ r = strv_extend(&arg_syscall_blacklist, word);
+ else
+ r = strv_extend(&arg_syscall_whitelist, word);
+ if (r < 0)
+ return log_oom();
+ }
+ /* Record that the filter was given on the command line; load_settings() tests this bit before using SystemCallFilter=. */
+ arg_settings_mask |= SETTING_SYSCALL_FILTER;
+ break;
+ }
+
case '?':
return -EINVAL;
@@ -2606,7 +2642,7 @@ static int outer_child(
if (r < 0)
return r;
- r = setup_seccomp(arg_caps_retain);
+ r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
if (r < 0)
return r;
@@ -3111,6 +3147,21 @@ static int load_settings(void) {
if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
arg_notify_ready = settings->notify_ready;
+        if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
+                /* Use SystemCallFilter= from the settings file only if --system-call-filter= was not given. The check below must look at the file's whitelist: the arg_ list is only filled together with the mask bit, so it is always empty in this branch. */
+                if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist)) /* a whitelist re-permits blocked calls, so an untrusted file must not supply one */
+                        log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
+                else {
+                        strv_free(arg_syscall_whitelist);
+                        strv_free(arg_syscall_blacklist);
+                        /* Hand both lists over to the globals and clear the source pointers, since settings_free() would otherwise free them. */
+                        arg_syscall_whitelist = settings->syscall_whitelist;
+                        arg_syscall_blacklist = settings->syscall_blacklist;
+
+                        settings->syscall_whitelist = settings->syscall_blacklist = NULL;
+                }
+        }
+
return 0;
}
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index 1215f714f1..643dde6c4a 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -682,14 +682,17 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) {
return NULL;
}
-static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
+static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
-int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action) {
+int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
int r;
assert(seccomp);
assert(name);
+ if (strv_contains(exclude, name))
+ return 0;
+
if (name[0] == '@') {
const SyscallFilterSet *other;
@@ -697,7 +700,7 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name,
if (!other)
return -EINVAL;
- r = seccomp_add_syscall_filter_set(seccomp, other, action);
+ r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
if (r < 0)
return r;
} else {
@@ -719,7 +722,8 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name,
static int seccomp_add_syscall_filter_set(
scmp_filter_ctx seccomp,
const SyscallFilterSet *set,
- uint32_t action) {
+ uint32_t action,
+ char **exclude) {
const char *sys;
int r;
@@ -728,7 +732,7 @@ static int seccomp_add_syscall_filter_set(
assert(set);
NULSTR_FOREACH(sys, set->value) {
- r = seccomp_add_syscall_filter_item(seccomp, sys, action);
+ r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
if (r < 0)
return r;
}
@@ -754,7 +758,7 @@ int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilter
if (r < 0)
return r;
- r = seccomp_add_syscall_filter_set(seccomp, set, action);
+ r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
if (r < 0) {
log_debug_errno(r, "Failed to add filter set, ignoring: %m");
continue;
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
index 894c53e6fd..c1612f5894 100644
--- a/src/shared/seccomp-util.h
+++ b/src/shared/seccomp-util.h
@@ -69,7 +69,7 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name);
int seccomp_filter_set_add(Set *s, bool b, const SyscallFilterSet *set);
-int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action);
+int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action, char **exclude);
int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);