diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index fa6b965101..bf4f223a43 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2411,6 +2411,11 @@ RestrictNamespaces=~cgroup net done with the namespace specific unit setting such as PrivateNetwork= or PrivateMounts=. + Note that some namespace sandboxing options might entail mount namespace for private API VFS instances, + such as PrivatePIDs=, ProtectControlGroups=private/strict, or + PrivateNetwork=. If any of the mentioned options are enabled, mount namespace + is implicitly delegated. + diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 044a1da437..d9878e6088 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -3441,7 +3441,7 @@ static int apply_mount_namespace( /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the * service will need to write to it in order to start the notifications. */ - if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { + if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { read_write_paths_cleanup = strv_copy(context->read_write_paths); if (!read_write_paths_cleanup) return -ENOMEM; @@ -3586,7 +3586,7 @@ static int apply_mount_namespace( * sandbox inside the mount namespace. */ .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir, - .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO, + .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO, .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables, .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules, .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs, @@ -4205,9 +4205,8 @@ static void log_command_line( LOG_EXEC_INVOCATION_ID(params)); } -static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) { +static bool exec_context_needs_cap_sys_admin(const ExecContext *context) { assert(context); - assert(params); return context->private_users != PRIVATE_USERS_NO || context->private_tmp != PRIVATE_TMP_NO || @@ -4229,7 +4228,7 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const E context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || - exec_needs_cgroup_mount(context, params) || + exec_needs_cgroup_mount(context) || context->protect_clock || context->protect_hostname != PROTECT_HOSTNAME_NO || !strv_isempty(context->read_write_paths) || @@ -4270,13 +4269,23 @@ static bool exec_namespace_is_delegated( /* If we need unprivileged private users, we've already unshared a user namespace by the time we call * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace * unsharing in the first call to setup_delegated_namespaces() by returning false here. */ - if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) + if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) return false; if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL) return params->runtime_scope == RUNTIME_SCOPE_USER; - return FLAGS_SET(context->delegate_namespaces, namespace); + if (FLAGS_SET(context->delegate_namespaces, namespace)) + return true; + + /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when + * those are delegated mountns must be deferred too. + * + * The list should stay in sync with exec_needs_mount_namespace(). */ + if (namespace == CLONE_NEWNS) + return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET); + + return false; } static int setup_delegated_namespaces( @@ -4355,7 +4364,7 @@ static int setup_delegated_namespaces( log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring."); } - if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && + if (needs_sandboxing && exec_needs_cgroup_namespace(context) && exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) { if (unshare(CLONE_NEWCGROUP) < 0) { *reterr_exit_status = EXIT_NAMESPACE; @@ -5197,7 +5206,7 @@ int exec_invoke( * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory * pressure path environment variable or read-write mount to the unit. This is why we check if * memory_pressure_path != NULL in the conditional below. */ - if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) { + if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) { memory_pressure_path = mfree(memory_pressure_path); r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path); if (r < 0) { @@ -5364,7 +5373,7 @@ int exec_invoke( } } - if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) { + if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) { /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */ diff --git a/src/core/execute.c b/src/core/execute.c index e6a45e3c13..3bc80cd643 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -232,24 +232,18 @@ bool exec_needs_ipc_namespace(const ExecContext *context) { return context->private_ipc || context->ipc_namespace_path; } -static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) { - return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP); -} - static bool needs_cgroup_namespace(ProtectControlGroups i) { return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT); } -ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) { +ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) { assert(context); /* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually - * use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the - * setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded - * to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup. - * - * TODO: Remove fallback once cgroupv1 support is removed in v258. */ - if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) { + * use cgroup namespace, we ignore the setting and do not unshare the namespace. + * ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures + * that strict always gets a read-only mount of /sys/fs/cgroup/. */ + if (needs_cgroup_namespace(context->protect_control_groups) && !ns_type_supported(NAMESPACE_CGROUP)) { if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE) return PROTECT_CONTROL_GROUPS_NO; if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT) @@ -258,22 +252,22 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, return context->protect_control_groups; } -bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) { +bool exec_needs_cgroup_namespace(const ExecContext *context) { assert(context); - return needs_cgroup_namespace(exec_get_protect_control_groups(context, params)); + return needs_cgroup_namespace(exec_get_protect_control_groups(context)); } -bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) { +bool exec_needs_cgroup_mount(const ExecContext *context) { assert(context); - return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO; + return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO; } -bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) { +bool exec_is_cgroup_mount_read_only(const ExecContext *context) { assert(context); - return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT); + return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT); } bool exec_needs_pid_namespace(const ExecContext *context) { @@ -331,7 +325,7 @@ bool exec_needs_mount_namespace( context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || - exec_needs_cgroup_mount(context, params) || + exec_needs_cgroup_mount(context) || context->protect_proc != PROTECT_PROC_DEFAULT || context->proc_subset != PROC_SUBSET_ALL || exec_needs_ipc_namespace(context) || diff --git a/src/core/execute.h b/src/core/execute.h index af2d972406..78f04d5173 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -631,10 +631,11 @@ bool exec_needs_network_namespace(const ExecContext *context); bool exec_needs_ipc_namespace(const ExecContext *context); bool exec_needs_pid_namespace(const ExecContext *context); -ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params); -bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params); -bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params); -bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params); +ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context); +bool exec_needs_cgroup_namespace(const ExecContext *context); +bool exec_needs_cgroup_mount(const ExecContext *context); +bool exec_is_cgroup_mount_read_only(const ExecContext *context); + const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing); /* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters diff --git a/src/core/namespace.c b/src/core/namespace.c index 56a3f93c3e..7e131b1425 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -26,7 +26,6 @@ #include "loopback-setup.h" #include "missing_syscall.h" #include "mkdir-label.h" -#include "mount-setup.h" #include "mount-util.h" #include "mountpoint-util.h" #include "namespace-util.h" @@ -207,14 +206,14 @@ static const MountEntry protect_control_groups_yes_table[] = { }; /* ProtectControlGroups=private table. Note mount_private_apivfs() always use MS_NOSUID|MS_NOEXEC|MS_NODEV so - * flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */ + * flags is not set here. */ static const MountEntry protect_control_groups_private_table[] = { - { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" }, + { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false }, }; /* ProtectControlGroups=strict table */ static const MountEntry protect_control_groups_strict_table[] = { - { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" }, + { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true }, }; /* ProtectSystem=yes table */ @@ -338,7 +337,7 @@ static bool mount_entry_read_only(const MountEntry *p) { static bool mount_entry_noexec(const MountEntry *p) { assert(p); - return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS); + return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, MOUNT_PRIVATE_CGROUP2FS); } static bool mount_entry_exec(const MountEntry *p) { @@ -1320,16 +1319,6 @@ static int mount_private_apivfs( return r; r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); - if (r == -EINVAL && opts) - /* If this failed with EINVAL then this likely means either: - * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the - * per-instance hidepid= neither, which means we really don't want to use it, since it - * would affect our host's /proc mount. - * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP - * is supported. - * - * Hence let's gracefully fallback to a classic, unrestricted version. */ - r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL); if (ERRNO_IS_NEG_PRIVILEGE(r)) { /* When we do not have enough privileges to mount a new instance, fall back to use an * existing mount. */ @@ -1348,8 +1337,8 @@ static int mount_private_apivfs( return r; return 1; - - } else if (r < 0) + } + if (r < 0) return r; /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */ @@ -1375,18 +1364,9 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p } static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) { - _cleanup_free_ char *opts = NULL; - assert(m); assert(p); - - if (cgroupfs_recursiveprot_supported()) { - opts = strextend_with_separator(NULL, ",", mount_entry_options(m) ?: POINTER_MAX, "memory_recursiveprot"); - if (!opts) - return -ENOMEM; - } - - return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope); + return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope); } static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) { @@ -1414,14 +1394,14 @@ static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) { * fsopen()/fsconfig() was also backported on some distros which allows us to detect * hidepid=/subset= support in even more scenarios. */ - if (mount_option_supported("proc", "hidepid", hpv) != 0) { + if (mount_option_supported("proc", "hidepid", hpv) > 0) { opts = strjoin("hidepid=", hpv); if (!opts) return -ENOMEM; } if (p->proc_subset == PROC_SUBSET_PID && - mount_option_supported("proc", "subset", "pid") != 0) + mount_option_supported("proc", "subset", "pid") > 0) if (!strextend_with_separator(&opts, ",", "subset=pid")) return -ENOMEM; } diff --git a/src/shared/mount-setup.c b/src/shared/mount-setup.c index e7712fa274..ad2327e084 100644 --- a/src/shared/mount-setup.c +++ b/src/shared/mount-setup.c @@ -52,7 +52,7 @@ typedef struct MountPoint { MountMode mode; } MountPoint; -bool cgroupfs_recursiveprot_supported(void) { +static bool cgroupfs_recursiveprot_supported(void) { int r; /* Added in kernel 5.7 */ diff --git a/src/shared/mount-setup.h b/src/shared/mount-setup.h index c07fe86364..34de1dad0b 100644 --- a/src/shared/mount-setup.h +++ b/src/shared/mount-setup.h @@ -8,5 +8,3 @@ bool mount_point_ignore(const char *path); int mount_setup_early(void); int mount_setup(bool loaded_policy, bool leave_propagation); - -bool cgroupfs_recursiveprot_supported(void); diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index 9bd9691197..6d8d51caff 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -35,17 +35,12 @@ testcase_network() { systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2 } -testcase_cgroup() { - (! systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure') - systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private -p DelegateNamespaces=cgroup --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure' -} - testcase_pid() { # MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only # so we can't write to it when running in a container. if ! systemd-detect-virt --container; then (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid') - systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid' + systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid' fi }