From d4bdcecaf937922f7c515ab912d3cb84dc4b1ab7 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sun, 30 Mar 2025 18:45:27 +0200 Subject: [PATCH 1/5] TEST-07-PID1: remove bogus test case for DelegateNamespaces=cgroup We enable nsdelegate for cgroupfs, and hence the kernel would always refuse writes to /sys/fs/cgroup/cgroup.pressure and friends regardless of whether the cgns is owned by userns: https://github.com/torvalds/linux/blob/cb82ca153949c6204af793de24b18a04236e79fd/kernel/cgroup/cgroup.c#L4132 This currently works because the mountns (thus cgroupfs) remains to be non-delegated and we're actually operating on the real root cgroup. It appears that cgroupfs generally doesn't care about userns, so I'm yet to see a way to test this properly. Let's drop this for now, to unblock fixes in the following commits. --- test/units/TEST-07-PID1.delegate-namespaces.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index 9bd9691197..210635ebbc 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -35,11 +35,6 @@ testcase_network() { systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2 } -testcase_cgroup() { - (! systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure') - systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private -p DelegateNamespaces=cgroup --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure' -} - testcase_pid() { # MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only # so we can't write to it when running in a container. From 2b4cfbf91fdbe7c336fd374e78ae1dca5b342131 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sat, 29 Mar 2025 20:53:36 +0100 Subject: [PATCH 2/5] core/execute: drop unused function param and cg unified check for cgns While at it, remove TODO about assuming availability of cgns. We generally want to keep that optional still. --- src/core/exec-invoke.c | 17 ++++++++--------- src/core/execute.c | 30 ++++++++++++------------------ src/core/execute.h | 9 +++++---- 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 044a1da437..c926a808e0 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -3441,7 +3441,7 @@ static int apply_mount_namespace( /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the * service will need to write to it in order to start the notifications. */ - if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { + if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { read_write_paths_cleanup = strv_copy(context->read_write_paths); if (!read_write_paths_cleanup) return -ENOMEM; @@ -3586,7 +3586,7 @@ static int apply_mount_namespace( * sandbox inside the mount namespace. */ .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir, - .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO, + .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO, .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables, .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules, .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs, @@ -4205,9 +4205,8 @@ static void log_command_line( LOG_EXEC_INVOCATION_ID(params)); } -static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) { +static bool exec_context_needs_cap_sys_admin(const ExecContext *context) { assert(context); - assert(params); return context->private_users != PRIVATE_USERS_NO || context->private_tmp != PRIVATE_TMP_NO || @@ -4229,7 +4228,7 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const E context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || - exec_needs_cgroup_mount(context, params) || + exec_needs_cgroup_mount(context) || context->protect_clock || context->protect_hostname != PROTECT_HOSTNAME_NO || !strv_isempty(context->read_write_paths) || @@ -4270,7 +4269,7 @@ static bool exec_namespace_is_delegated( /* If we need unprivileged private users, we've already unshared a user namespace by the time we call * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace * unsharing in the first call to setup_delegated_namespaces() by returning false here. */ - if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) + if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) return false; if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL) @@ -4355,7 +4354,7 @@ static int setup_delegated_namespaces( log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring."); } - if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && + if (needs_sandboxing && exec_needs_cgroup_namespace(context) && exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) { if (unshare(CLONE_NEWCGROUP) < 0) { *reterr_exit_status = EXIT_NAMESPACE; @@ -5197,7 +5196,7 @@ int exec_invoke( * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory * pressure path environment variable or read-write mount to the unit. This is why we check if * memory_pressure_path != NULL in the conditional below. */ - if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) { + if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) { memory_pressure_path = mfree(memory_pressure_path); r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path); if (r < 0) { @@ -5364,7 +5363,7 @@ int exec_invoke( } } - if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) { + if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) { /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */ diff --git a/src/core/execute.c b/src/core/execute.c index e6a45e3c13..3bc80cd643 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -232,24 +232,18 @@ bool exec_needs_ipc_namespace(const ExecContext *context) { return context->private_ipc || context->ipc_namespace_path; } -static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) { - return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP); -} - static bool needs_cgroup_namespace(ProtectControlGroups i) { return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT); } -ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) { +ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) { assert(context); /* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually - * use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the - * setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded - * to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup. - * - * TODO: Remove fallback once cgroupv1 support is removed in v258. */ - if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) { + * use cgroup namespace, we ignore the setting and do not unshare the namespace. + * ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures + * that strict always gets a read-only mount of /sys/fs/cgroup/. */ + if (needs_cgroup_namespace(context->protect_control_groups) && !ns_type_supported(NAMESPACE_CGROUP)) { if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE) return PROTECT_CONTROL_GROUPS_NO; if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT) @@ -258,22 +252,22 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, return context->protect_control_groups; } -bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) { +bool exec_needs_cgroup_namespace(const ExecContext *context) { assert(context); - return needs_cgroup_namespace(exec_get_protect_control_groups(context, params)); + return needs_cgroup_namespace(exec_get_protect_control_groups(context)); } -bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) { +bool exec_needs_cgroup_mount(const ExecContext *context) { assert(context); - return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO; + return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO; } -bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) { +bool exec_is_cgroup_mount_read_only(const ExecContext *context) { assert(context); - return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT); + return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT); } bool exec_needs_pid_namespace(const ExecContext *context) { @@ -331,7 +325,7 @@ bool exec_needs_mount_namespace( context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || - exec_needs_cgroup_mount(context, params) || + exec_needs_cgroup_mount(context) || context->protect_proc != PROTECT_PROC_DEFAULT || context->proc_subset != PROC_SUBSET_ALL || exec_needs_ipc_namespace(context) || diff --git a/src/core/execute.h b/src/core/execute.h index af2d972406..78f04d5173 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -631,10 +631,11 @@ bool exec_needs_network_namespace(const ExecContext *context); bool exec_needs_ipc_namespace(const ExecContext *context); bool exec_needs_pid_namespace(const ExecContext *context); -ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params); -bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params); -bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params); -bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params); +ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context); +bool exec_needs_cgroup_namespace(const ExecContext *context); +bool exec_needs_cgroup_mount(const ExecContext *context); +bool exec_is_cgroup_mount_read_only(const ExecContext *context); + const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing); /* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters From 32b69b190b74c0e03416572dffa31b598511e33f Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sat, 29 Mar 2025 21:02:04 +0100 Subject: [PATCH 3/5] core: delegate mountns implicitly when any of pidns/cgns/netns is in use --- man/systemd.exec.xml | 5 +++++ src/core/exec-invoke.c | 12 +++++++++++- test/units/TEST-07-PID1.delegate-namespaces.sh | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index fa6b965101..bf4f223a43 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2411,6 +2411,11 @@ RestrictNamespaces=~cgroup net done with the namespace specific unit setting such as PrivateNetwork= or PrivateMounts=. + Note that some namespace sandboxing options might entail mount namespace for private API VFS instances, + such as PrivatePIDs=, ProtectControlGroups=private/strict, or + PrivateNetwork=. If any of the mentioned options are enabled, mount namespace + is implicitly delegated. + diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index c926a808e0..d9878e6088 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -4275,7 +4275,17 @@ static bool exec_namespace_is_delegated( if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL) return params->runtime_scope == RUNTIME_SCOPE_USER; - return FLAGS_SET(context->delegate_namespaces, namespace); + if (FLAGS_SET(context->delegate_namespaces, namespace)) + return true; + + /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when + * those are delegated mountns must be deferred too. + * + * The list should stay in sync with exec_needs_mount_namespace(). */ + if (namespace == CLONE_NEWNS) + return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET); + + return false; } static int setup_delegated_namespaces( diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index 210635ebbc..6d8d51caff 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -40,7 +40,7 @@ testcase_pid() { # so we can't write to it when running in a container. if ! systemd-detect-virt --container; then (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid') - systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid' + systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid' fi } From 1614d0c45190eed6cccae3b98856cefcd9f0bcbc Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sun, 16 Mar 2025 21:55:29 +0100 Subject: [PATCH 4/5] core/namespace: stop applying mount options on private cgroupfs mount We always unshare cgroup ns for ProtectControlGroups=private/strict, while the mount options only apply to the cgroupfs instance in initial cgns (c.f. https://github.com/torvalds/linux/blob/b69bb476dee99d564d65d418e9a20acca6f32c3f/kernel/cgroup/cgroup.c#L1984) Hence let's drop the thing wholesale. Also, as noted in the comment already, mount_private_apivfs() internally enforces nosuid/noexec, so drop explicit flags too. --- src/core/namespace.c | 20 +++++--------------- src/shared/mount-setup.c | 2 +- src/shared/mount-setup.h | 2 -- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index 56a3f93c3e..aecc827797 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -26,7 +26,6 @@ #include "loopback-setup.h" #include "missing_syscall.h" #include "mkdir-label.h" -#include "mount-setup.h" #include "mount-util.h" #include "mountpoint-util.h" #include "namespace-util.h" @@ -207,14 +206,14 @@ static const MountEntry protect_control_groups_yes_table[] = { }; /* ProtectControlGroups=private table. Note mount_private_apivfs() always use MS_NOSUID|MS_NOEXEC|MS_NODEV so - * flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */ + * flags is not set here. */ static const MountEntry protect_control_groups_private_table[] = { - { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" }, + { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false }, }; /* ProtectControlGroups=strict table */ static const MountEntry protect_control_groups_strict_table[] = { - { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" }, + { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true }, }; /* ProtectSystem=yes table */ @@ -338,7 +337,7 @@ static bool mount_entry_read_only(const MountEntry *p) { static bool mount_entry_noexec(const MountEntry *p) { assert(p); - return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS); + return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, MOUNT_PRIVATE_CGROUP2FS); } static bool mount_entry_exec(const MountEntry *p) { @@ -1375,18 +1374,9 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p } static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) { - _cleanup_free_ char *opts = NULL; - assert(m); assert(p); - - if (cgroupfs_recursiveprot_supported()) { - opts = strextend_with_separator(NULL, ",", mount_entry_options(m) ?: POINTER_MAX, "memory_recursiveprot"); - if (!opts) - return -ENOMEM; - } - - return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope); + return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope); } static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) { diff --git a/src/shared/mount-setup.c b/src/shared/mount-setup.c index db963df39e..c628c87942 100644 --- a/src/shared/mount-setup.c +++ b/src/shared/mount-setup.c @@ -52,7 +52,7 @@ typedef struct MountPoint { MountMode mode; } MountPoint; -bool cgroupfs_recursiveprot_supported(void) { +static bool cgroupfs_recursiveprot_supported(void) { int r; /* Added in kernel 5.7 */ diff --git a/src/shared/mount-setup.h b/src/shared/mount-setup.h index c07fe86364..34de1dad0b 100644 --- a/src/shared/mount-setup.h +++ b/src/shared/mount-setup.h @@ -8,5 +8,3 @@ bool mount_point_ignore(const char *path); int mount_setup_early(void); int mount_setup(bool loaded_policy, bool leave_propagation); - -bool cgroupfs_recursiveprot_supported(void); From 0cba5bdcf102649741d83734c1c5c9e0bedfdfeb Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sun, 16 Mar 2025 22:05:41 +0100 Subject: [PATCH 5/5] core/namespace: remove wonky fallback in mount_private_apivfs() Let's avoid dropping opts willy-nilly, especially that we already carry the logic of determining availability prior to mount (but make sure we respect the result though, and don't assume things are available if the check fails). --- src/core/namespace.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index aecc827797..7e131b1425 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1319,16 +1319,6 @@ static int mount_private_apivfs( return r; r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); - if (r == -EINVAL && opts) - /* If this failed with EINVAL then this likely means either: - * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the - * per-instance hidepid= neither, which means we really don't want to use it, since it - * would affect our host's /proc mount. - * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP - * is supported. - * - * Hence let's gracefully fallback to a classic, unrestricted version. */ - r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL); if (ERRNO_IS_NEG_PRIVILEGE(r)) { /* When we do not have enough privileges to mount a new instance, fall back to use an * existing mount. */ @@ -1347,8 +1337,8 @@ static int mount_private_apivfs( return r; return 1; - - } else if (r < 0) + } + if (r < 0) return r; /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */ @@ -1404,14 +1394,14 @@ static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) { * fsopen()/fsconfig() was also backported on some distros which allows us to detect * hidepid=/subset= support in even more scenarios. */ - if (mount_option_supported("proc", "hidepid", hpv) != 0) { + if (mount_option_supported("proc", "hidepid", hpv) > 0) { opts = strjoin("hidepid=", hpv); if (!opts) return -ENOMEM; } if (p->proc_subset == PROC_SUBSET_PID && - mount_option_supported("proc", "subset", "pid") != 0) + mount_option_supported("proc", "subset", "pid") > 0) if (!strextend_with_separator(&opts, ",", "subset=pid")) return -ENOMEM; }