core: delegate mountns implicitly when any of pidns/cgns/netns is in use, clean up private cgroupfs mount (#36892)

Fixes #36952
2026-04-14 00:14:32 +09:00 · 2025-04-03 00:15:33 +09:00
parent f5c626df2c 0cba5bdcf1
commit b0c9ba4eb1
8 changed files with 52 additions and 70 deletions
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -2411,6 +2411,11 @@ RestrictNamespaces=~cgroup net</programlisting>
        done with the namespace specific unit setting such as <varname>PrivateNetwork=</varname> or
        <varname>PrivateMounts=</varname>.</para>

+        <para>Note that some namespace sandboxing options might entail mount namespace for private API VFS instances,
+        such as <varname>PrivatePIDs=</varname>, <varname>ProtectControlGroups=private/strict</varname>, or
+        <varname>PrivateNetwork=</varname>. If any of the mentioned options are enabled, mount namespace
+        is implicitly delegated.</para>
+
        <xi:include href="version-info.xml" xpointer="v258"/></listitem>
      </varlistentry>

--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -3441,7 +3441,7 @@ static int apply_mount_namespace(

        /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
         * service will need to write to it in order to start the notifications. */
-        if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+        if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
                read_write_paths_cleanup = strv_copy(context->read_write_paths);
                if (!read_write_paths_cleanup)
                        return -ENOMEM;
@@ -3586,7 +3586,7 @@ static int apply_mount_namespace(
                 * sandbox inside the mount namespace. */
                .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,

-                .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
+                .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
                .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
                .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
                .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@@ -4205,9 +4205,8 @@ static void log_command_line(
                        LOG_EXEC_INVOCATION_ID(params));
 }

-static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
+static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
        assert(context);
-        assert(params);

        return context->private_users != PRIVATE_USERS_NO ||
               context->private_tmp != PRIVATE_TMP_NO ||
@@ -4229,7 +4228,7 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const E
               context->protect_kernel_tunables ||
               context->protect_kernel_modules ||
               context->protect_kernel_logs ||
-               exec_needs_cgroup_mount(context, params) ||
+               exec_needs_cgroup_mount(context) ||
               context->protect_clock ||
               context->protect_hostname != PROTECT_HOSTNAME_NO ||
               !strv_isempty(context->read_write_paths) ||
@@ -4270,13 +4269,23 @@ static bool exec_namespace_is_delegated(
        /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
         * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
         * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
-        if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
+        if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
                return false;

        if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
                return params->runtime_scope == RUNTIME_SCOPE_USER;

-        return FLAGS_SET(context->delegate_namespaces, namespace);
+        if (FLAGS_SET(context->delegate_namespaces, namespace))
+                return true;
+
+        /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when
+         * those are delegated mountns must be deferred too.
+         *
+         * The list should stay in sync with exec_needs_mount_namespace(). */
+        if (namespace == CLONE_NEWNS)
+                return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET);
+
+        return false;
 }

 static int setup_delegated_namespaces(
@@ -4355,7 +4364,7 @@ static int setup_delegated_namespaces(
                        log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
        }

-        if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
+        if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
                if (unshare(CLONE_NEWCGROUP) < 0) {
                        *reterr_exit_status = EXIT_NAMESPACE;
@@ -5197,7 +5206,7 @@ int exec_invoke(
                                 * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
                                 * pressure path environment variable or read-write mount to the unit. This is why we check if
                                 * memory_pressure_path != NULL in the conditional below. */
-                                if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+                                if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
                                        memory_pressure_path = mfree(memory_pressure_path);
                                        r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
                                        if (r < 0) {
@@ -5364,7 +5373,7 @@ int exec_invoke(
                }
        }

-        if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
+        if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
                /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
                 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
                 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -232,24 +232,18 @@ bool exec_needs_ipc_namespace(const ExecContext *context) {
        return context->private_ipc || context->ipc_namespace_path;
 }

-static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
-        return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP);
-}
-
 static bool needs_cgroup_namespace(ProtectControlGroups i) {
        return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
 }

-ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) {
        assert(context);

        /* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
-         * use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the
-         * setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded
-         * to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup.
-         *
-         * TODO: Remove fallback once cgroupv1 support is removed in v258. */
-        if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) {
+         * use cgroup namespace, we ignore the setting and do not unshare the namespace.
+         * ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
+         * that strict always gets a read-only mount of /sys/fs/cgroup/. */
+        if (needs_cgroup_namespace(context->protect_control_groups) && !ns_type_supported(NAMESPACE_CGROUP)) {
                if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE)
                        return PROTECT_CONTROL_GROUPS_NO;
                if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT)
@@ -258,22 +252,22 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context,
        return context->protect_control_groups;
 }

-bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
+bool exec_needs_cgroup_namespace(const ExecContext *context) {
        assert(context);

-        return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
+        return needs_cgroup_namespace(exec_get_protect_control_groups(context));
 }

-bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
+bool exec_needs_cgroup_mount(const ExecContext *context) {
        assert(context);

-        return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
+        return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO;
 }

-bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
+bool exec_is_cgroup_mount_read_only(const ExecContext *context) {
        assert(context);

-        return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
+        return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
 }

 bool exec_needs_pid_namespace(const ExecContext *context) {
@@ -331,7 +325,7 @@ bool exec_needs_mount_namespace(
            context->protect_kernel_tunables ||
            context->protect_kernel_modules ||
            context->protect_kernel_logs ||
-            exec_needs_cgroup_mount(context, params) ||
+            exec_needs_cgroup_mount(context) ||
            context->protect_proc != PROTECT_PROC_DEFAULT ||
            context->proc_subset != PROC_SUBSET_ALL ||
            exec_needs_ipc_namespace(context) ||
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -631,10 +631,11 @@ bool exec_needs_network_namespace(const ExecContext *context);
 bool exec_needs_ipc_namespace(const ExecContext *context);
 bool exec_needs_pid_namespace(const ExecContext *context);

-ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
-bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
-bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
-bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context);
+bool exec_needs_cgroup_namespace(const ExecContext *context);
+bool exec_needs_cgroup_mount(const ExecContext *context);
+bool exec_is_cgroup_mount_read_only(const ExecContext *context);
+
 const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);

 /* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -26,7 +26,6 @@
 #include "loopback-setup.h"
 #include "missing_syscall.h"
 #include "mkdir-label.h"
-#include "mount-setup.h"
 #include "mount-util.h"
 #include "mountpoint-util.h"
 #include "namespace-util.h"
@@ -207,14 +206,14 @@ static const MountEntry protect_control_groups_yes_table[] = {
 };

 /* ProtectControlGroups=private table. Note mount_private_apivfs() always use MS_NOSUID|MS_NOEXEC|MS_NODEV so
- * flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */
+ * flags is not set here. */
 static const MountEntry protect_control_groups_private_table[] = {
-        { "/sys/fs/cgroup",      MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate"  },
+        { "/sys/fs/cgroup",      MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false },
 };

 /* ProtectControlGroups=strict table */
 static const MountEntry protect_control_groups_strict_table[] = {
-        { "/sys/fs/cgroup",      MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true,  .nosuid = true, .noexec = true, .options_const = "nsdelegate"  },
+        { "/sys/fs/cgroup",      MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true },
 };

 /* ProtectSystem=yes table */
@@ -338,7 +337,7 @@ static bool mount_entry_read_only(const MountEntry *p) {
 static bool mount_entry_noexec(const MountEntry *p) {
        assert(p);

-        return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS);
+        return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, MOUNT_PRIVATE_CGROUP2FS);
 }

 static bool mount_entry_exec(const MountEntry *p) {
@@ -1320,16 +1319,6 @@ static int mount_private_apivfs(
                return r;

        r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
-        if (r == -EINVAL && opts)
-                /* If this failed with EINVAL then this likely means either:
-                 * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
-                 *    per-instance hidepid= neither, which means we really don't want to use it, since it
-                 *    would affect our host's /proc mount.
-                 * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
-                 *    is supported.
-                 *
-                 * Hence let's gracefully fallback to a classic, unrestricted version. */
-                r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
        if (ERRNO_IS_NEG_PRIVILEGE(r)) {
                /* When we do not have enough privileges to mount a new instance, fall back to use an
                 * existing mount. */
@@ -1348,8 +1337,8 @@ static int mount_private_apivfs(
                        return r;

                return 1;
-
-        } else if (r < 0)
+        }
+        if (r < 0)
                return r;

        /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
@@ -1375,18 +1364,9 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
 }

 static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
-        _cleanup_free_ char *opts = NULL;
-
        assert(m);
        assert(p);
-
-        if (cgroupfs_recursiveprot_supported()) {
-                opts = strextend_with_separator(NULL, ",", mount_entry_options(m) ?: POINTER_MAX, "memory_recursiveprot");
-                if (!opts)
-                        return -ENOMEM;
-        }
-
-        return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope);
+        return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope);
 }

 static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
@@ -1414,14 +1394,14 @@ static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
                 * fsopen()/fsconfig() was also backported on some distros which allows us to detect
                 * hidepid=/subset= support in even more scenarios. */

-                if (mount_option_supported("proc", "hidepid", hpv) != 0) {
+                if (mount_option_supported("proc", "hidepid", hpv) > 0) {
                        opts = strjoin("hidepid=", hpv);
                        if (!opts)
                                return -ENOMEM;
                }

                if (p->proc_subset == PROC_SUBSET_PID &&
-                    mount_option_supported("proc", "subset", "pid") != 0)
+                    mount_option_supported("proc", "subset", "pid") > 0)
                        if (!strextend_with_separator(&opts, ",", "subset=pid"))
                                return -ENOMEM;
        }
--- a/src/shared/mount-setup.c
+++ b/src/shared/mount-setup.c
@@ -52,7 +52,7 @@ typedef struct MountPoint {
        MountMode mode;
 } MountPoint;

-bool cgroupfs_recursiveprot_supported(void) {
+static bool cgroupfs_recursiveprot_supported(void) {
        int r;

        /* Added in kernel 5.7 */
--- a/src/shared/mount-setup.h
+++ b/src/shared/mount-setup.h
@@ -8,5 +8,3 @@ bool mount_point_ignore(const char *path);

 int mount_setup_early(void);
 int mount_setup(bool loaded_policy, bool leave_propagation);
-
-bool cgroupfs_recursiveprot_supported(void);
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -35,17 +35,12 @@ testcase_network() {
    systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
 }

-testcase_cgroup() {
-    (! systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure')
-    systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private -p DelegateNamespaces=cgroup --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure'
-}
-
 testcase_pid() {
    # MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only
    # so we can't write to it when running in a container.
    if ! systemd-detect-virt --container; then
        (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
-        systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
+        systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
    fi
 }