core: Make DelegateNamespaces= work for user managers with CAP_SYS_ADMIN (#36771)

Currently DelegateNamespaces= only works for services spawned by the system manager. User managers will always unshare the user namespace first even if they're running with CAP_SYS_ADMIN. Let's add support for DelegateNamespaces= for user managers if they're running with CAP_SYS_ADMIN. By default, we'll still delegate all namespaces for user managers, but this can now be overridden by explicitly passing DelegateNamespaces=. If a user manager is running without CAP_SYS_ADMIN, the user manager is still always unshared first just like before.
2026-04-14 08:25:20 +09:00 · 2025-03-20 06:28:18 +09:00
parent b0fe8df5a9 38748596f0
commit db4b4e0cd3
7 changed files with 92 additions and 56 deletions
--- a/src/basic/capability-util.c
+++ b/src/basic/capability-util.c
@@ -114,8 +114,9 @@ int capability_ambient_set_apply(uint64_t set, bool also_inherit) {
        int r;

        /* Remove capabilities requested in ambient set, but not in the bounding set */
-        BIT_FOREACH(i, set) {
-                assert((unsigned) i <= cap_last_cap());
+        for (unsigned i = 0; i <= cap_last_cap(); i++) {
+                if (!BIT_SET(set, i))
+                        continue;

                if (prctl(PR_CAPBSET_READ, (unsigned long) i) != 1) {
                        log_debug("Ambient capability %s requested but missing from bounding set, suppressing automatically.",
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -1473,7 +1473,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) {

 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
        void *id, *val;
-        bool has_capget = false, has_capset = false, has_prctl = false;
+        bool have_capget = false, have_capset = false, have_prctl = false;

        assert(c);

@@ -1487,17 +1487,17 @@ static bool seccomp_allows_drop_privileges(const ExecContext *c) {
                name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);

                if (streq(name, "capget"))
-                        has_capget = true;
+                        have_capget = true;
                else if (streq(name, "capset"))
-                        has_capset = true;
+                        have_capset = true;
                else if (streq(name, "prctl"))
-                        has_prctl = true;
+                        have_prctl = true;
        }

        if (c->syscall_allow_list)
-                return has_capget && has_capset && has_prctl;
+                return have_capget && have_capset && have_prctl;
        else
-                return !(has_capget || has_capset || has_prctl);
+                return !(have_capget || have_capset || have_prctl);
 }

 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char *msg) {
@@ -4205,19 +4205,10 @@ static void log_command_line(
                        LOG_EXEC_INVOCATION_ID(params));
 }

-static bool exec_context_need_unprivileged_private_users(
-                const ExecContext *context,
-                const ExecParameters *params) {
-
+static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
        assert(context);
        assert(params);

-        /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
-         * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
-         * (system manager) then we have privileges and don't need this. */
-        if (params->runtime_scope != RUNTIME_SCOPE_USER)
-                return false;
-
        return context->private_users != PRIVATE_USERS_NO ||
               context->private_tmp != PRIVATE_TMP_NO ||
               context->private_devices ||
@@ -4259,9 +4250,6 @@ static PrivateUsers exec_context_get_effective_private_users(
        if (context->private_users != PRIVATE_USERS_NO)
                return context->private_users;

-        if (exec_context_need_unprivileged_private_users(context, params))
-                return PRIVATE_USERS_SELF;
-
        /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */
        if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL)
                return PRIVATE_USERS_SELF;
@@ -4272,6 +4260,7 @@ static PrivateUsers exec_context_get_effective_private_users(
 static bool exec_namespace_is_delegated(
                const ExecContext *context,
                const ExecParameters *params,
+                bool have_cap_sys_admin,
                unsigned long namespace) {

        assert(context);
@@ -4281,11 +4270,11 @@ static bool exec_namespace_is_delegated(
        /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
         * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
         * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
-        if (exec_context_need_unprivileged_private_users(context, params))
+        if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
                return false;

        if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
-                return false;
+                return params->runtime_scope == RUNTIME_SCOPE_USER;

        return FLAGS_SET(context->delegate_namespaces, namespace);
 }
@@ -4300,7 +4289,7 @@ static int setup_delegated_namespaces(
                uid_t gid,
                const ExecCommand *command,
                bool needs_sandboxing,
-                bool has_cap_sys_admin,
+                bool have_cap_sys_admin,
                int *reterr_exit_status) {

        int r;
@@ -4318,7 +4307,7 @@ static int setup_delegated_namespaces(
        assert(reterr_exit_status);

        if (exec_needs_network_namespace(context) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWNET) == delegate &&
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate &&
            runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {

                /* Try to enable network namespacing if network namespacing is available and we have
@@ -4345,7 +4334,7 @@ static int setup_delegated_namespaces(
        }

        if (exec_needs_ipc_namespace(context) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWIPC) == delegate &&
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate &&
            runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {

                if (ns_type_supported(NAMESPACE_IPC)) {
@@ -4367,7 +4356,7 @@ static int setup_delegated_namespaces(
        }

        if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWCGROUP) == delegate) {
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
                if (unshare(CLONE_NEWCGROUP) < 0) {
                        *reterr_exit_status = EXIT_NAMESPACE;
                        return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m");
@@ -4379,7 +4368,7 @@ static int setup_delegated_namespaces(
        /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
         * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
        if (needs_sandboxing && exec_needs_pid_namespace(context) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWPID) == delegate) {
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
                if (params->pidref_transport_fd < 0) {
                        *reterr_exit_status = EXIT_NAMESPACE;
                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
@@ -4391,7 +4380,7 @@ static int setup_delegated_namespaces(
                 * We need to check prior to entering the user namespace because if we're running unprivileged or in a
                 * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
                 * once we unshare a mount namespace. */
-                if (!has_cap_sys_admin) {
+                if (!have_cap_sys_admin || delegate) {
                        r = can_mount_proc(context, params);
                        if (r < 0) {
                                *reterr_exit_status = EXIT_NAMESPACE;
@@ -4416,7 +4405,7 @@ static int setup_delegated_namespaces(
        /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */

        if (exec_needs_mount_namespace(context, params, runtime) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWNS) == delegate) {
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) {
                _cleanup_free_ char *error_path = NULL;

                r = apply_mount_namespace(command->flags,
@@ -4437,7 +4426,8 @@ static int setup_delegated_namespaces(
                log_exec_debug(context, params, "Set up %smount namespace", delegate ? "delegated " : "");
        }

-        if (needs_sandboxing && exec_namespace_is_delegated(context, params, CLONE_NEWUTS) == delegate) {
+        if (needs_sandboxing &&
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) {
                r = apply_protect_hostname(context, params, reterr_exit_status);
                if (r < 0)
                        return r;
@@ -4645,9 +4635,10 @@ int exec_invoke(
        ino_t journal_stream_ino = 0;
        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
-                needs_mount_namespace;  /* Do we need to set up a mount namespace for this kernel? */
-        bool keep_seccomp_privileges = false;
-        bool has_cap_sys_admin = false;
+                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
+                have_cap_sys_admin,
+                userns_set_up = false,
+                keep_seccomp_privileges = false;
 #if HAVE_SELINUX
        _cleanup_free_ char *mac_selinux_context_net = NULL;
        bool use_selinux = false;
@@ -5308,7 +5299,7 @@ int exec_invoke(
        uint64_t capability_ambient_set = context->capability_ambient_set;

        /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though its masked. */
-        has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
+        have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;

        if (needs_sandboxing) {
                /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
@@ -5373,11 +5364,13 @@ int exec_invoke(
                }
        }

-        if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
+        if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
                /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
                 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
                 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
                PrivateUsers pu = exec_context_get_effective_private_users(context, params);
+                if (pu == PRIVATE_USERS_NO)
+                        pu = PRIVATE_USERS_SELF;

                /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
                 * unprivileged user namespaces. */
@@ -5392,6 +5385,7 @@ int exec_invoke(
                        log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
                else {
                        assert(r > 0);
+                        userns_set_up = true;
                        log_debug("Set up unprivileged user namespace");
                }
        }
@@ -5407,7 +5401,7 @@ int exec_invoke(
                        gid,
                        command,
                        needs_sandboxing,
-                        has_cap_sys_admin,
+                        have_cap_sys_admin,
                        exit_status);
        if (r < 0)
                return r;
@@ -5444,7 +5438,7 @@ int exec_invoke(
         * case of mount namespaces being less privileged when the mount point list is copied from a
         * different user namespace). */

-        if (needs_sandboxing && !exec_context_need_unprivileged_private_users(context, params)) {
+        if (needs_sandboxing && !userns_set_up) {
                PrivateUsers pu = exec_context_get_effective_private_users(context, params);

                r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
@@ -5468,7 +5462,7 @@ int exec_invoke(
                        gid,
                        command,
                        needs_sandboxing,
-                        has_cap_sys_admin,
+                        have_cap_sys_admin,
                        exit_status);
        if (r < 0)
                return r;
--- a/src/libsystemd/sd-bus/sd-bus.c
+++ b/src/libsystemd/sd-bus/sd-bus.c
@@ -1760,8 +1760,10 @@ _public_ int sd_bus_open_user_machine(sd_bus **ret, const char *user_and_machine
        assert_return(user_and_machine, -EINVAL);
        assert_return(ret, -EINVAL);

-        /* Shortcut things if we'd end up on this host and as the same user.  */
-        if (user_and_machine_equivalent(user_and_machine))
+        /* Shortcut things if we'd end up on this host and as the same user and have one of the necessary
+         * environment variables set already.  */
+        if (user_and_machine_equivalent(user_and_machine) &&
+            (secure_getenv("DBUS_SESSION_BUS_ADDRESS") || secure_getenv("XDG_RUNTIME_DIR")))
                return sd_bus_open_user(ret);

        r = user_and_machine_valid(user_and_machine);
--- a/src/run/run.c
+++ b/src/run/run.c
@@ -2483,6 +2483,11 @@ static int start_transient_scope(sd_bus *bus) {
                        return log_oom();
        }

+        /* Stop agents before we pass control away and before we drop privileges, to avoid TTY conflicts and
+         * before we become unable to stop agents. */
+        polkit_agent_close();
+        ask_password_agent_close();
+
        if (arg_nice_set) {
                if (setpriority(PRIO_PROCESS, 0, arg_nice) < 0)
                        return log_error_errno(errno, "Failed to set nice level: %m");
@@ -2571,10 +2576,6 @@ static int start_transient_scope(sd_bus *bus) {
                }
        }

-        /* Stop agents before we pass control away, to avoid TTY conflicts */
-        polkit_agent_close();
-        ask_password_agent_close();
-
        execvpe(arg_cmdline[0], arg_cmdline, env);

        return log_error_errno(errno, "Failed to execute: %m");
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -1669,13 +1669,17 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
        if (STR_IN_SET(field, "RestrictNamespaces",
                              "DelegateNamespaces")) {
                bool invert = false;
+                unsigned long all = UPDATE_FLAG(NAMESPACE_FLAGS_ALL, CLONE_NEWUSER, !streq(field, "DelegateNamespaces"));
                unsigned long flags;

                r = parse_boolean(eq);
                if (r > 0)
-                        flags = 0;
+                        /* RestrictNamespaces= value gets stored into a field with reverse semantics (the
+                         * namespaces which are retained), so RestrictNamespaces=true means we retain no
+                         * access to any namespaces and vice-versa. */
+                        flags = streq(field, "RestrictNamespaces") ? 0 : all;
                else if (r == 0)
-                        flags = NAMESPACE_FLAGS_ALL;
+                        flags = streq(field, "RestrictNamespaces") ? all : 0;
                else {
                        if (eq[0] == '~') {
                                invert = true;
@@ -1688,7 +1692,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
                }

                if (invert)
-                        flags = (~flags) & NAMESPACE_FLAGS_ALL;
+                        flags = (~flags) & all;

                r = sd_bus_message_append(m, "(sv)", field, "t", (uint64_t) flags);
                if (r < 0)
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -9,6 +9,22 @@ set -o pipefail
 # shellcheck source=test/units/util.sh
 . "$(dirname "$0")"/util.sh

+# IMPORTANT: For /proc/ to be remounted in pid namespace within an unprivileged user namespace, there needs to
+# be at least 1 unmasked procfs mount in ANY directory. Otherwise, if /proc/ is masked (e.g. /proc/scsi is
+# over-mounted with tmpfs), then mounting a new /proc/ will fail.
+#
+# Thus, to guarantee PrivatePIDs=yes tests for unprivileged users pass, we mount a new procfs on a temporary
+# directory with no masking. This will guarantee an unprivileged user can mount a new /proc/ successfully.
+mkdir -p /tmp/TEST-07-PID1-delegate-namespaces-proc
+mount -t proc proc /tmp/TEST-07-PID1-delegate-namespaces-proc
+
+at_exit() {
+    umount /tmp/TEST-07-PID1-delegate-namespaces-proc
+    rm -rf /tmp/TEST-07-PID1-delegate-namespaces-proc
+}
+
+trap at_exit EXIT
+
 testcase_mount() {
    (! systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home)
    systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
@@ -16,7 +32,7 @@ testcase_mount() {

 testcase_network() {
    (! systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes --wait --pipe -- ip link add veth1 type veth peer name veth2)
-    systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- ip link add veth1 type veth peer name veth2
+    systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
 }

 testcase_cgroup() {
@@ -25,8 +41,12 @@ testcase_cgroup() {
 }

 testcase_pid() {
-    (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
-    systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
+    # MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only
+    # so we can't write to it when running in a container.
+    if ! systemd-detect-virt --container; then
+        (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
+        systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
+    fi
 }

 testcase_uts() {
@@ -42,6 +62,18 @@ testcase_implied_private_users_self() {
    systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait bash -c 'test "$(cat /proc/self/uid_map)" == "         0          0      65536"'
 }

+testcase_user_manager() {
+    systemctl start user@0
+    # DelegateNamespaces=yes is implied for user managers.
+    systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home
+    # Even those with CAP_SYS_ADMIN.
+    SYSTEMD_LOG_LEVEL=debug systemd-run --machine=.host --user -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home
+    # But can be overridden for user managers that are running with CAP_SYS_ADMIN.
+    (! systemd-run --machine=.host --user -p PrivateMounts=yes -p DelegateNamespaces=no --wait --pipe -- mount --bind /usr /home)
+    # But not for those without CAP_SYS_ADMIN.
+    systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p DelegateNamespaces=no -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home
+}
+
 testcase_multiple_features() {
    unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-delegate-namespaces-root /usr/share/minimal_0.raw

@@ -52,7 +84,7 @@ testcase_multiple_features() {
        -p BindReadOnlyPaths=/usr/share \
        -p NoNewPrivileges=yes \
        -p ProtectSystem=strict \
-        -p User=testuser\
+        -p User=testuser \
        -p Group=testuser \
        -p RuntimeDirectory=abc \
        -p StateDirectory=qed \
@@ -78,3 +110,5 @@ testcase_multiple_features() {

    rm -rf /tmp/TEST-07-PID1-delegate-namespaces-root
 }
+
+run_testcases
--- a/test/units/TEST-07-PID1.private-pids.sh
+++ b/test/units/TEST-07-PID1.private-pids.sh
@@ -95,7 +95,7 @@ testcase_multiple_features() {
        -p BindReadOnlyPaths=/usr/share \
        -p NoNewPrivileges=yes \
        -p ProtectSystem=strict \
-        -p User=testuser\
+        -p User=testuser \
        -p Group=testuser \
        -p RuntimeDirectory=abc \
        -p StateDirectory=qed \
@@ -142,8 +142,8 @@ testcase_unpriv() {
    mount -t proc proc /tmp/TEST-07-PID1-private-pids-proc

    # Verify running as unprivileged user can unshare PID namespace and mounts /proc properly.
-    assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
-    assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"
+    assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
+    assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"

    umount /tmp/TEST-07-PID1-private-pids-proc
    rm -rf /tmp/TEST-07-PID1-private-pids-proc
@@ -162,7 +162,7 @@ testcase_unpriv() {
        mount -t tmpfs tmpfs /proc/scsi
    fi

-    (! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true)
+    (! systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes true)

    if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
        umount /proc/scsi