From af6a8b6969d3f8b2189ea7b8a164910bcf2cc71e Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 10:44:05 +0100
Subject: [PATCH 01/13] exec-invoke: Rename various variables from has_ to
 have_

All of these encode information of the current process, so have_ is
more approriate than has_, which refers to something else.
---
 src/core/exec-invoke.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 6bb4584a8e..9b21e385a8 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -1473,7 +1473,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) {
 
 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
         void *id, *val;
-        bool has_capget = false, has_capset = false, has_prctl = false;
+        bool have_capget = false, have_capset = false, have_prctl = false;
 
         assert(c);
 
@@ -1487,17 +1487,17 @@ static bool seccomp_allows_drop_privileges(const ExecContext *c) {
                 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
 
                 if (streq(name, "capget"))
-                        has_capget = true;
+                        have_capget = true;
                 else if (streq(name, "capset"))
-                        has_capset = true;
+                        have_capset = true;
                 else if (streq(name, "prctl"))
-                        has_prctl = true;
+                        have_prctl = true;
         }
 
         if (c->syscall_allow_list)
-                return has_capget && has_capset && has_prctl;
+                return have_capget && have_capset && have_prctl;
         else
-                return !(has_capget || has_capset || has_prctl);
+                return !(have_capget || have_capset || have_prctl);
 }
 
 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char *msg) {
@@ -4300,7 +4300,7 @@ static int setup_delegated_namespaces(
                 uid_t gid,
                 const ExecCommand *command,
                 bool needs_sandboxing,
-                bool has_cap_sys_admin,
+                bool have_cap_sys_admin,
                 int *reterr_exit_status) {
 
         int r;
@@ -4391,7 +4391,7 @@ static int setup_delegated_namespaces(
                  * We need to check prior to entering the user namespace because if we're running unprivileged or in a
                  * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
                  * once we unshare a mount namespace. */
-                if (!has_cap_sys_admin) {
+                if (!have_cap_sys_admin) {
                         r = can_mount_proc(context, params);
                         if (r < 0) {
                                 *reterr_exit_status = EXIT_NAMESPACE;
@@ -4647,7 +4647,7 @@ int exec_invoke(
                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
                 needs_mount_namespace;  /* Do we need to set up a mount namespace for this kernel? */
         bool keep_seccomp_privileges = false;
-        bool has_cap_sys_admin = false;
+        bool have_cap_sys_admin = false;
 #if HAVE_SELINUX
         _cleanup_free_ char *mac_selinux_context_net = NULL;
         bool use_selinux = false;
@@ -5308,7 +5308,7 @@ int exec_invoke(
         uint64_t capability_ambient_set = context->capability_ambient_set;
 
         /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though its masked. */
-        has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
+        have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
 
         if (needs_sandboxing) {
                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
@@ -5407,7 +5407,7 @@ int exec_invoke(
                         gid,
                         command,
                         needs_sandboxing,
-                        has_cap_sys_admin,
+                        have_cap_sys_admin,
                         exit_status);
         if (r < 0)
                 return r;
@@ -5468,7 +5468,7 @@ int exec_invoke(
                         gid,
                         command,
                         needs_sandboxing,
-                        has_cap_sys_admin,
+                        have_cap_sys_admin,
                         exit_status);
         if (r < 0)
                 return r;

From ad9b5d4732ec2131aec12fcfed8d141fb720fd95 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 11:29:48 +0100
Subject: [PATCH 02/13] bus-unit-util: Fix DelegateNamespaces= parser

Similarly to the config file parse method, let's fix the systemd-run
parser as well.

Follow up for 11b982053bdc31806e571ea0771d7f10cb276d69
---
 src/shared/bus-unit-util.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index f787413161..1e04a051a6 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -1669,13 +1669,17 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
         if (STR_IN_SET(field, "RestrictNamespaces",
                               "DelegateNamespaces")) {
                 bool invert = false;
+                unsigned long all = UPDATE_FLAG(NAMESPACE_FLAGS_ALL, CLONE_NEWUSER, !streq(field, "DelegateNamespaces"));
                 unsigned long flags;
 
                 r = parse_boolean(eq);
                 if (r > 0)
-                        flags = 0;
+                        /* RestrictNamespaces= value gets stored into a field with reverse semantics (the
+                         * namespaces which are retained), so RestrictNamespaces=true means we retain no
+                         * access to any namespaces and vice-versa. */
+                        flags = streq(field, "RestrictNamespaces") ? 0 : all;
                 else if (r == 0)
-                        flags = NAMESPACE_FLAGS_ALL;
+                        flags = streq(field, "RestrictNamespaces") ? all : 0;
                 else {
                         if (eq[0] == '~') {
                                 invert = true;
@@ -1688,7 +1692,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
                 }
 
                 if (invert)
-                        flags = (~flags) & NAMESPACE_FLAGS_ALL;
+                        flags = (~flags) & all;
 
                 r = sd_bus_message_append(m, "(sv)", field, "t", (uint64_t) flags);
                 if (r < 0)

From c8912e3d8347fee3a325276177eed132e131ae19 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 11:31:28 +0100
Subject: [PATCH 03/13] TEST-07-PID1.delegate-namespaces: Fix
 testcase_network()

---
 test/units/TEST-07-PID1.delegate-namespaces.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh
index fe0defaeb6..ed447f1508 100755
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -16,7 +16,7 @@ testcase_mount() {
 
 testcase_network() {
     (! systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes --wait --pipe -- ip link add veth1 type veth peer name veth2)
-    systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- ip link add veth1 type veth peer name veth2
+    systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
 }
 
 testcase_cgroup() {

From 918a188fdab1278fbbe2c960e1da7c655f644340 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 16:17:25 +0100
Subject: [PATCH 04/13] core: Also check if we can mount /proc if pid namespace
 is delegated

If the pid namespace is delegated, it doesn't matter if we have CAP_SYS_ADMIN,
we'll still fail to mount /proc if part of it is masked on the host so also
check if we can mount /proc if the pid namespace is delegated.
---
 src/core/exec-invoke.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 9b21e385a8..e69c65feca 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -4391,7 +4391,7 @@ static int setup_delegated_namespaces(
                  * We need to check prior to entering the user namespace because if we're running unprivileged or in a
                  * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
                  * once we unshare a mount namespace. */
-                if (!have_cap_sys_admin) {
+                if (!have_cap_sys_admin || delegate) {
                         r = can_mount_proc(context, params);
                         if (r < 0) {
                                 *reterr_exit_status = EXIT_NAMESPACE;

From aae4748464d3b8a810f88b3f1d65dd7d5b5ff396 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 16:20:00 +0100
Subject: [PATCH 05/13] TEST-07-PID1.delegate-namespaces: Make sure fully
 visible procfs is available

To be able to mount /proc inside an unprivileged user namespace, we have
to make sure a fully visible procfs is available on the host, so let's make
sure that's the case.
---
 test/units/TEST-07-PID1.delegate-namespaces.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh
index ed447f1508..8eb9956c2e 100755
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -9,6 +9,22 @@ set -o pipefail
 # shellcheck source=test/units/util.sh
 . "$(dirname "$0")"/util.sh
 
+# IMPORTANT: For /proc/ to be remounted in pid namespace within an unprivileged user namespace, there needs to
+# be at least 1 unmasked procfs mount in ANY directory. Otherwise, if /proc/ is masked (e.g. /proc/scsi is
+# over-mounted with tmpfs), then mounting a new /proc/ will fail.
+#
+# Thus, to guarantee PrivatePIDs=yes tests for unprivileged users pass, we mount a new procfs on a temporary
+# directory with no masking. This will guarantee an unprivileged user can mount a new /proc/ successfully.
+mkdir -p /tmp/TEST-07-PID1-delegate-namespaces-proc
+mount -t proc proc /tmp/TEST-07-PID1-delegate-namespaces-proc
+
+at_exit() {
+    umount /tmp/TEST-07-PID1-delegate-namespaces-proc
+    rm -rf /tmp/TEST-07-PID1-delegate-namespaces-proc
+}
+
+trap at_exit EXIT
+
 testcase_mount() {
     (! systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home)
     systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home

From b7602662cbf6210052985c25a29550bc2314646d Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 21:22:49 +0100
Subject: [PATCH 06/13] TEST-07-PID1.delegate-namespaces: Fix testcase_pid()

Make sure the test has its own /proc and skip it in containers as
MountAPIVFS=yes in a container always results in a read-only /proc/sys
which means the test can't write to /proc/sys/kernel/ns_last_pid.
---
 test/units/TEST-07-PID1.delegate-namespaces.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh
index 8eb9956c2e..061a7cf5b2 100755
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -41,8 +41,12 @@ testcase_cgroup() {
 }
 
 testcase_pid() {
-    (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
-    systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
+    # MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only
+    # so we can't write to it when running in a container.
+    if ! systemd-detect-virt --container; then
+        (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
+        systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
+    fi
 }
 
 testcase_uts() {

From 9a78b5eb7ed39edd86fe9d96fb69199366166db8 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 11:31:48 +0100
Subject: [PATCH 07/13] TEST-07-PID1.delegate-namespaces: Actually run the
 testcases

---
 test/units/TEST-07-PID1.delegate-namespaces.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh
index 061a7cf5b2..a9afc426ae 100755
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -98,3 +98,5 @@ testcase_multiple_features() {
 
     rm -rf /tmp/TEST-07-PID1-delegate-namespaces-root
 }
+
+run_testcases

From 87ddf5188b53a04a08be090b144a8b87a8e44b1b Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Tue, 18 Mar 2025 09:48:21 +0100
Subject: [PATCH 08/13] TEST-07-PID1.private-pids: Use --machine=testuser@.host
 instead of runas

Let's use the systemd way to run systemd-run as a different user instead
of setpriv.
---
 test/units/TEST-07-PID1.private-pids.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/units/TEST-07-PID1.private-pids.sh b/test/units/TEST-07-PID1.private-pids.sh
index eede43cbaf..1c9104d435 100755
--- a/test/units/TEST-07-PID1.private-pids.sh
+++ b/test/units/TEST-07-PID1.private-pids.sh
@@ -142,8 +142,8 @@ testcase_unpriv() {
     mount -t proc proc /tmp/TEST-07-PID1-private-pids-proc
 
     # Verify running as unprivileged user can unshare PID namespace and mounts /proc properly.
-    assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
-    assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"
+    assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
+    assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"
 
     umount /tmp/TEST-07-PID1-private-pids-proc
     rm -rf /tmp/TEST-07-PID1-private-pids-proc
@@ -162,7 +162,7 @@ testcase_unpriv() {
         mount -t tmpfs tmpfs /proc/scsi
     fi
 
-    (! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true)
+    (! systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes true)
 
     if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
         umount /proc/scsi

From f49b7404b2a49efb8b76afea27f355cade3da6dc Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 12:26:46 +0100
Subject: [PATCH 09/13] capability-util: Ignore unknown capabilities instead of
 aborting

capability_ambient_set_apply() can be called with capability sets
containing unknown capabilities. Let's not crash when this is the
case but instead ignore the unknown capabilities.

This fixes a crash when running the following command:

"systemd-run -p "AmbientCapabilities=~" --wait --pipe id"

Fixes d5e12dc75e0e356c62e514e9c347efb200fe60e0
---
 src/basic/capability-util.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/basic/capability-util.c b/src/basic/capability-util.c
index 11d7e95cb6..0b544ea64a 100644
--- a/src/basic/capability-util.c
+++ b/src/basic/capability-util.c
@@ -114,8 +114,9 @@ int capability_ambient_set_apply(uint64_t set, bool also_inherit) {
         int r;
 
         /* Remove capabilities requested in ambient set, but not in the bounding set */
-        BIT_FOREACH(i, set) {
-                assert((unsigned) i <= cap_last_cap());
+        for (unsigned i = 0; i <= cap_last_cap(); i++) {
+                if (!BIT_SET(set, i))
+                        continue;
 
                 if (prctl(PR_CAPBSET_READ, (unsigned long) i) != 1) {
                         log_debug("Ambient capability %s requested but missing from bounding set, suppressing automatically.",

From 9e34c34b7b027da24b084a58246c1d88bdbcc817 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 12:28:37 +0100
Subject: [PATCH 10/13] sd_bus_open_user_machine(): Don't shortcut without
 necessary env

Don't shortcut if we don't have the necessary environment variables
set in sd_bus_open_user_machine().
---
 src/libsystemd/sd-bus/sd-bus.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/libsystemd/sd-bus/sd-bus.c b/src/libsystemd/sd-bus/sd-bus.c
index 7c6183d1bb..6429267843 100644
--- a/src/libsystemd/sd-bus/sd-bus.c
+++ b/src/libsystemd/sd-bus/sd-bus.c
@@ -1760,8 +1760,10 @@ _public_ int sd_bus_open_user_machine(sd_bus **ret, const char *user_and_machine
         assert_return(user_and_machine, -EINVAL);
         assert_return(ret, -EINVAL);
 
-        /* Shortcut things if we'd end up on this host and as the same user.  */
-        if (user_and_machine_equivalent(user_and_machine))
+        /* Shortcut things if we'd end up on this host and as the same user and have one of the necessary
+         * environment variables set already.  */
+        if (user_and_machine_equivalent(user_and_machine) &&
+            (secure_getenv("DBUS_SESSION_BUS_ADDRESS") || secure_getenv("XDG_RUNTIME_DIR")))
                 return sd_bus_open_user(ret);
 
         r = user_and_machine_valid(user_and_machine);

From a2425e8a8b51b6c0c100b73ecd7c1b4c8003f21a Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 16:01:26 +0100
Subject: [PATCH 11/13] run: Stop agents before we drop privileges

After dropping privileges, we won't be able to stop agents anymore
as our signals will get ignored.
---
 src/run/run.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/run/run.c b/src/run/run.c
index cdb5cfefd6..e4ae5373b9 100644
--- a/src/run/run.c
+++ b/src/run/run.c
@@ -2483,6 +2483,11 @@ static int start_transient_scope(sd_bus *bus) {
                         return log_oom();
         }
 
+        /* Stop agents before we pass control away and before we drop privileges, to avoid TTY conflicts and
+         * before we become unable to stop agents. */
+        polkit_agent_close();
+        ask_password_agent_close();
+
         if (arg_nice_set) {
                 if (setpriority(PRIO_PROCESS, 0, arg_nice) < 0)
                         return log_error_errno(errno, "Failed to set nice level: %m");
@@ -2571,10 +2576,6 @@ static int start_transient_scope(sd_bus *bus) {
                 }
         }
 
-        /* Stop agents before we pass control away, to avoid TTY conflicts */
-        polkit_agent_close();
-        ask_password_agent_close();
-
         execvpe(arg_cmdline[0], arg_cmdline, env);
 
         return log_error_errno(errno, "Failed to execute: %m");

From b281686b4f812365adc6896cee6ca20aa69b1baf Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 16:25:19 +0100
Subject: [PATCH 12/13] test: Fix formatting

---
 test/units/TEST-07-PID1.delegate-namespaces.sh | 2 +-
 test/units/TEST-07-PID1.private-pids.sh        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh
index a9afc426ae..8ea58205ba 100755
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -72,7 +72,7 @@ testcase_multiple_features() {
         -p BindReadOnlyPaths=/usr/share \
         -p NoNewPrivileges=yes \
         -p ProtectSystem=strict \
-        -p User=testuser\
+        -p User=testuser \
         -p Group=testuser \
         -p RuntimeDirectory=abc \
         -p StateDirectory=qed \
diff --git a/test/units/TEST-07-PID1.private-pids.sh b/test/units/TEST-07-PID1.private-pids.sh
index 1c9104d435..091535e3da 100755
--- a/test/units/TEST-07-PID1.private-pids.sh
+++ b/test/units/TEST-07-PID1.private-pids.sh
@@ -95,7 +95,7 @@ testcase_multiple_features() {
         -p BindReadOnlyPaths=/usr/share \
         -p NoNewPrivileges=yes \
         -p ProtectSystem=strict \
-        -p User=testuser\
+        -p User=testuser \
         -p Group=testuser \
         -p RuntimeDirectory=abc \
         -p StateDirectory=qed \

From 38748596f0783f2b773bd95d4af4d83f5b5ff872 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Mon, 17 Mar 2025 11:35:23 +0100
Subject: [PATCH 13/13] core: Make DelegateNamespaces= work for user managers
 with CAP_SYS_ADMIN

Currently DelegateNamespaces= only works for services spawned by the
system manager. User managers will always unshare the user namespace
first even if they're running with CAP_SYS_ADMIN.

Let's add support for DelegateNamespaces= for user managers if they're
running with CAP_SYS_ADMIN. By default, we'll still delegate all namespaces
for user managers, but this can now be overridden by explicitly passing
DelegateNamespaces=.

If a user manager is running without CAP_SYS_ADMIN, the user manager is
still always unshared first just like before.
---
 src/core/exec-invoke.c                        | 46 ++++++++-----------
 .../units/TEST-07-PID1.delegate-namespaces.sh | 12 +++++
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index e69c65feca..3e7a15cb33 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -4205,19 +4205,10 @@ static void log_command_line(
                         LOG_EXEC_INVOCATION_ID(params));
 }
 
-static bool exec_context_need_unprivileged_private_users(
-                const ExecContext *context,
-                const ExecParameters *params) {
-
+static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
         assert(context);
         assert(params);
 
-        /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
-         * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
-         * (system manager) then we have privileges and don't need this. */
-        if (params->runtime_scope != RUNTIME_SCOPE_USER)
-                return false;
-
         return context->private_users != PRIVATE_USERS_NO ||
                context->private_tmp != PRIVATE_TMP_NO ||
                context->private_devices ||
@@ -4259,9 +4250,6 @@ static PrivateUsers exec_context_get_effective_private_users(
         if (context->private_users != PRIVATE_USERS_NO)
                 return context->private_users;
 
-        if (exec_context_need_unprivileged_private_users(context, params))
-                return PRIVATE_USERS_SELF;
-
         /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */
         if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL)
                 return PRIVATE_USERS_SELF;
@@ -4272,6 +4260,7 @@ static PrivateUsers exec_context_get_effective_private_users(
 static bool exec_namespace_is_delegated(
                 const ExecContext *context,
                 const ExecParameters *params,
+                bool have_cap_sys_admin,
                 unsigned long namespace) {
 
         assert(context);
@@ -4281,11 +4270,11 @@ static bool exec_namespace_is_delegated(
         /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
          * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
          * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
-        if (exec_context_need_unprivileged_private_users(context, params))
+        if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
                 return false;
 
         if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
-                return false;
+                return params->runtime_scope == RUNTIME_SCOPE_USER;
 
         return FLAGS_SET(context->delegate_namespaces, namespace);
 }
@@ -4318,7 +4307,7 @@ static int setup_delegated_namespaces(
         assert(reterr_exit_status);
 
         if (exec_needs_network_namespace(context) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWNET) == delegate &&
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate &&
             runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
 
                 /* Try to enable network namespacing if network namespacing is available and we have
@@ -4345,7 +4334,7 @@ static int setup_delegated_namespaces(
         }
 
         if (exec_needs_ipc_namespace(context) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWIPC) == delegate &&
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate &&
             runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
 
                 if (ns_type_supported(NAMESPACE_IPC)) {
@@ -4367,7 +4356,7 @@ static int setup_delegated_namespaces(
         }
 
         if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWCGROUP) == delegate) {
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
                 if (unshare(CLONE_NEWCGROUP) < 0) {
                         *reterr_exit_status = EXIT_NAMESPACE;
                         return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m");
@@ -4379,7 +4368,7 @@ static int setup_delegated_namespaces(
         /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
          * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
         if (needs_sandboxing && exec_needs_pid_namespace(context) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWPID) == delegate) {
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
                 if (params->pidref_transport_fd < 0) {
                         *reterr_exit_status = EXIT_NAMESPACE;
                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
@@ -4416,7 +4405,7 @@ static int setup_delegated_namespaces(
         /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */
 
         if (exec_needs_mount_namespace(context, params, runtime) &&
-            exec_namespace_is_delegated(context, params, CLONE_NEWNS) == delegate) {
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) {
                 _cleanup_free_ char *error_path = NULL;
 
                 r = apply_mount_namespace(command->flags,
@@ -4437,7 +4426,8 @@ static int setup_delegated_namespaces(
                 log_exec_debug(context, params, "Set up %smount namespace", delegate ? "delegated " : "");
         }
 
-        if (needs_sandboxing && exec_namespace_is_delegated(context, params, CLONE_NEWUTS) == delegate) {
+        if (needs_sandboxing &&
+            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) {
                 r = apply_protect_hostname(context, params, reterr_exit_status);
                 if (r < 0)
                         return r;
@@ -4645,9 +4635,10 @@ int exec_invoke(
         ino_t journal_stream_ino = 0;
         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
-                needs_mount_namespace;  /* Do we need to set up a mount namespace for this kernel? */
-        bool keep_seccomp_privileges = false;
-        bool have_cap_sys_admin = false;
+                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
+                have_cap_sys_admin,
+                userns_set_up = false,
+                keep_seccomp_privileges = false;
 #if HAVE_SELINUX
         _cleanup_free_ char *mac_selinux_context_net = NULL;
         bool use_selinux = false;
@@ -5373,11 +5364,13 @@ int exec_invoke(
                 }
         }
 
-        if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
+        if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
                 PrivateUsers pu = exec_context_get_effective_private_users(context, params);
+                if (pu == PRIVATE_USERS_NO)
+                        pu = PRIVATE_USERS_SELF;
 
                 /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
                  * unprivileged user namespaces. */
@@ -5392,6 +5385,7 @@ int exec_invoke(
                         log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
                 else {
                         assert(r > 0);
+                        userns_set_up = true;
                         log_debug("Set up unprivileged user namespace");
                 }
         }
@@ -5444,7 +5438,7 @@ int exec_invoke(
          * case of mount namespaces being less privileged when the mount point list is copied from a
          * different user namespace). */
 
-        if (needs_sandboxing && !exec_context_need_unprivileged_private_users(context, params)) {
+        if (needs_sandboxing && !userns_set_up) {
                 PrivateUsers pu = exec_context_get_effective_private_users(context, params);
 
                 r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh
index 8ea58205ba..9bd9691197 100755
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -62,6 +62,18 @@ testcase_implied_private_users_self() {
     systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait bash -c 'test "$(cat /proc/self/uid_map)" == "         0          0      65536"'
 }
 
+testcase_user_manager() {
+    systemctl start user@0
+    # DelegateNamespaces=yes is implied for user managers.
+    systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home
+    # Even those with CAP_SYS_ADMIN.
+    SYSTEMD_LOG_LEVEL=debug systemd-run --machine=.host --user -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home
+    # But can be overridden for user managers that are running with CAP_SYS_ADMIN.
+    (! systemd-run --machine=.host --user -p PrivateMounts=yes -p DelegateNamespaces=no --wait --pipe -- mount --bind /usr /home)
+    # But not for those without CAP_SYS_ADMIN.
+    systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p DelegateNamespaces=no -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home
+}
+
 testcase_multiple_features() {
     unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-delegate-namespaces-root /usr/share/minimal_0.raw