From af6a8b6969d3f8b2189ea7b8a164910bcf2cc71e Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 10:44:05 +0100 Subject: [PATCH 01/13] exec-invoke: Rename various variables from has_ to have_ All of these encode information of the current process, so have_ is more approriate than has_, which refers to something else. --- src/core/exec-invoke.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 6bb4584a8e..9b21e385a8 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -1473,7 +1473,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) { static bool seccomp_allows_drop_privileges(const ExecContext *c) { void *id, *val; - bool has_capget = false, has_capset = false, has_prctl = false; + bool have_capget = false, have_capset = false, have_prctl = false; assert(c); @@ -1487,17 +1487,17 @@ static bool seccomp_allows_drop_privileges(const ExecContext *c) { name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); if (streq(name, "capget")) - has_capget = true; + have_capget = true; else if (streq(name, "capset")) - has_capset = true; + have_capset = true; else if (streq(name, "prctl")) - has_prctl = true; + have_prctl = true; } if (c->syscall_allow_list) - return has_capget && has_capset && has_prctl; + return have_capget && have_capset && have_prctl; else - return !(has_capget || has_capset || has_prctl); + return !(have_capget || have_capset || have_prctl); } static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char *msg) { @@ -4300,7 +4300,7 @@ static int setup_delegated_namespaces( uid_t gid, const ExecCommand *command, bool needs_sandboxing, - bool has_cap_sys_admin, + bool have_cap_sys_admin, int *reterr_exit_status) { int r; @@ -4391,7 +4391,7 @@ static int setup_delegated_namespaces( * We need to check prior to entering the user namespace because if we're running unprivileged or in a * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not * once we unshare a mount namespace. */ - if (!has_cap_sys_admin) { + if (!have_cap_sys_admin) { r = can_mount_proc(context, params); if (r < 0) { *reterr_exit_status = EXIT_NAMESPACE; @@ -4647,7 +4647,7 @@ int exec_invoke( needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */ needs_mount_namespace; /* Do we need to set up a mount namespace for this kernel? */ bool keep_seccomp_privileges = false; - bool has_cap_sys_admin = false; + bool have_cap_sys_admin = false; #if HAVE_SELINUX _cleanup_free_ char *mac_selinux_context_net = NULL; bool use_selinux = false; @@ -5308,7 +5308,7 @@ int exec_invoke( uint64_t capability_ambient_set = context->capability_ambient_set; /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though its masked. */ - has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0; + have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0; if (needs_sandboxing) { /* MAC enablement checks need to be done before a new mount ns is created, as they rely on @@ -5407,7 +5407,7 @@ int exec_invoke( gid, command, needs_sandboxing, - has_cap_sys_admin, + have_cap_sys_admin, exit_status); if (r < 0) return r; @@ -5468,7 +5468,7 @@ int exec_invoke( gid, command, needs_sandboxing, - has_cap_sys_admin, + have_cap_sys_admin, exit_status); if (r < 0) return r; From ad9b5d4732ec2131aec12fcfed8d141fb720fd95 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 11:29:48 +0100 Subject: [PATCH 02/13] bus-unit-util: Fix DelegateNamespaces= parser Similarly to the config file parse method, let's fix the systemd-run parser as well. Follow up for 11b982053bdc31806e571ea0771d7f10cb276d69 --- src/shared/bus-unit-util.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index f787413161..1e04a051a6 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -1669,13 +1669,17 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con if (STR_IN_SET(field, "RestrictNamespaces", "DelegateNamespaces")) { bool invert = false; + unsigned long all = UPDATE_FLAG(NAMESPACE_FLAGS_ALL, CLONE_NEWUSER, !streq(field, "DelegateNamespaces")); unsigned long flags; r = parse_boolean(eq); if (r > 0) - flags = 0; + /* RestrictNamespaces= value gets stored into a field with reverse semantics (the + * namespaces which are retained), so RestrictNamespaces=true means we retain no + * access to any namespaces and vice-versa. */ + flags = streq(field, "RestrictNamespaces") ? 0 : all; else if (r == 0) - flags = NAMESPACE_FLAGS_ALL; + flags = streq(field, "RestrictNamespaces") ? all : 0; else { if (eq[0] == '~') { invert = true; @@ -1688,7 +1692,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con } if (invert) - flags = (~flags) & NAMESPACE_FLAGS_ALL; + flags = (~flags) & all; r = sd_bus_message_append(m, "(sv)", field, "t", (uint64_t) flags); if (r < 0) From c8912e3d8347fee3a325276177eed132e131ae19 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 11:31:28 +0100 Subject: [PATCH 03/13] TEST-07-PID1.delegate-namespaces: Fix testcase_network() --- test/units/TEST-07-PID1.delegate-namespaces.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index fe0defaeb6..ed447f1508 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -16,7 +16,7 @@ testcase_mount() { testcase_network() { (! systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes --wait --pipe -- ip link add veth1 type veth peer name veth2) - systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- ip link add veth1 type veth peer name veth2 + systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2 } testcase_cgroup() { From 918a188fdab1278fbbe2c960e1da7c655f644340 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 16:17:25 +0100 Subject: [PATCH 04/13] core: Also check if we can mount /proc if pid namespace is delegated If the pid namespace is delegated, it doesn't matter if we have CAP_SYS_ADMIN, we'll still fail to mount /proc if part of it is masked on the host so also check if we can mount /proc if the pid namespace is delegated. --- src/core/exec-invoke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 9b21e385a8..e69c65feca 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -4391,7 +4391,7 @@ static int setup_delegated_namespaces( * We need to check prior to entering the user namespace because if we're running unprivileged or in a * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not * once we unshare a mount namespace. */ - if (!have_cap_sys_admin) { + if (!have_cap_sys_admin || delegate) { r = can_mount_proc(context, params); if (r < 0) { *reterr_exit_status = EXIT_NAMESPACE; From aae4748464d3b8a810f88b3f1d65dd7d5b5ff396 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 16:20:00 +0100 Subject: [PATCH 05/13] TEST-07-PID1.delegate-namespaces: Make sure fully visible procfs is available To be able to mount /proc inside an unprivileged user namespace, we have to make sure a fully visible procfs is available on the host, so let's make sure that's the case. --- test/units/TEST-07-PID1.delegate-namespaces.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index ed447f1508..8eb9956c2e 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -9,6 +9,22 @@ set -o pipefail # shellcheck source=test/units/util.sh . "$(dirname "$0")"/util.sh +# IMPORTANT: For /proc/ to be remounted in pid namespace within an unprivileged user namespace, there needs to +# be at least 1 unmasked procfs mount in ANY directory. Otherwise, if /proc/ is masked (e.g. /proc/scsi is +# over-mounted with tmpfs), then mounting a new /proc/ will fail. +# +# Thus, to guarantee PrivatePIDs=yes tests for unprivileged users pass, we mount a new procfs on a temporary +# directory with no masking. This will guarantee an unprivileged user can mount a new /proc/ successfully. +mkdir -p /tmp/TEST-07-PID1-delegate-namespaces-proc +mount -t proc proc /tmp/TEST-07-PID1-delegate-namespaces-proc + +at_exit() { + umount /tmp/TEST-07-PID1-delegate-namespaces-proc + rm -rf /tmp/TEST-07-PID1-delegate-namespaces-proc +} + +trap at_exit EXIT + testcase_mount() { (! systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home) systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home From b7602662cbf6210052985c25a29550bc2314646d Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 21:22:49 +0100 Subject: [PATCH 06/13] TEST-07-PID1.delegate-namespaces: Fix testcase_pid() Make sure the test has its own /proc and skip it in containers as MountAPIVFS=yes in a container always results in a read-only /proc/sys which means the test can't write to /proc/sys/kernel/ns_last_pid. --- test/units/TEST-07-PID1.delegate-namespaces.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index 8eb9956c2e..061a7cf5b2 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -41,8 +41,12 @@ testcase_cgroup() { } testcase_pid() { - (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid') - systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid' + # MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only + # so we can't write to it when running in a container. + if ! systemd-detect-virt --container; then + (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid') + systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid' + fi } testcase_uts() { From 9a78b5eb7ed39edd86fe9d96fb69199366166db8 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 11:31:48 +0100 Subject: [PATCH 07/13] TEST-07-PID1.delegate-namespaces: Actually run the testcases --- test/units/TEST-07-PID1.delegate-namespaces.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index 061a7cf5b2..a9afc426ae 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -98,3 +98,5 @@ testcase_multiple_features() { rm -rf /tmp/TEST-07-PID1-delegate-namespaces-root } + +run_testcases From 87ddf5188b53a04a08be090b144a8b87a8e44b1b Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 18 Mar 2025 09:48:21 +0100 Subject: [PATCH 08/13] TEST-07-PID1.private-pids: Use --machine=testuser@.host instead of runas Let's use the systemd way to run systemd-run as a different user instead of setpriv. --- test/units/TEST-07-PID1.private-pids.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/units/TEST-07-PID1.private-pids.sh b/test/units/TEST-07-PID1.private-pids.sh index eede43cbaf..1c9104d435 100755 --- a/test/units/TEST-07-PID1.private-pids.sh +++ b/test/units/TEST-07-PID1.private-pids.sh @@ -142,8 +142,8 @@ testcase_unpriv() { mount -t proc proc /tmp/TEST-07-PID1-private-pids-proc # Verify running as unprivileged user can unshare PID namespace and mounts /proc properly. - assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1" - assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1" + assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1" + assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1" umount /tmp/TEST-07-PID1-private-pids-proc rm -rf /tmp/TEST-07-PID1-private-pids-proc @@ -162,7 +162,7 @@ testcase_unpriv() { mount -t tmpfs tmpfs /proc/scsi fi - (! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true) + (! systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes true) if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then umount /proc/scsi From f49b7404b2a49efb8b76afea27f355cade3da6dc Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 12:26:46 +0100 Subject: [PATCH 09/13] capability-util: Ignore unknown capabilities instead of aborting capability_ambient_set_apply() can be called with capability sets containing unknown capabilities. Let's not crash when this is the case but instead ignore the unknown capabilities. This fixes a crash when running the following command: "systemd-run -p "AmbientCapabilities=~" --wait --pipe id" Fixes d5e12dc75e0e356c62e514e9c347efb200fe60e0 --- src/basic/capability-util.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/basic/capability-util.c b/src/basic/capability-util.c index 11d7e95cb6..0b544ea64a 100644 --- a/src/basic/capability-util.c +++ b/src/basic/capability-util.c @@ -114,8 +114,9 @@ int capability_ambient_set_apply(uint64_t set, bool also_inherit) { int r; /* Remove capabilities requested in ambient set, but not in the bounding set */ - BIT_FOREACH(i, set) { - assert((unsigned) i <= cap_last_cap()); + for (unsigned i = 0; i <= cap_last_cap(); i++) { + if (!BIT_SET(set, i)) + continue; if (prctl(PR_CAPBSET_READ, (unsigned long) i) != 1) { log_debug("Ambient capability %s requested but missing from bounding set, suppressing automatically.", From 9e34c34b7b027da24b084a58246c1d88bdbcc817 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 12:28:37 +0100 Subject: [PATCH 10/13] sd_bus_open_user_machine(): Don't shortcut without necessary env Don't shortcut if we don't have the necessary environment variables set in sd_bus_open_user_machine(). --- src/libsystemd/sd-bus/sd-bus.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/libsystemd/sd-bus/sd-bus.c b/src/libsystemd/sd-bus/sd-bus.c index 7c6183d1bb..6429267843 100644 --- a/src/libsystemd/sd-bus/sd-bus.c +++ b/src/libsystemd/sd-bus/sd-bus.c @@ -1760,8 +1760,10 @@ _public_ int sd_bus_open_user_machine(sd_bus **ret, const char *user_and_machine assert_return(user_and_machine, -EINVAL); assert_return(ret, -EINVAL); - /* Shortcut things if we'd end up on this host and as the same user. */ - if (user_and_machine_equivalent(user_and_machine)) + /* Shortcut things if we'd end up on this host and as the same user and have one of the necessary + * environment variables set already. */ + if (user_and_machine_equivalent(user_and_machine) && + (secure_getenv("DBUS_SESSION_BUS_ADDRESS") || secure_getenv("XDG_RUNTIME_DIR"))) return sd_bus_open_user(ret); r = user_and_machine_valid(user_and_machine); From a2425e8a8b51b6c0c100b73ecd7c1b4c8003f21a Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 16:01:26 +0100 Subject: [PATCH 11/13] run: Stop agents before we drop privileges After dropping privileges, we won't be able to stop agents anymore as our signals will get ignored. --- src/run/run.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/run/run.c b/src/run/run.c index cdb5cfefd6..e4ae5373b9 100644 --- a/src/run/run.c +++ b/src/run/run.c @@ -2483,6 +2483,11 @@ static int start_transient_scope(sd_bus *bus) { return log_oom(); } + /* Stop agents before we pass control away and before we drop privileges, to avoid TTY conflicts and + * before we become unable to stop agents. */ + polkit_agent_close(); + ask_password_agent_close(); + if (arg_nice_set) { if (setpriority(PRIO_PROCESS, 0, arg_nice) < 0) return log_error_errno(errno, "Failed to set nice level: %m"); @@ -2571,10 +2576,6 @@ static int start_transient_scope(sd_bus *bus) { } } - /* Stop agents before we pass control away, to avoid TTY conflicts */ - polkit_agent_close(); - ask_password_agent_close(); - execvpe(arg_cmdline[0], arg_cmdline, env); return log_error_errno(errno, "Failed to execute: %m"); From b281686b4f812365adc6896cee6ca20aa69b1baf Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 16:25:19 +0100 Subject: [PATCH 12/13] test: Fix formatting --- test/units/TEST-07-PID1.delegate-namespaces.sh | 2 +- test/units/TEST-07-PID1.private-pids.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index a9afc426ae..8ea58205ba 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -72,7 +72,7 @@ testcase_multiple_features() { -p BindReadOnlyPaths=/usr/share \ -p NoNewPrivileges=yes \ -p ProtectSystem=strict \ - -p User=testuser\ + -p User=testuser \ -p Group=testuser \ -p RuntimeDirectory=abc \ -p StateDirectory=qed \ diff --git a/test/units/TEST-07-PID1.private-pids.sh b/test/units/TEST-07-PID1.private-pids.sh index 1c9104d435..091535e3da 100755 --- a/test/units/TEST-07-PID1.private-pids.sh +++ b/test/units/TEST-07-PID1.private-pids.sh @@ -95,7 +95,7 @@ testcase_multiple_features() { -p BindReadOnlyPaths=/usr/share \ -p NoNewPrivileges=yes \ -p ProtectSystem=strict \ - -p User=testuser\ + -p User=testuser \ -p Group=testuser \ -p RuntimeDirectory=abc \ -p StateDirectory=qed \ From 38748596f0783f2b773bd95d4af4d83f5b5ff872 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 17 Mar 2025 11:35:23 +0100 Subject: [PATCH 13/13] core: Make DelegateNamespaces= work for user managers with CAP_SYS_ADMIN Currently DelegateNamespaces= only works for services spawned by the system manager. User managers will always unshare the user namespace first even if they're running with CAP_SYS_ADMIN. Let's add support for DelegateNamespaces= for user managers if they're running with CAP_SYS_ADMIN. By default, we'll still delegate all namespaces for user managers, but this can now be overridden by explicitly passing DelegateNamespaces=. If a user manager is running without CAP_SYS_ADMIN, the user manager is still always unshared first just like before. --- src/core/exec-invoke.c | 46 ++++++++----------- .../units/TEST-07-PID1.delegate-namespaces.sh | 12 +++++ 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index e69c65feca..3e7a15cb33 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -4205,19 +4205,10 @@ static void log_command_line( LOG_EXEC_INVOCATION_ID(params)); } -static bool exec_context_need_unprivileged_private_users( - const ExecContext *context, - const ExecParameters *params) { - +static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) { assert(context); assert(params); - /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace - * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN - * (system manager) then we have privileges and don't need this. */ - if (params->runtime_scope != RUNTIME_SCOPE_USER) - return false; - return context->private_users != PRIVATE_USERS_NO || context->private_tmp != PRIVATE_TMP_NO || context->private_devices || @@ -4259,9 +4250,6 @@ static PrivateUsers exec_context_get_effective_private_users( if (context->private_users != PRIVATE_USERS_NO) return context->private_users; - if (exec_context_need_unprivileged_private_users(context, params)) - return PRIVATE_USERS_SELF; - /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */ if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL) return PRIVATE_USERS_SELF; @@ -4272,6 +4260,7 @@ static PrivateUsers exec_context_get_effective_private_users( static bool exec_namespace_is_delegated( const ExecContext *context, const ExecParameters *params, + bool have_cap_sys_admin, unsigned long namespace) { assert(context); @@ -4281,11 +4270,11 @@ static bool exec_namespace_is_delegated( /* If we need unprivileged private users, we've already unshared a user namespace by the time we call * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace * unsharing in the first call to setup_delegated_namespaces() by returning false here. */ - if (exec_context_need_unprivileged_private_users(context, params)) + if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) return false; if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL) - return false; + return params->runtime_scope == RUNTIME_SCOPE_USER; return FLAGS_SET(context->delegate_namespaces, namespace); } @@ -4318,7 +4307,7 @@ static int setup_delegated_namespaces( assert(reterr_exit_status); if (exec_needs_network_namespace(context) && - exec_namespace_is_delegated(context, params, CLONE_NEWNET) == delegate && + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) { /* Try to enable network namespacing if network namespacing is available and we have @@ -4345,7 +4334,7 @@ static int setup_delegated_namespaces( } if (exec_needs_ipc_namespace(context) && - exec_namespace_is_delegated(context, params, CLONE_NEWIPC) == delegate && + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) { if (ns_type_supported(NAMESPACE_IPC)) { @@ -4367,7 +4356,7 @@ static int setup_delegated_namespaces( } if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && - exec_namespace_is_delegated(context, params, CLONE_NEWCGROUP) == delegate) { + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) { if (unshare(CLONE_NEWCGROUP) < 0) { *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m"); @@ -4379,7 +4368,7 @@ static int setup_delegated_namespaces( /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible. * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */ if (needs_sandboxing && exec_needs_pid_namespace(context) && - exec_namespace_is_delegated(context, params, CLONE_NEWPID) == delegate) { + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) { if (params->pidref_transport_fd < 0) { *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m"); @@ -4416,7 +4405,7 @@ static int setup_delegated_namespaces( /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */ if (exec_needs_mount_namespace(context, params, runtime) && - exec_namespace_is_delegated(context, params, CLONE_NEWNS) == delegate) { + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) { _cleanup_free_ char *error_path = NULL; r = apply_mount_namespace(command->flags, @@ -4437,7 +4426,8 @@ static int setup_delegated_namespaces( log_exec_debug(context, params, "Set up %smount namespace", delegate ? "delegated " : ""); } - if (needs_sandboxing && exec_namespace_is_delegated(context, params, CLONE_NEWUTS) == delegate) { + if (needs_sandboxing && + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) { r = apply_protect_hostname(context, params, reterr_exit_status); if (r < 0) return r; @@ -4645,9 +4635,10 @@ int exec_invoke( ino_t journal_stream_ino = 0; bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */ needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */ - needs_mount_namespace; /* Do we need to set up a mount namespace for this kernel? */ - bool keep_seccomp_privileges = false; - bool have_cap_sys_admin = false; + needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */ + have_cap_sys_admin, + userns_set_up = false, + keep_seccomp_privileges = false; #if HAVE_SELINUX _cleanup_free_ char *mac_selinux_context_net = NULL; bool use_selinux = false; @@ -5373,11 +5364,13 @@ int exec_invoke( } } - if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) { + if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) { /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */ PrivateUsers pu = exec_context_get_effective_private_users(context, params); + if (pu == PRIVATE_USERS_NO) + pu = PRIVATE_USERS_SELF; /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in * unprivileged user namespaces. */ @@ -5392,6 +5385,7 @@ int exec_invoke( log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m"); else { assert(r > 0); + userns_set_up = true; log_debug("Set up unprivileged user namespace"); } } @@ -5444,7 +5438,7 @@ int exec_invoke( * case of mount namespaces being less privileged when the mount point list is copied from a * different user namespace). */ - if (needs_sandboxing && !exec_context_need_unprivileged_private_users(context, params)) { + if (needs_sandboxing && !userns_set_up) { PrivateUsers pu = exec_context_get_effective_private_users(context, params); r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index 8ea58205ba..9bd9691197 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -62,6 +62,18 @@ testcase_implied_private_users_self() { systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"' } +testcase_user_manager() { + systemctl start user@0 + # DelegateNamespaces=yes is implied for user managers. + systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home + # Even those with CAP_SYS_ADMIN. + SYSTEMD_LOG_LEVEL=debug systemd-run --machine=.host --user -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home + # But can be overridden for user managers that are running with CAP_SYS_ADMIN. + (! systemd-run --machine=.host --user -p PrivateMounts=yes -p DelegateNamespaces=no --wait --pipe -- mount --bind /usr /home) + # But not for those without CAP_SYS_ADMIN. + systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p DelegateNamespaces=no -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home +} + testcase_multiple_features() { unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-delegate-namespaces-root /usr/share/minimal_0.raw