diff --git a/src/basic/capability-util.c b/src/basic/capability-util.c index 11d7e95cb6..0b544ea64a 100644 --- a/src/basic/capability-util.c +++ b/src/basic/capability-util.c @@ -114,8 +114,9 @@ int capability_ambient_set_apply(uint64_t set, bool also_inherit) { int r; /* Remove capabilities requested in ambient set, but not in the bounding set */ - BIT_FOREACH(i, set) { - assert((unsigned) i <= cap_last_cap()); + for (unsigned i = 0; i <= cap_last_cap(); i++) { + if (!BIT_SET(set, i)) + continue; if (prctl(PR_CAPBSET_READ, (unsigned long) i) != 1) { log_debug("Ambient capability %s requested but missing from bounding set, suppressing automatically.", diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 6bb4584a8e..3e7a15cb33 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -1473,7 +1473,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) { static bool seccomp_allows_drop_privileges(const ExecContext *c) { void *id, *val; - bool has_capget = false, has_capset = false, has_prctl = false; + bool have_capget = false, have_capset = false, have_prctl = false; assert(c); @@ -1487,17 +1487,17 @@ static bool seccomp_allows_drop_privileges(const ExecContext *c) { name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); if (streq(name, "capget")) - has_capget = true; + have_capget = true; else if (streq(name, "capset")) - has_capset = true; + have_capset = true; else if (streq(name, "prctl")) - has_prctl = true; + have_prctl = true; } if (c->syscall_allow_list) - return has_capget && has_capset && has_prctl; + return have_capget && have_capset && have_prctl; else - return !(has_capget || has_capset || has_prctl); + return !(have_capget || have_capset || have_prctl); } static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char *msg) { @@ -4205,19 +4205,10 @@ static void log_command_line( LOG_EXEC_INVOCATION_ID(params)); } -static bool exec_context_need_unprivileged_private_users( - const ExecContext *context, - const ExecParameters *params) { - +static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) { assert(context); assert(params); - /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace - * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN - * (system manager) then we have privileges and don't need this. */ - if (params->runtime_scope != RUNTIME_SCOPE_USER) - return false; - return context->private_users != PRIVATE_USERS_NO || context->private_tmp != PRIVATE_TMP_NO || context->private_devices || @@ -4259,9 +4250,6 @@ static PrivateUsers exec_context_get_effective_private_users( if (context->private_users != PRIVATE_USERS_NO) return context->private_users; - if (exec_context_need_unprivileged_private_users(context, params)) - return PRIVATE_USERS_SELF; - /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */ if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL) return PRIVATE_USERS_SELF; @@ -4272,6 +4260,7 @@ static PrivateUsers exec_context_get_effective_private_users( static bool exec_namespace_is_delegated( const ExecContext *context, const ExecParameters *params, + bool have_cap_sys_admin, unsigned long namespace) { assert(context); @@ -4281,11 +4270,11 @@ static bool exec_namespace_is_delegated( /* If we need unprivileged private users, we've already unshared a user namespace by the time we call * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace * unsharing in the first call to setup_delegated_namespaces() by returning false here. */ - if (exec_context_need_unprivileged_private_users(context, params)) + if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) return false; if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL) - return false; + return params->runtime_scope == RUNTIME_SCOPE_USER; return FLAGS_SET(context->delegate_namespaces, namespace); } @@ -4300,7 +4289,7 @@ static int setup_delegated_namespaces( uid_t gid, const ExecCommand *command, bool needs_sandboxing, - bool has_cap_sys_admin, + bool have_cap_sys_admin, int *reterr_exit_status) { int r; @@ -4318,7 +4307,7 @@ static int setup_delegated_namespaces( assert(reterr_exit_status); if (exec_needs_network_namespace(context) && - exec_namespace_is_delegated(context, params, CLONE_NEWNET) == delegate && + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) { /* Try to enable network namespacing if network namespacing is available and we have @@ -4345,7 +4334,7 @@ static int setup_delegated_namespaces( } if (exec_needs_ipc_namespace(context) && - exec_namespace_is_delegated(context, params, CLONE_NEWIPC) == delegate && + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) { if (ns_type_supported(NAMESPACE_IPC)) { @@ -4367,7 +4356,7 @@ static int setup_delegated_namespaces( } if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && - exec_namespace_is_delegated(context, params, CLONE_NEWCGROUP) == delegate) { + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) { if (unshare(CLONE_NEWCGROUP) < 0) { *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m"); @@ -4379,7 +4368,7 @@ static int setup_delegated_namespaces( /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible. * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */ if (needs_sandboxing && exec_needs_pid_namespace(context) && - exec_namespace_is_delegated(context, params, CLONE_NEWPID) == delegate) { + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) { if (params->pidref_transport_fd < 0) { *reterr_exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m"); @@ -4391,7 +4380,7 @@ static int setup_delegated_namespaces( * We need to check prior to entering the user namespace because if we're running unprivileged or in a * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not * once we unshare a mount namespace. */ - if (!has_cap_sys_admin) { + if (!have_cap_sys_admin || delegate) { r = can_mount_proc(context, params); if (r < 0) { *reterr_exit_status = EXIT_NAMESPACE; @@ -4416,7 +4405,7 @@ static int setup_delegated_namespaces( /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */ if (exec_needs_mount_namespace(context, params, runtime) && - exec_namespace_is_delegated(context, params, CLONE_NEWNS) == delegate) { + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) { _cleanup_free_ char *error_path = NULL; r = apply_mount_namespace(command->flags, @@ -4437,7 +4426,8 @@ static int setup_delegated_namespaces( log_exec_debug(context, params, "Set up %smount namespace", delegate ? "delegated " : ""); } - if (needs_sandboxing && exec_namespace_is_delegated(context, params, CLONE_NEWUTS) == delegate) { + if (needs_sandboxing && + exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) { r = apply_protect_hostname(context, params, reterr_exit_status); if (r < 0) return r; @@ -4645,9 +4635,10 @@ int exec_invoke( ino_t journal_stream_ino = 0; bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */ needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */ - needs_mount_namespace; /* Do we need to set up a mount namespace for this kernel? */ - bool keep_seccomp_privileges = false; - bool has_cap_sys_admin = false; + needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */ + have_cap_sys_admin, + userns_set_up = false, + keep_seccomp_privileges = false; #if HAVE_SELINUX _cleanup_free_ char *mac_selinux_context_net = NULL; bool use_selinux = false; @@ -5308,7 +5299,7 @@ int exec_invoke( uint64_t capability_ambient_set = context->capability_ambient_set; /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though its masked. */ - has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0; + have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0; if (needs_sandboxing) { /* MAC enablement checks need to be done before a new mount ns is created, as they rely on @@ -5373,11 +5364,13 @@ int exec_invoke( } } - if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) { + if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) { /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */ PrivateUsers pu = exec_context_get_effective_private_users(context, params); + if (pu == PRIVATE_USERS_NO) + pu = PRIVATE_USERS_SELF; /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in * unprivileged user namespaces. */ @@ -5392,6 +5385,7 @@ int exec_invoke( log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m"); else { assert(r > 0); + userns_set_up = true; log_debug("Set up unprivileged user namespace"); } } @@ -5407,7 +5401,7 @@ int exec_invoke( gid, command, needs_sandboxing, - has_cap_sys_admin, + have_cap_sys_admin, exit_status); if (r < 0) return r; @@ -5444,7 +5438,7 @@ int exec_invoke( * case of mount namespaces being less privileged when the mount point list is copied from a * different user namespace). */ - if (needs_sandboxing && !exec_context_need_unprivileged_private_users(context, params)) { + if (needs_sandboxing && !userns_set_up) { PrivateUsers pu = exec_context_get_effective_private_users(context, params); r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, @@ -5468,7 +5462,7 @@ int exec_invoke( gid, command, needs_sandboxing, - has_cap_sys_admin, + have_cap_sys_admin, exit_status); if (r < 0) return r; diff --git a/src/libsystemd/sd-bus/sd-bus.c b/src/libsystemd/sd-bus/sd-bus.c index 7c6183d1bb..6429267843 100644 --- a/src/libsystemd/sd-bus/sd-bus.c +++ b/src/libsystemd/sd-bus/sd-bus.c @@ -1760,8 +1760,10 @@ _public_ int sd_bus_open_user_machine(sd_bus **ret, const char *user_and_machine assert_return(user_and_machine, -EINVAL); assert_return(ret, -EINVAL); - /* Shortcut things if we'd end up on this host and as the same user. */ - if (user_and_machine_equivalent(user_and_machine)) + /* Shortcut things if we'd end up on this host and as the same user and have one of the necessary + * environment variables set already. */ + if (user_and_machine_equivalent(user_and_machine) && + (secure_getenv("DBUS_SESSION_BUS_ADDRESS") || secure_getenv("XDG_RUNTIME_DIR"))) return sd_bus_open_user(ret); r = user_and_machine_valid(user_and_machine); diff --git a/src/run/run.c b/src/run/run.c index cdb5cfefd6..e4ae5373b9 100644 --- a/src/run/run.c +++ b/src/run/run.c @@ -2483,6 +2483,11 @@ static int start_transient_scope(sd_bus *bus) { return log_oom(); } + /* Stop agents before we pass control away and before we drop privileges, to avoid TTY conflicts and + * before we become unable to stop agents. */ + polkit_agent_close(); + ask_password_agent_close(); + if (arg_nice_set) { if (setpriority(PRIO_PROCESS, 0, arg_nice) < 0) return log_error_errno(errno, "Failed to set nice level: %m"); @@ -2571,10 +2576,6 @@ static int start_transient_scope(sd_bus *bus) { } } - /* Stop agents before we pass control away, to avoid TTY conflicts */ - polkit_agent_close(); - ask_password_agent_close(); - execvpe(arg_cmdline[0], arg_cmdline, env); return log_error_errno(errno, "Failed to execute: %m"); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index f787413161..1e04a051a6 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -1669,13 +1669,17 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con if (STR_IN_SET(field, "RestrictNamespaces", "DelegateNamespaces")) { bool invert = false; + unsigned long all = UPDATE_FLAG(NAMESPACE_FLAGS_ALL, CLONE_NEWUSER, !streq(field, "DelegateNamespaces")); unsigned long flags; r = parse_boolean(eq); if (r > 0) - flags = 0; + /* RestrictNamespaces= value gets stored into a field with reverse semantics (the + * namespaces which are retained), so RestrictNamespaces=true means we retain no + * access to any namespaces and vice-versa. */ + flags = streq(field, "RestrictNamespaces") ? 0 : all; else if (r == 0) - flags = NAMESPACE_FLAGS_ALL; + flags = streq(field, "RestrictNamespaces") ? all : 0; else { if (eq[0] == '~') { invert = true; @@ -1688,7 +1692,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con } if (invert) - flags = (~flags) & NAMESPACE_FLAGS_ALL; + flags = (~flags) & all; r = sd_bus_message_append(m, "(sv)", field, "t", (uint64_t) flags); if (r < 0) diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh index fe0defaeb6..9bd9691197 100755 --- a/test/units/TEST-07-PID1.delegate-namespaces.sh +++ b/test/units/TEST-07-PID1.delegate-namespaces.sh @@ -9,6 +9,22 @@ set -o pipefail # shellcheck source=test/units/util.sh . "$(dirname "$0")"/util.sh +# IMPORTANT: For /proc/ to be remounted in pid namespace within an unprivileged user namespace, there needs to +# be at least 1 unmasked procfs mount in ANY directory. Otherwise, if /proc/ is masked (e.g. /proc/scsi is +# over-mounted with tmpfs), then mounting a new /proc/ will fail. +# +# Thus, to guarantee PrivatePIDs=yes tests for unprivileged users pass, we mount a new procfs on a temporary +# directory with no masking. This will guarantee an unprivileged user can mount a new /proc/ successfully. +mkdir -p /tmp/TEST-07-PID1-delegate-namespaces-proc +mount -t proc proc /tmp/TEST-07-PID1-delegate-namespaces-proc + +at_exit() { + umount /tmp/TEST-07-PID1-delegate-namespaces-proc + rm -rf /tmp/TEST-07-PID1-delegate-namespaces-proc +} + +trap at_exit EXIT + testcase_mount() { (! systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home) systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home @@ -16,7 +32,7 @@ testcase_mount() { testcase_network() { (! systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes --wait --pipe -- ip link add veth1 type veth peer name veth2) - systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- ip link add veth1 type veth peer name veth2 + systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2 } testcase_cgroup() { @@ -25,8 +41,12 @@ testcase_cgroup() { } testcase_pid() { - (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid') - systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid' + # MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only + # so we can't write to it when running in a container. + if ! systemd-detect-virt --container; then + (! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid') + systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid' + fi } testcase_uts() { @@ -42,6 +62,18 @@ testcase_implied_private_users_self() { systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"' } +testcase_user_manager() { + systemctl start user@0 + # DelegateNamespaces=yes is implied for user managers. + systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home + # Even those with CAP_SYS_ADMIN. + SYSTEMD_LOG_LEVEL=debug systemd-run --machine=.host --user -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home + # But can be overridden for user managers that are running with CAP_SYS_ADMIN. + (! systemd-run --machine=.host --user -p PrivateMounts=yes -p DelegateNamespaces=no --wait --pipe -- mount --bind /usr /home) + # But not for those without CAP_SYS_ADMIN. + systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p DelegateNamespaces=no -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home +} + testcase_multiple_features() { unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-delegate-namespaces-root /usr/share/minimal_0.raw @@ -52,7 +84,7 @@ testcase_multiple_features() { -p BindReadOnlyPaths=/usr/share \ -p NoNewPrivileges=yes \ -p ProtectSystem=strict \ - -p User=testuser\ + -p User=testuser \ -p Group=testuser \ -p RuntimeDirectory=abc \ -p StateDirectory=qed \ @@ -78,3 +110,5 @@ testcase_multiple_features() { rm -rf /tmp/TEST-07-PID1-delegate-namespaces-root } + +run_testcases diff --git a/test/units/TEST-07-PID1.private-pids.sh b/test/units/TEST-07-PID1.private-pids.sh index eede43cbaf..091535e3da 100755 --- a/test/units/TEST-07-PID1.private-pids.sh +++ b/test/units/TEST-07-PID1.private-pids.sh @@ -95,7 +95,7 @@ testcase_multiple_features() { -p BindReadOnlyPaths=/usr/share \ -p NoNewPrivileges=yes \ -p ProtectSystem=strict \ - -p User=testuser\ + -p User=testuser \ -p Group=testuser \ -p RuntimeDirectory=abc \ -p StateDirectory=qed \ @@ -142,8 +142,8 @@ testcase_unpriv() { mount -t proc proc /tmp/TEST-07-PID1-private-pids-proc # Verify running as unprivileged user can unshare PID namespace and mounts /proc properly. - assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1" - assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1" + assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1" + assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1" umount /tmp/TEST-07-PID1-private-pids-proc rm -rf /tmp/TEST-07-PID1-private-pids-proc @@ -162,7 +162,7 @@ testcase_unpriv() { mount -t tmpfs tmpfs /proc/scsi fi - (! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true) + (! systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes true) if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then umount /proc/scsi