core: Make DelegateNamespaces= work for user managers with CAP_SYS_ADMIN (#36771)

Currently DelegateNamespaces= only works for services spawned by the
system manager. User managers will always unshare the user namespace
first even if they're running with CAP_SYS_ADMIN.

Let's add support for DelegateNamespaces= for user managers if they're
running with CAP_SYS_ADMIN. By default, we'll still delegate all namespaces
for user managers, but this can now be overridden by explicitly passing
DelegateNamespaces=.

If a user manager is running without CAP_SYS_ADMIN, the user namespace is
still always unshared first just like before.
This commit is contained in:
Yu Watanabe
2025-03-20 06:28:18 +09:00
committed by GitHub
7 changed files with 92 additions and 56 deletions

View File

@@ -114,8 +114,9 @@ int capability_ambient_set_apply(uint64_t set, bool also_inherit) {
int r;
/* Remove capabilities requested in ambient set, but not in the bounding set */
BIT_FOREACH(i, set) {
assert((unsigned) i <= cap_last_cap());
for (unsigned i = 0; i <= cap_last_cap(); i++) {
if (!BIT_SET(set, i))
continue;
if (prctl(PR_CAPBSET_READ, (unsigned long) i) != 1) {
log_debug("Ambient capability %s requested but missing from bounding set, suppressing automatically.",

View File

@@ -1473,7 +1473,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) {
static bool seccomp_allows_drop_privileges(const ExecContext *c) {
void *id, *val;
bool has_capget = false, has_capset = false, has_prctl = false;
bool have_capget = false, have_capset = false, have_prctl = false;
assert(c);
@@ -1487,17 +1487,17 @@ static bool seccomp_allows_drop_privileges(const ExecContext *c) {
name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
if (streq(name, "capget"))
has_capget = true;
have_capget = true;
else if (streq(name, "capset"))
has_capset = true;
have_capset = true;
else if (streq(name, "prctl"))
has_prctl = true;
have_prctl = true;
}
if (c->syscall_allow_list)
return has_capget && has_capset && has_prctl;
return have_capget && have_capset && have_prctl;
else
return !(has_capget || has_capset || has_prctl);
return !(have_capget || have_capset || have_prctl);
}
static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char *msg) {
@@ -4205,19 +4205,10 @@ static void log_command_line(
LOG_EXEC_INVOCATION_ID(params));
}
static bool exec_context_need_unprivileged_private_users(
const ExecContext *context,
const ExecParameters *params) {
static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
assert(context);
assert(params);
/* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
* to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
* (system manager) then we have privileges and don't need this. */
if (params->runtime_scope != RUNTIME_SCOPE_USER)
return false;
return context->private_users != PRIVATE_USERS_NO ||
context->private_tmp != PRIVATE_TMP_NO ||
context->private_devices ||
@@ -4259,9 +4250,6 @@ static PrivateUsers exec_context_get_effective_private_users(
if (context->private_users != PRIVATE_USERS_NO)
return context->private_users;
if (exec_context_need_unprivileged_private_users(context, params))
return PRIVATE_USERS_SELF;
/* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */
if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL)
return PRIVATE_USERS_SELF;
@@ -4272,6 +4260,7 @@ static PrivateUsers exec_context_get_effective_private_users(
static bool exec_namespace_is_delegated(
const ExecContext *context,
const ExecParameters *params,
bool have_cap_sys_admin,
unsigned long namespace) {
assert(context);
@@ -4281,11 +4270,11 @@ static bool exec_namespace_is_delegated(
/* If we need unprivileged private users, we've already unshared a user namespace by the time we call
* setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
* unsharing in the first call to setup_delegated_namespaces() by returning false here. */
if (exec_context_need_unprivileged_private_users(context, params))
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
return false;
if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
return false;
return params->runtime_scope == RUNTIME_SCOPE_USER;
return FLAGS_SET(context->delegate_namespaces, namespace);
}
@@ -4300,7 +4289,7 @@ static int setup_delegated_namespaces(
uid_t gid,
const ExecCommand *command,
bool needs_sandboxing,
bool has_cap_sys_admin,
bool have_cap_sys_admin,
int *reterr_exit_status) {
int r;
@@ -4318,7 +4307,7 @@ static int setup_delegated_namespaces(
assert(reterr_exit_status);
if (exec_needs_network_namespace(context) &&
exec_namespace_is_delegated(context, params, CLONE_NEWNET) == delegate &&
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate &&
runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
/* Try to enable network namespacing if network namespacing is available and we have
@@ -4345,7 +4334,7 @@ static int setup_delegated_namespaces(
}
if (exec_needs_ipc_namespace(context) &&
exec_namespace_is_delegated(context, params, CLONE_NEWIPC) == delegate &&
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate &&
runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
if (ns_type_supported(NAMESPACE_IPC)) {
@@ -4367,7 +4356,7 @@ static int setup_delegated_namespaces(
}
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
exec_namespace_is_delegated(context, params, CLONE_NEWCGROUP) == delegate) {
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
if (unshare(CLONE_NEWCGROUP) < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m");
@@ -4379,7 +4368,7 @@ static int setup_delegated_namespaces(
/* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
* Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
if (needs_sandboxing && exec_needs_pid_namespace(context) &&
exec_namespace_is_delegated(context, params, CLONE_NEWPID) == delegate) {
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
if (params->pidref_transport_fd < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
@@ -4391,7 +4380,7 @@ static int setup_delegated_namespaces(
* We need to check prior to entering the user namespace because if we're running unprivileged or in a
* system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
* once we unshare a mount namespace. */
if (!has_cap_sys_admin) {
if (!have_cap_sys_admin || delegate) {
r = can_mount_proc(context, params);
if (r < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
@@ -4416,7 +4405,7 @@ static int setup_delegated_namespaces(
/* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */
if (exec_needs_mount_namespace(context, params, runtime) &&
exec_namespace_is_delegated(context, params, CLONE_NEWNS) == delegate) {
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) {
_cleanup_free_ char *error_path = NULL;
r = apply_mount_namespace(command->flags,
@@ -4437,7 +4426,8 @@ static int setup_delegated_namespaces(
log_exec_debug(context, params, "Set up %smount namespace", delegate ? "delegated " : "");
}
if (needs_sandboxing && exec_namespace_is_delegated(context, params, CLONE_NEWUTS) == delegate) {
if (needs_sandboxing &&
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) {
r = apply_protect_hostname(context, params, reterr_exit_status);
if (r < 0)
return r;
@@ -4645,9 +4635,10 @@ int exec_invoke(
ino_t journal_stream_ino = 0;
bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
needs_mount_namespace; /* Do we need to set up a mount namespace for this kernel? */
bool keep_seccomp_privileges = false;
bool has_cap_sys_admin = false;
needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
have_cap_sys_admin,
userns_set_up = false,
keep_seccomp_privileges = false;
#if HAVE_SELINUX
_cleanup_free_ char *mac_selinux_context_net = NULL;
bool use_selinux = false;
@@ -5308,7 +5299,7 @@ int exec_invoke(
uint64_t capability_ambient_set = context->capability_ambient_set;
/* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it's masked. */
has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
if (needs_sandboxing) {
/* MAC enablement checks need to be done before a new mount ns is created, as they rely on
@@ -5373,11 +5364,13 @@ int exec_invoke(
}
}
if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
PrivateUsers pu = exec_context_get_effective_private_users(context, params);
if (pu == PRIVATE_USERS_NO)
pu = PRIVATE_USERS_SELF;
/* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
* unprivileged user namespaces. */
@@ -5392,6 +5385,7 @@ int exec_invoke(
log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
else {
assert(r > 0);
userns_set_up = true;
log_debug("Set up unprivileged user namespace");
}
}
@@ -5407,7 +5401,7 @@ int exec_invoke(
gid,
command,
needs_sandboxing,
has_cap_sys_admin,
have_cap_sys_admin,
exit_status);
if (r < 0)
return r;
@@ -5444,7 +5438,7 @@ int exec_invoke(
* case of mount namespaces being less privileged when the mount point list is copied from a
* different user namespace). */
if (needs_sandboxing && !exec_context_need_unprivileged_private_users(context, params)) {
if (needs_sandboxing && !userns_set_up) {
PrivateUsers pu = exec_context_get_effective_private_users(context, params);
r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
@@ -5468,7 +5462,7 @@ int exec_invoke(
gid,
command,
needs_sandboxing,
has_cap_sys_admin,
have_cap_sys_admin,
exit_status);
if (r < 0)
return r;

View File

@@ -1760,8 +1760,10 @@ _public_ int sd_bus_open_user_machine(sd_bus **ret, const char *user_and_machine
assert_return(user_and_machine, -EINVAL);
assert_return(ret, -EINVAL);
/* Shortcut things if we'd end up on this host and as the same user. */
if (user_and_machine_equivalent(user_and_machine))
/* Shortcut things if we'd end up on this host and as the same user and have one of the necessary
* environment variables set already. */
if (user_and_machine_equivalent(user_and_machine) &&
(secure_getenv("DBUS_SESSION_BUS_ADDRESS") || secure_getenv("XDG_RUNTIME_DIR")))
return sd_bus_open_user(ret);
r = user_and_machine_valid(user_and_machine);

View File

@@ -2483,6 +2483,11 @@ static int start_transient_scope(sd_bus *bus) {
return log_oom();
}
/* Stop agents before we pass control away and before we drop privileges, to avoid TTY conflicts and
* before we become unable to stop agents. */
polkit_agent_close();
ask_password_agent_close();
if (arg_nice_set) {
if (setpriority(PRIO_PROCESS, 0, arg_nice) < 0)
return log_error_errno(errno, "Failed to set nice level: %m");
@@ -2571,10 +2576,6 @@ static int start_transient_scope(sd_bus *bus) {
}
}
/* Stop agents before we pass control away, to avoid TTY conflicts */
polkit_agent_close();
ask_password_agent_close();
execvpe(arg_cmdline[0], arg_cmdline, env);
return log_error_errno(errno, "Failed to execute: %m");

View File

@@ -1669,13 +1669,17 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
if (STR_IN_SET(field, "RestrictNamespaces",
"DelegateNamespaces")) {
bool invert = false;
unsigned long all = UPDATE_FLAG(NAMESPACE_FLAGS_ALL, CLONE_NEWUSER, !streq(field, "DelegateNamespaces"));
unsigned long flags;
r = parse_boolean(eq);
if (r > 0)
flags = 0;
/* RestrictNamespaces= value gets stored into a field with reverse semantics (the
* namespaces which are retained), so RestrictNamespaces=true means we retain no
* access to any namespaces and vice-versa. */
flags = streq(field, "RestrictNamespaces") ? 0 : all;
else if (r == 0)
flags = NAMESPACE_FLAGS_ALL;
flags = streq(field, "RestrictNamespaces") ? all : 0;
else {
if (eq[0] == '~') {
invert = true;
@@ -1688,7 +1692,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
}
if (invert)
flags = (~flags) & NAMESPACE_FLAGS_ALL;
flags = (~flags) & all;
r = sd_bus_message_append(m, "(sv)", field, "t", (uint64_t) flags);
if (r < 0)

View File

@@ -9,6 +9,22 @@ set -o pipefail
# shellcheck source=test/units/util.sh
. "$(dirname "$0")"/util.sh
# IMPORTANT: For /proc/ to be remounted in pid namespace within an unprivileged user namespace, there needs to
# be at least 1 unmasked procfs mount in ANY directory. Otherwise, if /proc/ is masked (e.g. /proc/scsi is
# over-mounted with tmpfs), then mounting a new /proc/ will fail.
#
# Thus, to guarantee PrivatePIDs=yes tests for unprivileged users pass, we mount a new procfs on a temporary
# directory with no masking. This will guarantee an unprivileged user can mount a new /proc/ successfully.
mkdir -p /tmp/TEST-07-PID1-delegate-namespaces-proc
mount -t proc proc /tmp/TEST-07-PID1-delegate-namespaces-proc
# Cleanup handler, registered below via 'trap at_exit EXIT': unmounts the unmasked helper procfs that was
# mounted on the temporary directory above and then removes that directory.
at_exit() {
umount /tmp/TEST-07-PID1-delegate-namespaces-proc
rm -rf /tmp/TEST-07-PID1-delegate-namespaces-proc
}
trap at_exit EXIT
testcase_mount() {
(! systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home)
systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
@@ -16,7 +32,7 @@ testcase_mount() {
testcase_network() {
(! systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes --wait --pipe -- ip link add veth1 type veth peer name veth2)
systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- ip link add veth1 type veth peer name veth2
systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
}
testcase_cgroup() {
@@ -25,8 +41,12 @@ testcase_cgroup() {
}
testcase_pid() {
(! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
# MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only
# so we can't write to it when running in a container.
if ! systemd-detect-virt --container; then
(! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
fi
}
testcase_uts() {
@@ -42,6 +62,18 @@ testcase_implied_private_users_self() {
systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"'
}
# Exercises DelegateNamespaces= for units run by user managers. Each check attempts a bind mount inside a
# private mount namespace (PrivateMounts=yes) and uses its success or failure as the probe.
# NOTE(review): going by the comments below, the --machine=.host manager (presumably user@0, started
# first) runs with CAP_SYS_ADMIN while the testuser@.host manager does not - confirm.
testcase_user_manager() {
systemctl start user@0
# DelegateNamespaces=yes is implied for user managers.
systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home
# This also holds for user managers that are running with CAP_SYS_ADMIN.
SYSTEMD_LOG_LEVEL=debug systemd-run --machine=.host --user -p PrivateMounts=yes --wait --pipe -- mount --bind /usr /home
# But it can be overridden for user managers that are running with CAP_SYS_ADMIN, so the bind mount is
# expected to fail here (negated subshell).
(! systemd-run --machine=.host --user -p PrivateMounts=yes -p DelegateNamespaces=no --wait --pipe -- mount --bind /usr /home)
# But not for those without CAP_SYS_ADMIN: the bind mount is still expected to succeed despite
# DelegateNamespaces=no.
systemd-run --machine=testuser@.host --user -p PrivateMounts=yes -p DelegateNamespaces=no -p AmbientCapabilities="~" --wait --pipe -- mount --bind /usr /home
}
testcase_multiple_features() {
unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-delegate-namespaces-root /usr/share/minimal_0.raw
@@ -52,7 +84,7 @@ testcase_multiple_features() {
-p BindReadOnlyPaths=/usr/share \
-p NoNewPrivileges=yes \
-p ProtectSystem=strict \
-p User=testuser\
-p User=testuser \
-p Group=testuser \
-p RuntimeDirectory=abc \
-p StateDirectory=qed \
@@ -78,3 +110,5 @@ testcase_multiple_features() {
rm -rf /tmp/TEST-07-PID1-delegate-namespaces-root
}
run_testcases

View File

@@ -95,7 +95,7 @@ testcase_multiple_features() {
-p BindReadOnlyPaths=/usr/share \
-p NoNewPrivileges=yes \
-p ProtectSystem=strict \
-p User=testuser\
-p User=testuser \
-p Group=testuser \
-p RuntimeDirectory=abc \
-p StateDirectory=qed \
@@ -142,8 +142,8 @@ testcase_unpriv() {
mount -t proc proc /tmp/TEST-07-PID1-private-pids-proc
# Verify running as unprivileged user can unshare PID namespace and mounts /proc properly.
assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"
assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
assert_eq "$(systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"
umount /tmp/TEST-07-PID1-private-pids-proc
rm -rf /tmp/TEST-07-PID1-private-pids-proc
@@ -162,7 +162,7 @@ testcase_unpriv() {
mount -t tmpfs tmpfs /proc/scsi
fi
(! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true)
(! systemd-run --machine=testuser@.host --wait --user --pipe -p PrivatePIDs=yes true)
if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
umount /proc/scsi