diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 545af6033a..0413ac9025 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2009,8 +2009,8 @@ BindReadOnlyPaths=/var/lib/systemd PrivateUsers= - Takes a boolean argument or one of self or - identity. Defaults to false. If enabled, sets up a new user namespace for the + Takes a boolean argument or one of self, identity, + or full. Defaults to false. If enabled, sets up a new user namespace for the executed processes and configures a user and group mapping. If set to a true value or self, a minimal user and group mapping is configured that maps the root user and group as well as the unit's own user and group to themselves and @@ -2026,6 +2026,13 @@ BindReadOnlyPaths=/var/lib/systemd since all UIDs/GIDs are chosen identically it does provide process capability isolation, and hence is often a good choice if proper user namespacing with distinct UID maps is not appropriate. + If the parameter is full, user namespacing is set up with an identity + mapping for all UIDs/GIDs. In addition, for system services, full allows the unit + to call setgroups() system calls (by setting + /proc/pid/setgroups to allow). + Similar to identity, this does not provide UID/GID isolation, but it does provide + process capability isolation. + If this mode is enabled, all unit processes are run without privileges in the host user namespace (regardless if the unit's own user/group is root or not). 
Specifically this means that the process will have zero process capabilities on the host's user namespace, but diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index fd306f1143..520a57a198 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2079,7 +2079,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) { return 0; } -static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) { +static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) { _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; _cleanup_close_ int unshare_ready_fd = -EBADF; @@ -2105,6 +2105,29 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi uid_map = strdup("0 0 65536\n"); if (!uid_map) return -ENOMEM; + } else if (private_users == PRIVATE_USERS_FULL) { + /* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because + * this is the same UID/GID map as the init user namespace and systemd's running_in_userns() + * checks whether it's in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`. + * Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user + * namespace from the init namespace: + * 0 0 1 + * 1 1 UINT32_MAX - 1 + * + * systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258 + * (PR #35382). But some users may be running a container image with older systemd < 258 so we keep + * this uid_map/gid_map hack until version 259 for version N-1 compatibility. + * + * TODO: Switch to `0 0 UINT32_MAX` in systemd v259. + * + * Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though + * the UID range beyond INT32_MAX (i.e. the range above the signed 32-bit range) is + * icky. 
For example, setfsuid() returns the old UID as a signed integer. But units can decide to + * use these UIDs/GIDs so we need to map them. */ + r = asprintf(&uid_map, "0 0 1\n" + "1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1)); + if (r < 0) + return -ENOMEM; /* Can only set up multiple mappings with CAP_SETUID. */ } else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) { r = asprintf(&uid_map, @@ -2125,6 +2148,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi gid_map = strdup("0 0 65536\n"); if (!gid_map) return -ENOMEM; + } else if (private_users == PRIVATE_USERS_FULL) { + r = asprintf(&gid_map, "0 0 1\n" + "1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1)); + if (r < 0) + return -ENOMEM; /* Can only set up multiple mappings with CAP_SETGID. */ } else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) { r = asprintf(&gid_map, @@ -2170,7 +2198,8 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi if (read(unshare_ready_fd, &c, sizeof(c)) < 0) report_errno_and_exit(errno_pipe[1], -errno); - /* Disable the setgroups() system call in the child user namespace, for good. */ + /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full + * and using the system service manager. */ a = procfs_file_alloca(ppid, "setgroups"); fd = open(a, O_WRONLY|O_CLOEXEC); if (fd < 0) { @@ -2181,8 +2210,9 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi /* If the file is missing the kernel is too old, let's continue anyway. */ } else { - if (write(fd, "deny\n", 5) < 0) { - r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a); + const char *setgroups = allow_setgroups ? 
"allow\n" : "deny\n"; + if (write(fd, setgroups, strlen(setgroups)) < 0) { + r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a); report_errno_and_exit(errno_pipe[1], r); } @@ -4984,7 +5014,9 @@ int exec_invoke( if (pu == PRIVATE_USERS_NO) pu = PRIVATE_USERS_SELF; - r = setup_private_users(pu, saved_uid, saved_gid, uid, gid); + /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in + * unprivileged user namespaces. */ + r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false); /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let * the actual requested operations fail (or silently continue). */ if (r < 0 && context->private_users != PRIVATE_USERS_NO) { @@ -5154,7 +5186,8 @@ int exec_invoke( * different user namespace). */ if (needs_sandboxing && !userns_set_up) { - r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid); + r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid, + /* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL); if (r < 0) { *exit_status = EXIT_USER; return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m"); diff --git a/src/core/namespace.c b/src/core/namespace.c index c9362e55ab..3ac1eba12f 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -3380,6 +3380,7 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = { [PRIVATE_USERS_NO] = "no", [PRIVATE_USERS_SELF] = "self", [PRIVATE_USERS_IDENTITY] = "identity", + [PRIVATE_USERS_FULL] = "full", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF); diff --git a/src/core/namespace.h b/src/core/namespace.h index 96f62be30a..7ad4ef002d 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -73,6 +73,7 @@ typedef enum PrivateUsers { PRIVATE_USERS_NO, PRIVATE_USERS_SELF, PRIVATE_USERS_IDENTITY, 
+ PRIVATE_USERS_FULL, _PRIVATE_USERS_MAX, _PRIVATE_USERS_INVALID = -EINVAL, } PrivateUsers; diff --git a/test/units/TEST-07-PID1.private-users.sh b/test/units/TEST-07-PID1.private-users.sh index 2475b5d365..e788f52a2f 100755 --- a/test/units/TEST-07-PID1.private-users.sh +++ b/test/units/TEST-07-PID1.private-users.sh @@ -6,7 +6,12 @@ set -o pipefail systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"' systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' +systemd-run -p PrivateUsersEx=yes --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"' systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"' systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' +systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/setgroups)" == "allow"'