diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 482dbbda80..b31e64f57c 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2027,8 +2027,11 @@ BindReadOnlyPaths=/var/lib/systemd often a good choice if proper user namespacing with distinct UID maps is not appropriate. If the parameter is full, user namespacing is set up with an identity - mapping for all UIDs/GIDs. Similar to identity, this does not provide UID/GID - isolation, but it does provide process capability isolation. + mapping for all UIDs/GIDs. In addition, for system services, full allows the unit + to invoke the setgroups() system call (by setting + /proc/pid/setgroups to allow). + Similar to identity, this does not provide UID/GID isolation, but it does provide + process capability isolation. If this mode is enabled, all unit processes are run without privileges in the host user namespace (regardless if the unit's own user/group is root or not). Specifically diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 682d6449d7..da2d4abd3c 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2077,7 +2077,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) { return 0; } -static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) { +static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) { _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; _cleanup_close_ int unshare_ready_fd = -EBADF; @@ -2196,7 +2196,8 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi if (read(unshare_ready_fd, &c, sizeof(c)) < 0) report_errno_and_exit(errno_pipe[1], -errno); - /* Disable the setgroups() system call in the child user namespace, for good. 
*/ + /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full + * and using the system service manager. */ a = procfs_file_alloca(ppid, "setgroups"); fd = open(a, O_WRONLY|O_CLOEXEC); if (fd < 0) { @@ -2207,8 +2208,9 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi /* If the file is missing the kernel is too old, let's continue anyway. */ } else { - if (write(fd, "deny\n", 5) < 0) { - r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a); + const char *setgroups = allow_setgroups ? "allow\n" : "deny\n"; + if (write(fd, setgroups, strlen(setgroups)) < 0) { + r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a); report_errno_and_exit(errno_pipe[1], r); } @@ -5007,7 +5009,9 @@ int exec_invoke( if (pu == PRIVATE_USERS_NO) pu = PRIVATE_USERS_SELF; - r = setup_private_users(pu, saved_uid, saved_gid, uid, gid); + /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in + * unprivileged user namespaces. */ + r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false); /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let * the actual requested operations fail (or silently continue). */ if (r < 0 && context->private_users != PRIVATE_USERS_NO) { @@ -5177,7 +5181,8 @@ int exec_invoke( * different user namespace). 
*/ if (needs_sandboxing && !userns_set_up) { - r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid); + r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid, + /* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL); if (r < 0) { *exit_status = EXIT_USER; return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m"); diff --git a/test/units/TEST-07-PID1.private-users.sh b/test/units/TEST-07-PID1.private-users.sh index ba85248f96..e788f52a2f 100755 --- a/test/units/TEST-07-PID1.private-users.sh +++ b/test/units/TEST-07-PID1.private-users.sh @@ -6,9 +6,12 @@ set -o pipefail systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"' systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' +systemd-run -p PrivateUsersEx=yes --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"' systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"' systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' +systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/setgroups)" == "allow"'