diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index a9c8664c4a..9e7af81f1e 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -1177,14 +1177,56 @@ static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int fl } #endif +static int attach_to_subcgroup( + const ExecContext *context, + const CGroupContext *cgroup_context, + const ExecParameters *params, + const char *prefix) { + + _cleanup_free_ char *subgroup = NULL; + int r; + + assert(context); + assert(cgroup_context); + assert(params); + + /* If we're a control process that needs a subgroup, we've already been spawned into it as otherwise + * we'd violate the "no inner processes" rule, so no need to do anything. */ + if (exec_params_needs_control_subcgroup(params)) + return 0; + + r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup); + if (r < 0) + return log_error_errno(r, "Failed to acquire cgroup path: %m"); + /* No subgroup required? Then there's nothing to do. */ + if (r == 0) + return 0; + + r = cg_attach(subgroup, 0); + if (r == -EUCLEAN) + return log_error_errno(r, + "Failed to attach process " PID_FMT " to cgroup '%s', " + "because the cgroup or one of its parents or " + "siblings is in the threaded mode.", + getpid_cached(), subgroup); + if (r < 0) + return log_error_errno(r, + "Failed to attach process " PID_FMT " to cgroup %s: %m", + getpid_cached(), subgroup); + + return 0; +} + static int setup_pam( const ExecContext *context, + const CGroupContext *cgroup_context, ExecParameters *params, const char *user, uid_t uid, gid_t gid, char ***env, /* updated on success */ const int fds[], size_t n_fds, + bool needs_sandboxing, int exec_fd) { #if HAVE_PAM @@ -1290,6 +1332,15 @@ static int setup_pam( if (r == 0) { int ret = EXIT_PAM; + if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) { + /* Move PAM process into subgroup immediately if the main process hasn't been moved + * into the subgroup yet (when cgroup namespacing is enabled) and a subgroup is + * configured. */ + r = attach_to_subcgroup(context, cgroup_context, params, params->cgroup_path); + if (r < 0) + return r; + } + /* The child's job is to reset the PAM session on termination */ barrier_set_role(&barrier, BARRIER_CHILD); @@ -4911,28 +4962,48 @@ int exec_invoke( if (socket_fd >= 0) (void) fd_nonblock(socket_fd, false); + /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted + * from it. */ + needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); + /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields. * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */ if (params->cgroup_path) { - _cleanup_free_ char *p = NULL; + _cleanup_free_ char *subcgroup = NULL; - r = exec_params_get_cgroup_path(params, cgroup_context, &p); + r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup); if (r < 0) { *exit_status = EXIT_CGROUP; return log_error_errno(r, "Failed to acquire cgroup path: %m"); } + if (r > 0) { + /* If there is a subcgroup required, let's make sure to create it now. */ + r = cg_create(subcgroup); + if (r < 0) + return log_error_errno(r, "Failed to create subcgroup '%s': %m", subcgroup); + } - r = cg_attach(p, 0); + /* If we need a cgroup namespace, we cannot yet move the service to its configured subgroup, + * as unsharing the cgroup namespace later on makes the current cgroup the root of the + * namespace and we want the root of the namespace to be the main service cgroup and not the + * subgroup. One edge case is if we're a control process that needs to be spawned in a + * subgroup, in this case, we have no choice as moving into the main service cgroup might + * violate the no inner processes rule of cgroupv2. */ + const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context) && + !exec_params_needs_control_subcgroup(params) + ? params->cgroup_path : subcgroup; + + r = cg_attach(cgtarget, 0); if (r == -EUCLEAN) { *exit_status = EXIT_CGROUP; return log_error_errno(r, "Failed to attach process to cgroup '%s', " "because the cgroup or one of its parents or " - "siblings is in the threaded mode.", p); + "siblings is in the threaded mode.", cgtarget); } if (r < 0) { *exit_status = EXIT_CGROUP; - return log_error_errno(r, "Failed to attach to cgroup %s: %m", p); + return log_error_errno(r, "Failed to attach to cgroup %s: %m", cgtarget); } } @@ -5128,10 +5199,6 @@ int exec_invoke( } } - /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted - * from it. */ - needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); - if (params->cgroup_path) { /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not @@ -5147,7 +5214,7 @@ int exec_invoke( return log_error_errno(r, "Failed to adjust control group access: %m"); } - r = exec_params_get_cgroup_path(params, cgroup_context, &p); + r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p); if (r < 0) { *exit_status = EXIT_CGROUP; return log_error_errno(r, "Failed to acquire cgroup path: %m"); @@ -5328,7 +5395,8 @@ int exec_invoke( * wins here. (See above.) */ /* All fds passed in the fds array will be closed in the pam child process. */ - r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd); + r = setup_pam(context, cgroup_context, params, username, uid, gid, &accum_env, + params->fds, n_fds, needs_sandboxing, params->exec_fd); if (r < 0) { *exit_status = EXIT_PAM; return log_error_errno(r, "Failed to set up PAM session: %m"); @@ -5453,6 +5521,17 @@ int exec_invoke( if (r < 0) return r; + if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) { + /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which + * ensures the root of the cgroup namespace is the top level service cgroup and not the + * subcgroup. Adjust the prefix accordingly since we're in a cgroup namespace now. */ + r = attach_to_subcgroup(context, cgroup_context, params, /* prefix= */ NULL); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return r; + } + } + /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we * shall execute. */ diff --git a/src/core/execute.c b/src/core/execute.c index 31b5e13fe4..1fdb319042 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -405,20 +405,24 @@ bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType typ return true; } +int exec_params_needs_control_subcgroup(const ExecParameters *params) { + /* Keep this in sync with exec_params_get_cgroup_path(). */ + return FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE|EXEC_CONTROL_CGROUP|EXEC_IS_CONTROL); +} + int exec_params_get_cgroup_path( const ExecParameters *params, const CGroupContext *c, + const char *prefix, char **ret) { const char *subgroup = NULL; char *p; assert(params); + assert(c); assert(ret); - if (!params->cgroup_path) - return -EINVAL; - /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control * processes started after the main unit's process in the unit's main cgroup because it is now an inner one, @@ -428,6 +432,7 @@ int exec_params_get_cgroup_path( * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP * flag, which is only passed for the former statements, not for the latter. */ + /* Keep this in sync with exec_params_needs_control_subcgroup(). */ if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) { if (FLAGS_SET(params->flags, EXEC_IS_CONTROL)) subgroup = ".control"; @@ -436,9 +441,9 @@ int exec_params_get_cgroup_path( } if (subgroup) - p = path_join(params->cgroup_path, subgroup); + p = path_join(prefix, subgroup); else - p = strdup(params->cgroup_path); + p = strdup(strempty(prefix)); if (!p) return -ENOMEM; @@ -506,8 +511,16 @@ int exec_spawn( want to log from the parent, so we use the possibly inaccurate path here. */ log_command_line(unit, "About to execute", command->path, command->argv); - if (params->cgroup_path) { - r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path); + /* We cannot spawn the main service process into the subcgroup as it might need to unshare the cgroup + * namespace first if one is configured to make sure the root of the cgroup namespace is the service + * cgroup and not the subcgroup. However, when running control commands on a live service, the + * commands have to be spawned inside a subcgroup, otherwise we violate the no inner processes rule + * of cgroupv2 as the main service process might already have enabled controllers by writing to + * cgroup.subtree_control. */ + + const char *cgtarget; + if (exec_params_needs_control_subcgroup(params)) { + r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup_path); if (r < 0) return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m"); if (r > 0) { @@ -518,7 +531,10 @@ int exec_spawn( if (r < 0) return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path); } - } + + cgtarget = subcgroup_path; + } else + cgtarget = params->cgroup_path; /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the * child's memory.max, serialize all the state needed to start the unit, and pass it to the @@ -582,24 +598,24 @@ int exec_spawn( "--log-level", max_log_levels, "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))), environ, - subcgroup_path, + cgtarget, &pidref); /* Drop the ambient set again, so no processes other than sd-executore spawned from the manager inherit it. */ (void) capability_ambient_set_apply(0, /* also_inherit= */ false); - if (r == -EUCLEAN && subcgroup_path) + if (r == -EUCLEAN && cgtarget) return log_unit_error_errno(unit, r, "Failed to spawn process into cgroup '%s', because the cgroup " "or one of its parents or siblings is in the threaded mode.", - subcgroup_path); + cgtarget); if (r < 0) return log_unit_error_errno(unit, r, "Failed to spawn executor: %m"); /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the * process will be killed too). */ - if (r == 0 && subcgroup_path) - (void) cg_attach(subcgroup_path, pidref.pid); + if (r == 0 && cgtarget) + (void) cg_attach(cgtarget, pidref.pid); /* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */ log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)", diff --git a/src/core/execute.h b/src/core/execute.h index fba0862739..e8f4b3eea0 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -551,7 +551,8 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free); ExecRuntime* exec_runtime_destroy(ExecRuntime *rt); void exec_runtime_clear(ExecRuntime *rt); -int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret); +int exec_params_needs_control_subcgroup(const ExecParameters *params); +int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, const char *prefix, char **ret); void exec_params_shallow_clear(ExecParameters *p); void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix); void exec_params_deep_clear(ExecParameters *p); diff --git a/test/units/TEST-07-PID1.protect-control-groups.sh b/test/units/TEST-07-PID1.protect-control-groups.sh index e7752ffb4b..0220e1b692 100755 --- a/test/units/TEST-07-PID1.protect-control-groups.sh +++ b/test/units/TEST-07-PID1.protect-control-groups.sh @@ -104,4 +104,53 @@ testcase_basic_strict() { test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG" } +testcase_delegate_subgroup() { + # Make sure the service cgroup is the root of the cgroup namespace when we use DelegateSubgroup. + systemd-run \ + -p ProtectControlGroupsEx=private \ + -p PrivateMounts=yes \ + -p Delegate=yes \ + -p DelegateSubgroup=supervisor \ + --wait \ + --pipe \ + ls /sys/fs/cgroup/supervisor +} + +testcase_delegate_subgroup_control() { + # Make sure control processes are namespaced, are put in the .control cgroup, have the .control group as + # the root of their cgroup namespace and don't violate the no inner processes rule. To ensure we don't + # violate the no inner processes rule, we make sure to enable a cgroup controller so that + # cgroup.subtree_control for the main service cgroup is not empty which will make any attempt to spawn + # processes into that cgroup fail with EBUSY. + assert_eq "$( + systemd-run \ + --service-type=notify \ + -p ProtectControlGroupsEx=private \ + -p PrivateMounts=yes \ + -p Delegate=yes \ + -p DelegateSubgroup=supervisor \ + -p ExecStartPost='sh -c "cat /proc/self/cgroup; kill $MAINPID"' \ + --unit delegate-subgroup-control \ + --wait \ + --pipe \ + sh -c 'echo +pids >/sys/fs/cgroup/cgroup.subtree_control; systemd-notify --ready; sleep infinity' + )" "0::/" +} + +testcase_delegate_subgroup_pam() { + # Make sure any pam processes we spawn don't violate the no inner processes rule. + systemd-run \ + --service-type=oneshot \ + -p ProtectControlGroupsEx=private \ + -p PrivateMounts=yes \ + -p Delegate=yes \ + -p DelegateSubgroup=supervisor \ + -p User=testuser \ + -p PAMName=systemd-user \ + --unit delegate-subgroup-pam \ + --wait \ + --pipe \ + sh -c 'echo +pids >/sys/fs/cgroup/cgroup.subtree_control' +} + run_testcases