diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index 1c5e7f2eb7..b95f68b4a5 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -147,6 +147,9 @@ node /org/freedesktop/systemd1 {
AttachProcessesToUnit(in s unit_name,
in s subcgroup,
in au pids);
+ RemoveSubgroupFromUnit(in s unit_name,
+ in s subcgroup,
+ in t flags);
AbandonScope(in s name);
GetJob(in u id,
out o job);
@@ -870,6 +873,8 @@ node /org/freedesktop/systemd1 {
+
+
@@ -1599,6 +1604,13 @@ node /org/freedesktop/systemd1 {
parameters. The possible values are configuration, state,
logs, cache, runtime,
fdstore, and all.
+
+ RemoveSubgroupFromUnit() removes a subcgroup belonging to a unit's
+ cgroup. Takes three arguments: the unit name (if empty defaults to the caller's unit), a cgroup path
+ (which must start start with a slash /), which is taken relative to the unit's
+ cgroup, and a flags argument (which must be zero for now). This is primarily useful for unprivileged
+ service managers to ask the system service manager for removal of subcgroups it manages, in case one
+ was delegated to other UIDs.
@@ -2704,6 +2716,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
+ RemoveSubgroup(in s subcgroup,
+ in t flags);
properties:
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s Type = '...';
@@ -3398,6 +3412,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -4006,6 +4022,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -4901,6 +4919,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
+ RemoveSubgroup(in s subcgroup,
+ in t flags);
properties:
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BindIPv6Only = '...';
@@ -5592,6 +5612,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
@@ -6206,6 +6228,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
@@ -7001,6 +7025,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
+ RemoveSubgroup(in s subcgroup,
+ in t flags);
properties:
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s Where = '...';
@@ -7601,6 +7627,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
@@ -8141,6 +8169,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
@@ -8991,6 +9021,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
+ RemoveSubgroup(in s subcgroup,
+ in t flags);
properties:
readonly s What = '...';
readonly i Priority = ...;
@@ -9577,6 +9609,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
@@ -10103,6 +10137,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
@@ -10805,6 +10841,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
+ RemoveSubgroup(in s subcgroup,
+ in t flags);
properties:
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s Slice = '...';
@@ -11004,6 +11042,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
+
+
@@ -11196,6 +11236,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
+
+
@@ -11411,6 +11453,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
+ RemoveSubgroup(in s subcgroup,
+ in t flags);
signals:
RequestStop();
properties:
@@ -11636,6 +11680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
+
+
@@ -11850,6 +11896,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
+
+
@@ -12254,6 +12302,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
ShutdownStartTimestamp,
ShutdownStartTimestampMonotonic, and
SoftRebootsCount were added in version 256.
+ RemoveSubgroupFromUnit() was added in version 258.
Unit Objects
@@ -12320,7 +12369,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
ProtectControlGroupsEx,
PrivateUsersEx, and
PrivatePIDs were added in version 257.
- ProtectHostnameEx was added in version 258.
+ ProtectHostnameEx and RemoveSubGroup() were added in version 258.
Socket Unit Objects
@@ -12364,7 +12413,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
ManagedOOMMemoryPressureDurationUSec,
ProtectControlGroupsEx, and
PrivatePIDs were added in version 257.
- ProtectHostnameEx was added in version 258.
+ ProtectHostnameEx and RemoveSubgroup() were added in version 258.
Mount Unit Objects
@@ -12405,7 +12454,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
ManagedOOMMemoryPressureDurationUSec,
ProtectControlGroupsEx, and
PrivatePIDs were added in version 257.
- ProtectHostnameEx was added in version 258.
+ ProtectHostnameEx and RemoveSubgroup() was added in version 258.
Swap Unit Objects
@@ -12446,7 +12495,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
ManagedOOMMemoryPressureDurationUSec,
ProtectControlGroupsEx, and
PrivatePIDs were added in version 257.
- ProtectHostnameEx was added in version 258.
+ ProtectHostnameEx and RemoveSubgroup() were added in version 258.
Slice Unit Objects
@@ -12472,6 +12521,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
EffectiveTasksMax, and
MemoryZSwapWriteback were added in version 256.
ManagedOOMMemoryPressureDurationUSec was added in version 257.
+ RemoveSubgroup() was added in version 258.
Scope Unit Objects
@@ -12498,6 +12548,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
EffectiveTasksMax, and
MemoryZSwapWriteback were added in version 256.
ManagedOOMMemoryPressureDurationUSec was added in version 257.
+ RemoveSubgroup() was added in version 258.
Job Objects
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 6933aae54d..e959b307c6 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -3126,6 +3126,49 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
return ret;
}
+int unit_remove_subcgroup(Unit *u, const char *suffix_path) {
+ int r;
+
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EINVAL;
+
+ if (!unit_cgroup_delegate(u))
+ return -ENOMEDIUM;
+
+ r = unit_pick_cgroup_path(u);
+ if (r < 0)
+ return r;
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -EOWNERDEAD;
+
+ _cleanup_free_ char *j = NULL;
+ bool delete_root;
+ const char *d;
+ if (empty_or_root(suffix_path)) {
+ d = empty_to_root(crt->cgroup_path);
+ delete_root = false; /* Don't attempt to delete the main cgroup of this unit */
+ } else {
+ j = path_join(crt->cgroup_path, suffix_path);
+ if (!j)
+ return -ENOMEM;
+
+ d = j;
+ delete_root = true;
+ }
+
+ log_unit_debug(u, "Removing subcgroup '%s'...", d);
+
+ r = cg_trim_everywhere(u->manager->cgroup_supported, d, delete_root);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to fully %s cgroup '%s': %m", delete_root ? "remove" : "trim", d);
+
+ return 0;
+}
+
static bool unit_has_mask_realized(
Unit *u,
CGroupMask target_mask,
@@ -3567,6 +3610,51 @@ static bool unit_maybe_release_cgroup(Unit *u) {
return false;
}
+static int unit_prune_cgroup_via_bus(Unit *u) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(u);
+ assert(u->manager);
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ return -EINVAL;
+
+ if (!u->manager->system_bus)
+ return -EIO;
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -EOWNERDEAD;
+
+ /* Determine this unit's cgroup path relative to our cgroup root */
+ const char *pp = path_startswith(crt->cgroup_path, u->manager->cgroup_root);
+ if (!pp)
+ return -EINVAL;
+
+ _cleanup_free_ char *absolute = NULL;
+ if (!path_is_absolute(pp)) { /* RemoveSubgroupFromUnit() wants an absolute path */
+ absolute = strjoin("/", pp);
+ if (!absolute)
+ return -ENOMEM;
+
+ pp = absolute;
+ }
+
+ r = bus_call_method(u->manager->system_bus,
+ bus_systemd_mgr,
+ "RemoveSubgroupFromUnit",
+ &error, NULL,
+ "sst",
+ NULL /* empty unit name means client's unit, i.e. us */,
+ pp,
+ (uint64_t) 0);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to trim cgroup via the bus: %s", bus_error_message(&error, r));
+
+ return 0;
+}
+
void unit_prune_cgroup(Unit *u) {
bool is_root_slice;
int r;
@@ -3598,13 +3686,21 @@ void unit_prune_cgroup(Unit *u) {
is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
r = cg_trim_everywhere(u->manager->cgroup_supported, crt->cgroup_path, !is_root_slice);
- if (r < 0)
- /* One reason we could have failed here is, that the cgroup still contains a process.
- * However, if the cgroup becomes removable at a later time, it might be removed when
- * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
- * that the cgroup is still realized the next time it is started. Do not return early
- * on error, continue cleanup. */
- log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
+ if (r < 0) {
+ int k = unit_prune_cgroup_via_bus(u);
+
+ if (k >= 0)
+ log_unit_debug_errno(u, r, "Failed to destroy cgroup %s on our own (%m), but worked when talking to PID 1.", empty_to_root(crt->cgroup_path));
+ else {
+ /* One reason we could have failed here is, that the cgroup still contains a process.
+ * However, if the cgroup becomes removable at a later time, it might be removed when
+ * the containing slice is stopped. So even if we failed now, this unit shouldn't
+ * assume that the cgroup is still realized the next time it is started. Do not
+ * return early on error, continue cleanup. */
+ log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
+ }
+ }
if (is_root_slice)
return;
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index 1ed74831c8..807e56c621 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -456,6 +456,7 @@ int unit_check_oomd_kill(Unit *u);
int unit_check_oom(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
+int unit_remove_subcgroup(Unit *u, const char *suffix_path);
int manager_setup_cgroup(Manager *m);
void manager_shutdown_cgroup(Manager *m, bool delete);
diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c
index ed1f124182..99e3ea12ac 100644
--- a/src/core/dbus-manager.c
+++ b/src/core/dbus-manager.c
@@ -960,6 +960,12 @@ static int method_attach_processes_to_unit(sd_bus_message *message, void *userda
return method_generic_unit_operation(message, userdata, error, bus_unit_method_attach_processes, GENERIC_UNIT_VALIDATE_LOADED);
}
+static int method_remove_subgroup_from_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Don't allow removal of subgroups from units that aren't loaded. But allow loading the unit, since
+ * this is clean-up work, that is OK to do when the unit is stopped already. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_remove_subgroup, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
static int transient_unit_from_message(
Manager *m,
sd_bus_message *message,
@@ -3246,6 +3252,11 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_NO_RESULT,
method_attach_processes_to_unit,
SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("RemoveSubgroupFromUnit",
+ SD_BUS_ARGS("s", unit_name, "s", subcgroup, "t", flags),
+ SD_BUS_NO_RESULT,
+ method_remove_subgroup_from_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD_WITH_ARGS("AbandonScope",
SD_BUS_ARGS("s", name),
SD_BUS_NO_RESULT,
diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c
index 6dca2d93bd..68cfed9444 100644
--- a/src/core/dbus-unit.c
+++ b/src/core/dbus-unit.c
@@ -1525,7 +1525,7 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
return r;
for (;;) {
_cleanup_(pidref_freep) PidRef *pidref = NULL;
- uid_t process_uid, sender_uid;
+ uid_t sender_uid;
uint32_t upid;
r = sd_bus_message_read(message, "u", &upid);
@@ -1560,16 +1560,21 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
if (r < 0)
return r;
- /* Let's validate security: if the sender is root, then all is OK. If the sender is any other unit,
- * then the process' UID and the target unit's UID have to match the sender's UID */
+ /* Let's validate security: if the sender is root or the owner of the service manager, then
+ * all is OK. If the sender is any other user, then the process in question must be owned by
+ * both the sender and the target unit's UID. Note that ownership here means either direct
+ * ownership, or indirect via a userns that is owned by the right UID. */
if (sender_uid != 0 && sender_uid != getuid()) {
- r = pidref_get_uid(pidref, &process_uid);
+ r = process_is_owned_by_uid(pidref, sender_uid);
if (r < 0)
- return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m");
-
- if (process_uid != sender_uid)
+ return sd_bus_error_set_errnof(error, r, "Failed to check if process " PID_FMT " is owned by client's UID: %m", pidref->pid);
+ if (r == 0)
return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pidref->pid);
- if (process_uid != u->ref_uid)
+
+ r = process_is_owned_by_uid(pidref, u->ref_uid);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to check if process " PID_FMT " is owned by target unit's UID: %m", pidref->pid);
+ if (r == 0)
return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pidref->pid);
}
@@ -1589,6 +1594,59 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
return sd_bus_reply_method_return(message, NULL);
}
+int bus_unit_method_remove_subgroup(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* This removes a subcgroup of the unit, regardless which user owns the subcgroup. This is useful
+ * when cgroup delegation is enabled for a unit, and the unit subdelegates the cgroup further */
+
+ r = mac_selinux_unit_access_check(u, message, "stop", error);
+ if (r < 0)
+ return r;
+
+ const char *path;
+ uint64_t flags;
+ r = sd_bus_message_read(message, "st", &path, &flags);
+ if (r < 0)
+ return r;
+
+ /* No flags defined for now. */
+ if (flags != 0)
+ return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, "Invalid 'flags' parameter '%" PRIu64 "'", flags);
+
+ if (!unit_cgroup_delegate(u))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Subcgroup removal not available on non-delegated units.");
+
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path);
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path);
+
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds);
+ if (r < 0)
+ return r;
+
+ uid_t sender_uid;
+ r = sd_bus_creds_get_euid(creds, &sender_uid);
+ if (r < 0)
+ return r;
+
+ /* Allow this only if the client is privileged, is us, or is the user of the unit itself. */
+ if (sender_uid != 0 && sender_uid != getuid() && sender_uid != u->ref_uid)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Client is not permitted to alter cgroup.");
+
+ r = unit_remove_subcgroup(u, path);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to remove subgroup %s: %m", path);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
const sd_bus_vtable bus_unit_cgroup_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0),
@@ -1628,6 +1686,12 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = {
bus_unit_method_attach_processes,
SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("RemoveSubgroup",
+ SD_BUS_ARGS("s", subcgroup, "t", flags),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_remove_subgroup,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
SD_BUS_VTABLE_END
};
diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h
index 6b7828e4ba..e9dd1ec317 100644
--- a/src/core/dbus-unit.h
+++ b/src/core/dbus-unit.h
@@ -22,6 +22,7 @@ int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags fla
int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_remove_subgroup(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error);
diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf
index 52034e07e7..2b978a1e77 100644
--- a/src/core/org.freedesktop.systemd1.conf
+++ b/src/core/org.freedesktop.systemd1.conf
@@ -274,6 +274,10 @@
send_interface="org.freedesktop.systemd1.Manager"
send_member="AttachProcessesToUnit"/>
+
+
@@ -432,6 +436,10 @@
send_interface="org.freedesktop.systemd1.Service"
send_member="AttachProcesses"/>
+
+
@@ -446,6 +454,10 @@
send_interface="org.freedesktop.systemd1.Scope"
send_member="AttachProcesses"/>
+
+
diff --git a/test/units/TEST-19-CGROUP.delegate.sh b/test/units/TEST-19-CGROUP.delegate.sh
index 7f3a7059e2..986b9ac2f6 100755
--- a/test/units/TEST-19-CGROUP.delegate.sh
+++ b/test/units/TEST-19-CGROUP.delegate.sh
@@ -1,5 +1,6 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2235
set -eux
set -o pipefail
@@ -87,6 +88,39 @@ testcase_scope_unpriv_delegation() {
-w /sys/fs/cgroup/workload.slice/test-workload0.scope/cgroup.subtree_control
}
+testcase_user_unpriv_delegation() {
+ # Check that delegation works for unpriv users, and that we can insert a
+ # subcgroup owned by a different user (which can happen in case unpriv
+ # userns where a UID range was delegated), which is still cleaned up
+ # correctly when it goes down.
+
+ run0 -u testuser systemd-run --user \
+ --property="Delegate=yes" \
+ --unit=test-chown-subcgroup \
+ --service-type=exec \
+ sleep infinity
+
+ TESTUID=$(id -u testuser)
+ CGROUP="/sys/fs/cgroup/user.slice/user-$TESTUID.slice/user@$TESTUID.service/app.slice/test-chown-subcgroup.service"
+ test -d "$CGROUP"
+
+ # Create a subcgroup, and make it owned by some unrelated user
+ SUBCGROUP="$CGROUP/subcgroup"
+ mkdir "$SUBCGROUP"
+ chown 1:1 "$SUBCGROUP"
+
+ # Make sure the subcgroup is not empty (empty dirs owned by other users can
+ # be removed if one owns the dir they are contained in, after all)
+ mkdir "$SUBCGROUP"/filler
+
+ run0 -u testuser systemctl stop --user test-chown-subcgroup.service
+
+ # Verify that the subcgroup got correctly removed
+ (! test -e "$CGROUP")
+
+ systemctl stop user@testuser.service
+}
+
testcase_subgroup() {
# Verify that DelegateSubgroup= affects ownership correctly
unit="test-subgroup-$RANDOM.service"