cgroup: Add support for ProtectControlGroups= private and strict

This commit adds two settings private and strict to
the ProtectControlGroups= property. Private will unshare the cgroup
namespace and mount a read-write private cgroup2 filesystem at /sys/fs/cgroup.
Strict does the same except the mount is read-only. Since the unit is
running in a cgroup namespace, the new root of /sys/fs/cgroup is the unit's
own cgroup.

We also add a new dbus property ProtectControlGroupsEx which accepts strings
instead of boolean. This will allow users to use private/strict via dbus
and systemd-run in addition to service files.

Note private and strict fall back to no and yes respectively if the kernel
doesn't support cgroup2 or system is not using unified hierarchy.

Fixes: #34634
This commit is contained in:
Ryan Wilson
2024-10-18 11:41:09 -07:00
parent 5fe2923828
commit cd58b5a135
10 changed files with 323 additions and 34 deletions

View File

@@ -3251,6 +3251,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectControlGroupsEx = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
@@ -3868,8 +3870,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property ProtectKernelLogs is not documented!-->
<!--property ProtectControlGroups is not documented!-->
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
@@ -4572,6 +4572,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -4858,6 +4860,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
unit file setting <varname>ManagedOOMMemoryPressureDurationSec=</varname> listed in
<citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Note the time unit is expressed in <literal>μs</literal>.</para>
<para><varname>ProtectControlGroupsEx</varname> implement the destination parameter of the
unit file setting <varname>ProtectControlGroups=</varname> listed in
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Unlike boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname>
is a string type.</para>
</refsect2>
</refsect1>
@@ -5415,6 +5423,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectControlGroupsEx = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
@@ -6044,8 +6054,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property ProtectKernelLogs is not documented!-->
<!--property ProtectControlGroups is not documented!-->
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
@@ -6720,6 +6728,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -7416,6 +7426,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectControlGroupsEx = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
@@ -7971,8 +7983,6 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property ProtectKernelLogs is not documented!-->
<!--property ProtectControlGroups is not documented!-->
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
@@ -8559,6 +8569,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -9384,6 +9396,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectControlGroupsEx = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
@@ -9925,8 +9939,6 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property ProtectKernelLogs is not documented!-->
<!--property ProtectControlGroups is not documented!-->
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
@@ -10499,6 +10511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -12262,7 +12276,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ImportCredentialEx</varname>,
<varname>ExtraFileDescriptorNames</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>BindLogSockets</varname>, and
<varname>BindLogSockets</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivateUsersEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>
@@ -12303,8 +12318,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<para><varname>PrivateTmpEx</varname>,
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>, and
<varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
<varname>PrivateUsersEx</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Mount Unit Objects</title>
@@ -12341,8 +12357,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<para><varname>PrivateTmpEx</varname>,
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>, and
<varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
<varname>PrivateUsersEx</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Swap Unit Objects</title>
@@ -12379,8 +12396,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<para><varname>PrivateTmpEx</varname>,
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>, and
<varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
<varname>PrivateUsersEx</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Slice Unit Objects</title>

View File

@@ -2117,14 +2117,22 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
<varlistentry>
<term><varname>ProtectControlGroups=</varname></term>
<listitem><para>Takes a boolean argument. If true, the Linux Control Groups (<citerefentry
project='man-pages'><refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
<listitem><para>Takes a boolean argument or the special values <literal>private</literal> or
<literal>strict</literal>. If true, the Linux Control Groups (<citerefentry project='man-pages'>
<refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
accessible through <filename>/sys/fs/cgroup/</filename> will be made read-only to all processes of the
unit. Except for container managers no services should require write access to the control groups hierarchies;
it is hence recommended to turn this on for most services. For this setting the same restrictions regarding
mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
above. Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
is implied.</para>
unit. If set to <literal>private</literal>, the unit will run in a cgroup namespace with a private
writable mount of <filename>/sys/fs/cgroup/</filename>. If set to <literal>strict</literal>, the unit
will run in a cgroup namespace with a private read-only mount of <filename>/sys/fs/cgroup/</filename>.
Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
is implied. Note <literal>private</literal> and <literal>strict</literal> are downgraded to false and
true respectively unless the system is using the unified control group hierarchy and the kernel supports
cgroup namespaces.</para>
<para>Except for container managers no services should require write access to the control groups hierarchies;
it is hence recommended to set <varname>ProtectControlGroups=</varname> to true or <literal>strict</literal>
for most services. For this setting the same restrictions regarding mount propagation and privileges apply
as for <varname>ReadOnlyPaths=</varname> and related settings, see above.</para>
<xi:include href="system-only.xml" xpointer="singular"/>

View File

@@ -61,6 +61,7 @@ static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext,
static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
@@ -1179,6 +1180,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectControlGroups", "b", property_get_protect_control_groups, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectControlGroupsEx", "s", property_get_protect_control_groups_ex, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsers", "b", property_get_private_users, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1939,6 +1941,27 @@ int bus_exec_context_set_transient_property(
return 1;
}
if (streq(name, "ProtectControlGroupsEx")) {
const char *s;
ProtectControlGroups t;
r = sd_bus_message_read(message, "s", &s);
if (r < 0)
return r;
t = protect_control_groups_from_string(s);
if (t < 0)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s);
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
c->protect_control_groups = t;
(void) unit_write_settingf(u, flags, name, "ProtectControlGroups=%s",
protect_control_groups_to_string(c->protect_control_groups));
}
return 1;
}
if (streq(name, "PrivateDevices"))
return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);

View File

@@ -3098,7 +3098,7 @@ static int apply_mount_namespace(
/* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
* service will need to write to it in order to start the notifications. */
if (context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
read_write_paths_cleanup = strv_copy(context->read_write_paths);
if (!read_write_paths_cleanup)
return -ENOMEM;
@@ -3242,7 +3242,7 @@ static int apply_mount_namespace(
* sandbox inside the mount namespace. */
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
.protect_control_groups = needs_sandboxing ? context->protect_control_groups : PROTECT_CONTROL_GROUPS_NO,
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@@ -3886,7 +3886,7 @@ static bool exec_context_need_unprivileged_private_users(
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
exec_needs_cgroup_mount(context, params) ||
context->protect_clock ||
context->protect_hostname ||
!strv_isempty(context->read_write_paths) ||
@@ -4580,6 +4580,10 @@ int exec_invoke(
}
}
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
* from it. */
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
if (params->cgroup_path) {
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
@@ -4623,6 +4627,18 @@ int exec_invoke(
"Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
memory_pressure_path = mfree(memory_pressure_path);
}
/* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
* pressure path environment variable or read-write mount to the unit. This is why we check if
* memory_pressure_path != NULL in the conditional below. */
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
memory_pressure_path = mfree(memory_pressure_path);
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
if (r < 0) {
*exit_status = EXIT_MEMORY;
return log_oom();
}
}
} else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) {
memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
if (!memory_pressure_path) {
@@ -4709,10 +4725,6 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
}
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
* from it. */
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
/* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
* for it, and the kernel doesn't actually support ambient caps. */
needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
@@ -4853,6 +4865,14 @@ int exec_invoke(
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
}
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
r = unshare(CLONE_NEWCGROUP);
if (r < 0) {
*exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, r, "Failed to set up cgroup namespacing: %m");
}
}
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;

View File

@@ -210,6 +210,50 @@ bool exec_needs_ipc_namespace(const ExecContext *context) {
return context->private_ipc || context->ipc_namespace_path;
}
static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP);
}
static bool needs_cgroup_namespace(ProtectControlGroups i) {
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
}
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
assert(context);
/* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
* use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the
* setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded
* to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup.
*
* TODO: Remove fallback once cgroupv1 support is removed in v258. */
if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) {
if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE)
return PROTECT_CONTROL_GROUPS_NO;
if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT)
return PROTECT_CONTROL_GROUPS_YES;
}
return context->protect_control_groups;
}
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
assert(context);
return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
}
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
assert(context);
return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
}
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
assert(context);
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
}
bool exec_needs_mount_namespace(
const ExecContext *context,
const ExecParameters *params,
@@ -259,7 +303,7 @@ bool exec_needs_mount_namespace(
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
exec_needs_cgroup_mount(context, params) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
exec_needs_ipc_namespace(context))

View File

@@ -616,6 +616,11 @@ bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters
bool exec_needs_network_namespace(const ExecContext *context);
bool exec_needs_ipc_namespace(const ExecContext *context);
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
* instead of the unit object, so that it can be used in the sd-executor context (where the unit object is
* not available). */

View File

@@ -65,6 +65,7 @@ typedef enum MountMode {
MOUNT_PRIVATE_SYSFS,
MOUNT_BIND_SYSFS,
MOUNT_PROCFS,
MOUNT_PRIVATE_CGROUP2FS,
MOUNT_READ_ONLY,
MOUNT_READ_WRITE,
MOUNT_NOEXEC,
@@ -204,6 +205,17 @@ static const MountEntry protect_control_groups_yes_table[] = {
{ "/sys/fs/cgroup", MOUNT_READ_ONLY, false },
};
/* ProtectControlGroups=private table. Note mount_private_apivfs() always use MS_NOSUID|MS_NOEXEC|MS_NODEV so
* flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */
static const MountEntry protect_control_groups_private_table[] = {
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
};
/* ProtectControlGroups=strict table */
static const MountEntry protect_control_groups_strict_table[] = {
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
};
/* ProtectSystem=yes table */
static const MountEntry protect_system_yes_table[] = {
{ "/usr", MOUNT_READ_ONLY, false },
@@ -252,6 +264,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
[MOUNT_EMPTY_DIR] = "empty-dir",
[MOUNT_PRIVATE_SYSFS] = "private-sysfs",
[MOUNT_BIND_SYSFS] = "bind-sysfs",
[MOUNT_PRIVATE_CGROUP2FS] = "private-cgroup2fs",
[MOUNT_PROCFS] = "procfs",
[MOUNT_READ_ONLY] = "read-only",
[MOUNT_READ_WRITE] = "read-write",
@@ -743,6 +756,12 @@ static int append_protect_control_groups(MountList *ml, ProtectControlGroups pro
case PROTECT_CONTROL_GROUPS_YES:
return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect);
case PROTECT_CONTROL_GROUPS_PRIVATE:
return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect);
case PROTECT_CONTROL_GROUPS_STRICT:
return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect);
default:
assert_not_reached();
}
@@ -1290,10 +1309,14 @@ static int mount_private_apivfs(
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
if (r == -EINVAL && opts)
/* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is
* not supported by the kernel, and thus the per-instance hidepid= neither, which means we
* really don't want to use it, since it would affect our host's /proc mount. Hence let's
* gracefully fallback to a classic, unrestricted version. */
/* If this failed with EINVAL then this likely means either:
* 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
* per-instance hidepid= neither, which means we really don't want to use it, since it
* would affect our host's /proc mount.
* 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
* is supported.
*
* Hence let's gracefully fallback to a classic, unrestricted version. */
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
if (ERRNO_IS_NEG_PRIVILEGE(r)) {
/* When we do not have enough privileges to mount a new instance, fall back to use an
@@ -1339,6 +1362,39 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
}
static bool check_recursiveprot_supported(void) {
int r;
/* memory_recursiveprot is only supported for kernels >= 5.7. Note mount_option_supported uses fsopen()
* and fsconfig() which are supported for kernels >= 5.2. So if mount_option_supported() returns an
* error, we can assume memory_recursiveprot is not supported. */
r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
if (r < 0)
log_debug_errno(r, "Failed to determine whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
else if (r == 0)
log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
return r > 0;
}
static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
_cleanup_free_ char *opts = NULL;
assert(m);
assert(p);
if (check_recursiveprot_supported()) {
opts = strdup(strempty(mount_entry_options(m)));
if (!opts)
return -ENOMEM;
if (!strextend_with_separator(&opts, ",", "memory_recursiveprot"))
return -ENOMEM;
}
return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope);
}
static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
_cleanup_free_ char *opts = NULL;
@@ -1784,6 +1840,9 @@ static int apply_one_mount(
case MOUNT_PROCFS:
return mount_procfs(m, p);
case MOUNT_PRIVATE_CGROUP2FS:
return mount_private_cgroup2fs(m, p);
case MOUNT_RUN:
return mount_run(m);
@@ -3212,6 +3271,8 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_S
static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = {
[PROTECT_CONTROL_GROUPS_NO] = "no",
[PROTECT_CONTROL_GROUPS_YES] = "yes",
[PROTECT_CONTROL_GROUPS_PRIVATE] = "private",
[PROTECT_CONTROL_GROUPS_STRICT] = "strict",
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES);

View File

@@ -72,6 +72,8 @@ typedef enum PrivateUsers {
typedef enum ProtectControlGroups {
PROTECT_CONTROL_GROUPS_NO,
PROTECT_CONTROL_GROUPS_YES,
PROTECT_CONTROL_GROUPS_PRIVATE,
PROTECT_CONTROL_GROUPS_STRICT,
_PROTECT_CONTROL_GROUPS_MAX,
_PROTECT_CONTROL_GROUPS_INVALID = -EINVAL,
} ProtectControlGroups;

View File

@@ -1047,6 +1047,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
"ProtectHome",
"PrivateTmpEx",
"PrivateUsersEx",
"ProtectControlGroupsEx",
"SELinuxContext",
"RootImage",
"RootVerity",

View File

@@ -0,0 +1,107 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
# shellcheck disable=SC2016
set -eux
set -o pipefail
# shellcheck source=test/units/test-control.sh
. "$(dirname "$0")"/test-control.sh
# shellcheck source=test/units/util.sh
. "$(dirname "$0")"/util.sh
SLICE="system.slice"
UNIT_PREFIX="test-07-protect-control-groups"
READ_ONLY_MOUNT_FLAG="ro"
READ_WRITE_MOUNT_FLAG="rw"
at_exit() {
set +e
systemctl stop "$UNIT_PREFIX*.service"
systemctl reset-failed
}
trap at_exit EXIT
ROOT_CGROUP_NS=$(readlink /proc/self/ns/cgroup)
ENABLE_MEM_PRESSURE_TEST=true
# We do not just test if the file exists, but try to read from it, since if
# CONFIG_PSI_DEFAULT_DISABLED is set in the kernel the file will exist and can
# be opened, but any read()s will fail with EOPNOTSUPP, which we want to
# detect.
if ! cat /proc/pressure/memory >/dev/null ; then
echo "Kernel too old, has no PSI, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2
ENABLE_MEM_PRESSURE_TEST=false
fi
if ! test -f "/sys/fs/cgroup/$(systemctl show TEST-07-PID1.service -P ControlGroup)/memory.pressure" ; then
echo "No memory accounting/PSI delegated via cgroup, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2
ENABLE_MEM_PRESSURE_TEST=false
fi
test_basic() {
local protect_control_groups_ex="$1"
local protect_control_groups="$2"
local in_cgroup_ns="$3"
local mount_flag="$4"
if [[ $in_cgroup_ns == true ]]; then
local ns_cmp_op="!="
local unit_cgroup="0::/"
local memory_pressure_watch="/sys/fs/cgroup/memory.pressure"
else
local ns_cmp_op="=="
local unit_cgroup="0::/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-1.service"
local memory_pressure_watch="/sys/fs/cgroup/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-2.service/memory.pressure"
fi
# Compare cgroup namespace to root namespace
systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
bash -xec "test \"\$(readlink /proc/self/ns/cgroup)\" $ns_cmp_op \"$ROOT_CGROUP_NS\""
# Verify unit cgroup
systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
--unit "$UNIT_PREFIX-$protect_control_groups_ex-1" \
bash -xec "test \"\$(cat /proc/self/cgroup)\" == \"$unit_cgroup\""
# Verify memory pressure watch points to correct file
if [[ $ENABLE_MEM_PRESSURE_TEST == true ]]; then
systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" -p MemoryPressureWatch=yes --slice "$SLICE" --wait \
--unit "$UNIT_PREFIX-$protect_control_groups_ex-2" \
bash -xec "test \"\$MEMORY_PRESSURE_WATCH\" == \"$memory_pressure_watch\""
fi
# Verify /sys/fs/cgroup mount is read-only or read-write
systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
bash -xec "[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FSTYPE)\" == cgroup2 ]];
[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FS-OPTIONS)\" =~ nsdelegate ]];
[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ noexec ]];
[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nosuid ]];
[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nodev ]];
[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ \"$mount_flag\" ]];"
# Verify dbus properties
systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --remain-after-exit \
--unit "$UNIT_PREFIX-$protect_control_groups_ex-3" true
assert_eq "$(systemctl show -P ProtectControlGroupsEx "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups_ex"
assert_eq "$(systemctl show -P ProtectControlGroups "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups"
systemctl stop "$UNIT_PREFIX-$protect_control_groups_ex-3"
}
testcase_basic_no() {
test_basic "no" "no" false "$READ_WRITE_MOUNT_FLAG"
}
testcase_basic_yes() {
test_basic "yes" "yes" false "$READ_ONLY_MOUNT_FLAG"
}
testcase_basic_private() {
test_basic "private" "yes" true "$READ_WRITE_MOUNT_FLAG"
}
testcase_basic_strict() {
test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
}
run_testcases