diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index d86a5d9f32..f484f28a70 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -3251,6 +3251,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b ProtectControlGroups = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s ProtectControlGroupsEx = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateNetwork = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateUsers = ...; @@ -3868,8 +3870,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { - - @@ -4572,6 +4572,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4858,6 +4860,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { unit file setting ManagedOOMMemoryPressureDurationSec= listed in systemd.resource-control5. Note the time unit is expressed in μs. + + ProtectControlGroupsEx implements the string-valued form of the + unit file setting ProtectControlGroups= listed in + systemd.exec5. + Unlike the boolean ProtectControlGroups, ProtectControlGroupsEx + is a string type. 
@@ -5415,6 +5423,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b ProtectControlGroups = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s ProtectControlGroupsEx = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateNetwork = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateUsers = ...; @@ -6044,8 +6054,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { - - @@ -6720,6 +6728,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -7416,6 +7426,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b ProtectControlGroups = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s ProtectControlGroupsEx = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateNetwork = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateUsers = ...; @@ -7971,8 +7983,6 @@ node /org/freedesktop/systemd1/unit/home_2emount { - - @@ -8559,6 +8569,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -9384,6 +9396,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b ProtectControlGroups = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s ProtectControlGroupsEx = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateNetwork = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateUsers = ...; @@ -9925,8 +9939,6 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { - - @@ -10499,6 +10511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -12262,7 +12276,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ImportCredentialEx, 
ExtraFileDescriptorNames, ManagedOOMMemoryPressureDurationUSec, - BindLogSockets, and + BindLogSockets, + ProtectControlGroupsEx, and PrivateUsersEx were added in version 257. @@ -12303,8 +12318,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ PrivateTmpEx, ImportCredentialEx, BindLogSockets, - PrivateUsersEx, and - ManagedOOMMemoryPressureDurationUSec were added in version 257. + PrivateUsersEx, + ManagedOOMMemoryPressureDurationUSec, and + ProtectControlGroupsEx were added in version 257. Mount Unit Objects @@ -12341,8 +12357,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ PrivateTmpEx, ImportCredentialEx, BindLogSockets, - PrivateUsersEx, and - ManagedOOMMemoryPressureDurationUSec were added in version 257. + PrivateUsersEx, + ManagedOOMMemoryPressureDurationUSec, and + ProtectControlGroupsEx were added in version 257. Swap Unit Objects @@ -12379,8 +12396,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ PrivateTmpEx, ImportCredentialEx, BindLogSockets, - PrivateUsersEx, and - ManagedOOMMemoryPressureDurationUSec were added in version 257. + PrivateUsersEx, + ManagedOOMMemoryPressureDurationUSec, and + ProtectControlGroupsEx were added in version 257. Slice Unit Objects diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 6764f89b02..f84204d247 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2117,14 +2117,22 @@ BindReadOnlyPaths=/var/lib/systemd ProtectControlGroups= - Takes a boolean argument. If true, the Linux Control Groups (cgroups7) hierarchies + Takes a boolean argument or the special values private or + strict. If true, the Linux Control Groups ( + cgroups7) hierarchies accessible through /sys/fs/cgroup/ will be made read-only to all processes of the - unit. Except for container managers no services should require write access to the control groups hierarchies; - it is hence recommended to turn this on for most services. 
For this setting the same restrictions regarding - mount propagation and privileges apply as for ReadOnlyPaths= and related calls, see - above. Defaults to off. If ProtectControlGroups= is set, MountAPIVFS=yes - is implied. + unit. If set to private, the unit will run in a cgroup namespace with a private + writable mount of /sys/fs/cgroup/. If set to strict, the unit + will run in a cgroup namespace with a private read-only mount of /sys/fs/cgroup/. + Defaults to off. If ProtectControlGroups= is set, MountAPIVFS=yes + is implied. Note private and strict are downgraded to false and + true respectively unless the system is using the unified control group hierarchy and the kernel supports + cgroup namespaces. + + Except for container managers no services should require write access to the control groups hierarchies; + it is hence recommended to set ProtectControlGroups= to true or strict + for most services. For this setting the same restrictions regarding mount propagation and privileges apply + as for ReadOnlyPaths= and related settings, see above. 
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 4ef57137ea..4f627a11e2 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -61,6 +61,7 @@ static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext, static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL); static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string); static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string); static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI); static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC); static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa); @@ -1179,6 +1180,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectControlGroups", "b", property_get_protect_control_groups, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectControlGroupsEx", "s", property_get_protect_control_groups_ex, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateUsers", "b", property_get_private_users, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, 
offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1939,6 +1941,27 @@ int bus_exec_context_set_transient_property( return 1; } + if (streq(name, "ProtectControlGroupsEx")) { + const char *s; + ProtectControlGroups t; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + t = protect_control_groups_from_string(s); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->protect_control_groups = t; + (void) unit_write_settingf(u, flags, name, "ProtectControlGroups=%s", + protect_control_groups_to_string(c->protect_control_groups)); + } + + return 1; + } + if (streq(name, "PrivateDevices")) return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error); diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index a0caf9bf89..eda0aee7c2 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -3098,7 +3098,7 @@ static int apply_mount_namespace( /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the * service will need to write to it in order to start the notifications. */ - if (context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { + if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { read_write_paths_cleanup = strv_copy(context->read_write_paths); if (!read_write_paths_cleanup) return -ENOMEM; @@ -3242,7 +3242,7 @@ static int apply_mount_namespace( * sandbox inside the mount namespace. */ .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir, - .protect_control_groups = needs_sandboxing ? context->protect_control_groups : PROTECT_CONTROL_GROUPS_NO, + .protect_control_groups = needs_sandboxing ? 
exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO, .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables, .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules, .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs, @@ -3886,7 +3886,7 @@ static bool exec_context_need_unprivileged_private_users( context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || - context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO || + exec_needs_cgroup_mount(context, params) || context->protect_clock || context->protect_hostname || !strv_isempty(context->read_write_paths) || @@ -4580,6 +4580,10 @@ int exec_invoke( } } + /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted + * from it. */ + needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); + if (params->cgroup_path) { /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not @@ -4623,6 +4627,18 @@ int exec_invoke( "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path); memory_pressure_path = mfree(memory_pressure_path); } + /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative + * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory + * pressure path environment variable or read-write mount to the unit. This is why we check if + * memory_pressure_path != NULL in the conditional below. 
*/ + if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) { + memory_pressure_path = mfree(memory_pressure_path); + r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + } } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) { memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */ if (!memory_pressure_path) { @@ -4709,10 +4725,6 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m"); } - /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted - * from it. */ - needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); - /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked * for it, and the kernel doesn't actually support ambient caps. 
*/ needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported(); @@ -4853,6 +4865,14 @@ int exec_invoke( log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring."); } + if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) { + r = unshare(CLONE_NEWCGROUP); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "Failed to set up cgroup namespacing: %m"); + } + } + if (needs_mount_namespace) { _cleanup_free_ char *error_path = NULL; diff --git a/src/core/execute.c b/src/core/execute.c index c1acc992f5..1fda693344 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -210,6 +210,50 @@ bool exec_needs_ipc_namespace(const ExecContext *context) { return context->private_ipc || context->ipc_namespace_path; } +static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) { + return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP); +} + +static bool needs_cgroup_namespace(ProtectControlGroups i) { + return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT); +} + +ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) { + assert(context); + + /* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually + * use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the + * setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded + * to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup. + * + * TODO: Remove fallback once cgroupv1 support is removed in v258. 
*/ + if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) { + if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE) + return PROTECT_CONTROL_GROUPS_NO; + if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT) + return PROTECT_CONTROL_GROUPS_YES; + } + return context->protect_control_groups; +} + +bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) { + assert(context); + + return needs_cgroup_namespace(exec_get_protect_control_groups(context, params)); +} + +bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) { + assert(context); + + return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO; +} + +bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) { + assert(context); + + return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT); +} + bool exec_needs_mount_namespace( const ExecContext *context, const ExecParameters *params, @@ -259,7 +303,7 @@ bool exec_needs_mount_namespace( context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || - context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO || + exec_needs_cgroup_mount(context, params) || context->protect_proc != PROTECT_PROC_DEFAULT || context->proc_subset != PROC_SUBSET_ALL || exec_needs_ipc_namespace(context)) diff --git a/src/core/execute.h b/src/core/execute.h index 5fbc196cb7..1f9b3f8f14 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -616,6 +616,11 @@ bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters bool exec_needs_network_namespace(const ExecContext *context); bool exec_needs_ipc_namespace(const ExecContext *context); +ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters 
*params); +bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params); +bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params); +bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params); + /* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters * instead of the unit object, so that it can be used in the sd-executor context (where the unit object is * not available). */ diff --git a/src/core/namespace.c b/src/core/namespace.c index 371d4a237a..a0d3dc0cbb 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -65,6 +65,7 @@ typedef enum MountMode { MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, + MOUNT_PRIVATE_CGROUP2FS, MOUNT_READ_ONLY, MOUNT_READ_WRITE, MOUNT_NOEXEC, @@ -204,6 +205,17 @@ static const MountEntry protect_control_groups_yes_table[] = { { "/sys/fs/cgroup", MOUNT_READ_ONLY, false }, }; +/* ProtectControlGroups=private table. Note mount_private_apivfs() always uses MS_NOSUID|MS_NOEXEC|MS_NODEV so + * flags is not set here. nsdelegate has been supported since kernel 4.13 so it is safe to use. 
*/ +static const MountEntry protect_control_groups_private_table[] = { + { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" }, +}; + +/* ProtectControlGroups=strict table */ +static const MountEntry protect_control_groups_strict_table[] = { + { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" }, +}; + /* ProtectSystem=yes table */ static const MountEntry protect_system_yes_table[] = { { "/usr", MOUNT_READ_ONLY, false }, @@ -252,6 +264,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = { [MOUNT_EMPTY_DIR] = "empty-dir", [MOUNT_PRIVATE_SYSFS] = "private-sysfs", [MOUNT_BIND_SYSFS] = "bind-sysfs", + [MOUNT_PRIVATE_CGROUP2FS] = "private-cgroup2fs", [MOUNT_PROCFS] = "procfs", [MOUNT_READ_ONLY] = "read-only", [MOUNT_READ_WRITE] = "read-write", @@ -743,6 +756,12 @@ static int append_protect_control_groups(MountList *ml, ProtectControlGroups pro case PROTECT_CONTROL_GROUPS_YES: return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect); + case PROTECT_CONTROL_GROUPS_PRIVATE: + return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect); + + case PROTECT_CONTROL_GROUPS_STRICT: + return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect); + default: assert_not_reached(); } @@ -1290,10 +1309,14 @@ static int mount_private_apivfs( r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); if (r == -EINVAL && opts) - /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is - * not supported by the kernel, and thus the per-instance hidepid= neither, which means we - * really don't want to use it, since 
it would affect our host's /proc mount. Hence let's - * gracefully fallback to a classic, unrestricted version. */ + /* If this failed with EINVAL then this likely means either: + * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the + * per-instance hidepid= neither, which means we really don't want to use it, since it + * would affect our host's /proc mount. + * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP + * is supported. + * + * Hence let's gracefully fallback to a classic, unrestricted version. */ r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL); if (ERRNO_IS_NEG_PRIVILEGE(r)) { /* When we do not have enough privileges to mount a new instance, fall back to use an @@ -1339,6 +1362,39 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope); } +static bool check_recursiveprot_supported(void) { + int r; + + /* memory_recursiveprot is only supported for kernels >= 5.7. Note mount_option_supported uses fsopen() + * and fsconfig() which are supported for kernels >= 5.2. So if mount_option_supported() returns an + * error, we can assume memory_recursiveprot is not supported. 
*/ + r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL); + if (r < 0) + log_debug_errno(r, "Failed to determine whether the 'memory_recursiveprot' mount option is supported, assuming not: %m"); + else if (r == 0) + log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option."); + + return r > 0; +} + +static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) { + _cleanup_free_ char *opts = NULL; + + assert(m); + assert(p); + + if (check_recursiveprot_supported()) { + opts = strdup(strempty(mount_entry_options(m))); + if (!opts) + return -ENOMEM; + + if (!strextend_with_separator(&opts, ",", "memory_recursiveprot")) + return -ENOMEM; + } + + return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope); +} + static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) { _cleanup_free_ char *opts = NULL; @@ -1784,6 +1840,9 @@ static int apply_one_mount( case MOUNT_PROCFS: return mount_procfs(m, p); + case MOUNT_PRIVATE_CGROUP2FS: + return mount_private_cgroup2fs(m, p); + case MOUNT_RUN: return mount_run(m); @@ -3212,6 +3271,8 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_S static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = { [PROTECT_CONTROL_GROUPS_NO] = "no", [PROTECT_CONTROL_GROUPS_YES] = "yes", + [PROTECT_CONTROL_GROUPS_PRIVATE] = "private", + [PROTECT_CONTROL_GROUPS_STRICT] = "strict", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES); diff --git a/src/core/namespace.h b/src/core/namespace.h index 17c0dd9e18..fa4f7c7cf6 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -72,6 +72,8 @@ typedef enum PrivateUsers { typedef enum ProtectControlGroups { PROTECT_CONTROL_GROUPS_NO, PROTECT_CONTROL_GROUPS_YES, + PROTECT_CONTROL_GROUPS_PRIVATE, + 
PROTECT_CONTROL_GROUPS_STRICT, _PROTECT_CONTROL_GROUPS_MAX, _PROTECT_CONTROL_GROUPS_INVALID = -EINVAL, } ProtectControlGroups; diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 59e4901878..2e17bae51a 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -1047,6 +1047,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con "ProtectHome", "PrivateTmpEx", "PrivateUsersEx", + "ProtectControlGroupsEx", "SELinuxContext", "RootImage", "RootVerity", diff --git a/test/units/TEST-07-PID1.protect-control-groups.sh b/test/units/TEST-07-PID1.protect-control-groups.sh new file mode 100755 index 0000000000..e7752ffb4b --- /dev/null +++ b/test/units/TEST-07-PID1.protect-control-groups.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +# shellcheck disable=SC2016 +set -eux +set -o pipefail + +# shellcheck source=test/units/test-control.sh +. "$(dirname "$0")"/test-control.sh +# shellcheck source=test/units/util.sh +. "$(dirname "$0")"/util.sh + +SLICE="system.slice" +UNIT_PREFIX="test-07-protect-control-groups" + +READ_ONLY_MOUNT_FLAG="ro" +READ_WRITE_MOUNT_FLAG="rw" + +at_exit() { + set +e + + systemctl stop "$UNIT_PREFIX*.service" + systemctl reset-failed +} + +trap at_exit EXIT + +ROOT_CGROUP_NS=$(readlink /proc/self/ns/cgroup) + +ENABLE_MEM_PRESSURE_TEST=true + +# We do not just test if the file exists, but try to read from it, since if +# CONFIG_PSI_DEFAULT_DISABLED is set in the kernel the file will exist and can +# be opened, but any read()s will fail with EOPNOTSUPP, which we want to +# detect. +if ! cat /proc/pressure/memory >/dev/null ; then + echo "Kernel too old, has no PSI, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2 + ENABLE_MEM_PRESSURE_TEST=false +fi + +if ! 
test -f "/sys/fs/cgroup/$(systemctl show TEST-07-PID1.service -P ControlGroup)/memory.pressure" ; then + echo "No memory accounting/PSI delegated via cgroup, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2 + ENABLE_MEM_PRESSURE_TEST=false +fi + +test_basic() { + local protect_control_groups_ex="$1" + local protect_control_groups="$2" + local in_cgroup_ns="$3" + local mount_flag="$4" + + if [[ $in_cgroup_ns == true ]]; then + local ns_cmp_op="!=" + local unit_cgroup="0::/" + local memory_pressure_watch="/sys/fs/cgroup/memory.pressure" + else + local ns_cmp_op="==" + local unit_cgroup="0::/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-1.service" + local memory_pressure_watch="/sys/fs/cgroup/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-2.service/memory.pressure" + fi + + # Compare cgroup namespace to root namespace + systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \ + bash -xec "test \"\$(readlink /proc/self/ns/cgroup)\" $ns_cmp_op \"$ROOT_CGROUP_NS\"" + # Verify unit cgroup + systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \ + --unit "$UNIT_PREFIX-$protect_control_groups_ex-1" \ + bash -xec "test \"\$(cat /proc/self/cgroup)\" == \"$unit_cgroup\"" + # Verify memory pressure watch points to correct file + if [[ $ENABLE_MEM_PRESSURE_TEST == true ]]; then + systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" -p MemoryPressureWatch=yes --slice "$SLICE" --wait \ + --unit "$UNIT_PREFIX-$protect_control_groups_ex-2" \ + bash -xec "test \"\$MEMORY_PRESSURE_WATCH\" == \"$memory_pressure_watch\"" + fi + # Verify /sys/fs/cgroup mount is read-only or read-write + systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \ + bash -xec "[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FSTYPE)\" == cgroup2 ]]; + [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FS-OPTIONS)\" =~ nsdelegate ]]; + [[ 
\"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ noexec ]]; + [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nosuid ]]; + [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nodev ]]; + [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ \"$mount_flag\" ]];" + + # Verify dbus properties + systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --remain-after-exit \ + --unit "$UNIT_PREFIX-$protect_control_groups_ex-3" true + assert_eq "$(systemctl show -P ProtectControlGroupsEx "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups_ex" + assert_eq "$(systemctl show -P ProtectControlGroups "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups" + systemctl stop "$UNIT_PREFIX-$protect_control_groups_ex-3" +} + +testcase_basic_no() { + test_basic "no" "no" false "$READ_WRITE_MOUNT_FLAG" +} + +testcase_basic_yes() { + test_basic "yes" "yes" false "$READ_ONLY_MOUNT_FLAG" +} + +testcase_basic_private() { + test_basic "private" "yes" true "$READ_WRITE_MOUNT_FLAG" +} + +testcase_basic_strict() { + test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG" +} + +run_testcases