core: delegate mountns implicitly when any of pidns/cgns/netns is in use, clean up private cgroupfs mount (#36892)

Fixes #36952
This commit is contained in:
Yu Watanabe
2025-04-03 00:15:33 +09:00
committed by GitHub
8 changed files with 52 additions and 70 deletions

View File

@@ -2411,6 +2411,11 @@ RestrictNamespaces=~cgroup net</programlisting>
done with the namespace specific unit setting such as <varname>PrivateNetwork=</varname> or
<varname>PrivateMounts=</varname>.</para>
<para>Note that some namespace sandboxing options, such as <varname>PrivatePIDs=</varname>,
<varname>ProtectControlGroups=private/strict</varname>, or <varname>PrivateNetwork=</varname>, might
entail a mount namespace for private API VFS instances. If any of the mentioned options are enabled,
the mount namespace is implicitly delegated.</para>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>

View File

@@ -3441,7 +3441,7 @@ static int apply_mount_namespace(
/* We need to make the pressure path writable even if /sys/fs/cgroup is made read-only, as the
* service will need to write to it in order to start the notifications. */
if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
read_write_paths_cleanup = strv_copy(context->read_write_paths);
if (!read_write_paths_cleanup)
return -ENOMEM;
@@ -3586,7 +3586,7 @@ static int apply_mount_namespace(
* sandbox inside the mount namespace. */
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@@ -4205,9 +4205,8 @@ static void log_command_line(
LOG_EXEC_INVOCATION_ID(params));
}
static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
assert(context);
assert(params);
return context->private_users != PRIVATE_USERS_NO ||
context->private_tmp != PRIVATE_TMP_NO ||
@@ -4229,7 +4228,7 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const E
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
exec_needs_cgroup_mount(context, params) ||
exec_needs_cgroup_mount(context) ||
context->protect_clock ||
context->protect_hostname != PROTECT_HOSTNAME_NO ||
!strv_isempty(context->read_write_paths) ||
@@ -4270,13 +4269,23 @@ static bool exec_namespace_is_delegated(
/* If we need unprivileged private users, we've already unshared a user namespace by the time we call
* setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
* unsharing in the first call to setup_delegated_namespaces() by returning false here. */
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
return false;
if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
return params->runtime_scope == RUNTIME_SCOPE_USER;
return FLAGS_SET(context->delegate_namespaces, namespace);
if (FLAGS_SET(context->delegate_namespaces, namespace))
return true;
/* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means that
* when those are delegated, mountns must be deferred too.
*
* The list should stay in sync with exec_needs_mount_namespace(). */
if (namespace == CLONE_NEWNS)
return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET);
return false;
}
static int setup_delegated_namespaces(
@@ -4355,7 +4364,7 @@ static int setup_delegated_namespaces(
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
}
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
if (unshare(CLONE_NEWCGROUP) < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
@@ -5197,7 +5206,7 @@ int exec_invoke(
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
* pressure path environment variable or read-write mount to the unit. This is why we check if
* memory_pressure_path != NULL in the conditional below. */
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
memory_pressure_path = mfree(memory_pressure_path);
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
if (r < 0) {
@@ -5364,7 +5373,7 @@ int exec_invoke(
}
}
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */

View File

@@ -232,24 +232,18 @@ bool exec_needs_ipc_namespace(const ExecContext *context) {
return context->private_ipc || context->ipc_namespace_path;
}
static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP);
}
static bool needs_cgroup_namespace(ProtectControlGroups i) {
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
}
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) {
assert(context);
/* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
* use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the
* setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded
* to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup.
*
* TODO: Remove fallback once cgroupv1 support is removed in v258. */
if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) {
* use cgroup namespace, we ignore the setting and do not unshare the namespace.
* ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
* that strict always gets a read-only mount of /sys/fs/cgroup/. */
if (needs_cgroup_namespace(context->protect_control_groups) && !ns_type_supported(NAMESPACE_CGROUP)) {
if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE)
return PROTECT_CONTROL_GROUPS_NO;
if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT)
@@ -258,22 +252,22 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context,
return context->protect_control_groups;
}
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
bool exec_needs_cgroup_namespace(const ExecContext *context) {
assert(context);
return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
return needs_cgroup_namespace(exec_get_protect_control_groups(context));
}
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
bool exec_needs_cgroup_mount(const ExecContext *context) {
assert(context);
return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO;
}
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
bool exec_is_cgroup_mount_read_only(const ExecContext *context) {
assert(context);
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
}
bool exec_needs_pid_namespace(const ExecContext *context) {
@@ -331,7 +325,7 @@ bool exec_needs_mount_namespace(
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
exec_needs_cgroup_mount(context, params) ||
exec_needs_cgroup_mount(context) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
exec_needs_ipc_namespace(context) ||

View File

@@ -631,10 +631,11 @@ bool exec_needs_network_namespace(const ExecContext *context);
bool exec_needs_ipc_namespace(const ExecContext *context);
bool exec_needs_pid_namespace(const ExecContext *context);
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context);
bool exec_needs_cgroup_namespace(const ExecContext *context);
bool exec_needs_cgroup_mount(const ExecContext *context);
bool exec_is_cgroup_mount_read_only(const ExecContext *context);
const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);
/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters

View File

@@ -26,7 +26,6 @@
#include "loopback-setup.h"
#include "missing_syscall.h"
#include "mkdir-label.h"
#include "mount-setup.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
@@ -207,14 +206,14 @@ static const MountEntry protect_control_groups_yes_table[] = {
};
/* ProtectControlGroups=private table. Note that mount_private_apivfs() always uses MS_NOSUID|MS_NOEXEC|MS_NODEV, so
* flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */
* flags is not set here. */
static const MountEntry protect_control_groups_private_table[] = {
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false },
};
/* ProtectControlGroups=strict table */
static const MountEntry protect_control_groups_strict_table[] = {
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true },
};
/* ProtectSystem=yes table */
@@ -338,7 +337,7 @@ static bool mount_entry_read_only(const MountEntry *p) {
static bool mount_entry_noexec(const MountEntry *p) {
assert(p);
return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS);
return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, MOUNT_PRIVATE_CGROUP2FS);
}
static bool mount_entry_exec(const MountEntry *p) {
@@ -1320,16 +1319,6 @@ static int mount_private_apivfs(
return r;
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
if (r == -EINVAL && opts)
/* If this failed with EINVAL then this likely means either:
* 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
* per-instance hidepid= neither, which means we really don't want to use it, since it
* would affect our host's /proc mount.
* 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
* is supported.
*
* Hence let's gracefully fallback to a classic, unrestricted version. */
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
if (ERRNO_IS_NEG_PRIVILEGE(r)) {
/* When we do not have enough privileges to mount a new instance, fall back to use an
* existing mount. */
@@ -1348,8 +1337,8 @@ static int mount_private_apivfs(
return r;
return 1;
} else if (r < 0)
}
if (r < 0)
return r;
/* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
@@ -1375,18 +1364,9 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
}
static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
_cleanup_free_ char *opts = NULL;
assert(m);
assert(p);
if (cgroupfs_recursiveprot_supported()) {
opts = strextend_with_separator(NULL, ",", mount_entry_options(m) ?: POINTER_MAX, "memory_recursiveprot");
if (!opts)
return -ENOMEM;
}
return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope);
return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope);
}
static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
@@ -1414,14 +1394,14 @@ static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
* fsopen()/fsconfig() was also backported on some distros which allows us to detect
* hidepid=/subset= support in even more scenarios. */
if (mount_option_supported("proc", "hidepid", hpv) != 0) {
if (mount_option_supported("proc", "hidepid", hpv) > 0) {
opts = strjoin("hidepid=", hpv);
if (!opts)
return -ENOMEM;
}
if (p->proc_subset == PROC_SUBSET_PID &&
mount_option_supported("proc", "subset", "pid") != 0)
mount_option_supported("proc", "subset", "pid") > 0)
if (!strextend_with_separator(&opts, ",", "subset=pid"))
return -ENOMEM;
}

View File

@@ -52,7 +52,7 @@ typedef struct MountPoint {
MountMode mode;
} MountPoint;
bool cgroupfs_recursiveprot_supported(void) {
static bool cgroupfs_recursiveprot_supported(void) {
int r;
/* Added in kernel 5.7 */

View File

@@ -8,5 +8,3 @@ bool mount_point_ignore(const char *path);
int mount_setup_early(void);
int mount_setup(bool loaded_policy, bool leave_propagation);
bool cgroupfs_recursiveprot_supported(void);

View File

@@ -35,17 +35,12 @@ testcase_network() {
systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
}
testcase_cgroup() {
(! systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure')
systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private -p DelegateNamespaces=cgroup --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure'
}
testcase_pid() {
# MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only
# so we can't write to it when running in a container.
if ! systemd-detect-virt --container; then
(! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
fi
}