mirror of
https://github.com/morgan9e/systemd
synced 2026-04-14 00:14:32 +09:00
core: delegate mountns implicitly when any of pidns/cgns/netns is in use, clean up private cgroupfs mount (#36892)
Fixes #36952
This commit is contained in:
@@ -2411,6 +2411,11 @@ RestrictNamespaces=~cgroup net</programlisting>
|
||||
done with the namespace specific unit setting such as <varname>PrivateNetwork=</varname> or
|
||||
<varname>PrivateMounts=</varname>.</para>
|
||||
|
||||
<para>Note that some namespace sandboxing options might entail a mount namespace for private API VFS instances,
|
||||
such as <varname>PrivatePIDs=</varname>, <varname>ProtectControlGroups=private/strict</varname>, or
|
||||
<varname>PrivateNetwork=</varname>. If any of the mentioned options are enabled, the mount namespace
|
||||
is implicitly delegated.</para>
|
||||
|
||||
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
@@ -3441,7 +3441,7 @@ static int apply_mount_namespace(
|
||||
|
||||
/* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
|
||||
* service will need to write to it in order to start the notifications. */
|
||||
if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
|
||||
if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
|
||||
read_write_paths_cleanup = strv_copy(context->read_write_paths);
|
||||
if (!read_write_paths_cleanup)
|
||||
return -ENOMEM;
|
||||
@@ -3586,7 +3586,7 @@ static int apply_mount_namespace(
|
||||
* sandbox inside the mount namespace. */
|
||||
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
|
||||
|
||||
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
|
||||
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
|
||||
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
|
||||
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
|
||||
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
|
||||
@@ -4205,9 +4205,8 @@ static void log_command_line(
|
||||
LOG_EXEC_INVOCATION_ID(params));
|
||||
}
|
||||
|
||||
static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
|
||||
static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
|
||||
assert(context);
|
||||
assert(params);
|
||||
|
||||
return context->private_users != PRIVATE_USERS_NO ||
|
||||
context->private_tmp != PRIVATE_TMP_NO ||
|
||||
@@ -4229,7 +4228,7 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const E
|
||||
context->protect_kernel_tunables ||
|
||||
context->protect_kernel_modules ||
|
||||
context->protect_kernel_logs ||
|
||||
exec_needs_cgroup_mount(context, params) ||
|
||||
exec_needs_cgroup_mount(context) ||
|
||||
context->protect_clock ||
|
||||
context->protect_hostname != PROTECT_HOSTNAME_NO ||
|
||||
!strv_isempty(context->read_write_paths) ||
|
||||
@@ -4270,13 +4269,23 @@ static bool exec_namespace_is_delegated(
|
||||
/* If we need unprivileged private users, we've already unshared a user namespace by the time we call
|
||||
* setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
|
||||
* unsharing in the first call to setup_delegated_namespaces() by returning false here. */
|
||||
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
|
||||
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
|
||||
return false;
|
||||
|
||||
if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
|
||||
return params->runtime_scope == RUNTIME_SCOPE_USER;
|
||||
|
||||
return FLAGS_SET(context->delegate_namespaces, namespace);
|
||||
if (FLAGS_SET(context->delegate_namespaces, namespace))
|
||||
return true;
|
||||
|
||||
/* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when
|
||||
* those are delegated, mountns must be deferred too.
|
||||
*
|
||||
* The list should stay in sync with exec_needs_mount_namespace(). */
|
||||
if (namespace == CLONE_NEWNS)
|
||||
return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int setup_delegated_namespaces(
|
||||
@@ -4355,7 +4364,7 @@ static int setup_delegated_namespaces(
|
||||
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
|
||||
}
|
||||
|
||||
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
|
||||
if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
|
||||
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
|
||||
if (unshare(CLONE_NEWCGROUP) < 0) {
|
||||
*reterr_exit_status = EXIT_NAMESPACE;
|
||||
@@ -5197,7 +5206,7 @@ int exec_invoke(
|
||||
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
|
||||
* pressure path environment variable or read-write mount to the unit. This is why we check if
|
||||
* memory_pressure_path != NULL in the conditional below. */
|
||||
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
|
||||
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
|
||||
memory_pressure_path = mfree(memory_pressure_path);
|
||||
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
|
||||
if (r < 0) {
|
||||
@@ -5364,7 +5373,7 @@ int exec_invoke(
|
||||
}
|
||||
}
|
||||
|
||||
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
|
||||
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
|
||||
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
|
||||
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
|
||||
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
|
||||
|
||||
@@ -232,24 +232,18 @@ bool exec_needs_ipc_namespace(const ExecContext *context) {
|
||||
return context->private_ipc || context->ipc_namespace_path;
|
||||
}
|
||||
|
||||
static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
|
||||
return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP);
|
||||
}
|
||||
|
||||
static bool needs_cgroup_namespace(ProtectControlGroups i) {
|
||||
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
|
||||
}
|
||||
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) {
|
||||
assert(context);
|
||||
|
||||
/* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
|
||||
* use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the
|
||||
* setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded
|
||||
* to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup.
|
||||
*
|
||||
* TODO: Remove fallback once cgroupv1 support is removed in v258. */
|
||||
if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) {
|
||||
* use cgroup namespace, we ignore the setting and do not unshare the namespace.
|
||||
* ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
|
||||
* that strict always gets a read-only mount of /sys/fs/cgroup/. */
|
||||
if (needs_cgroup_namespace(context->protect_control_groups) && !ns_type_supported(NAMESPACE_CGROUP)) {
|
||||
if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE)
|
||||
return PROTECT_CONTROL_GROUPS_NO;
|
||||
if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT)
|
||||
@@ -258,22 +252,22 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context,
|
||||
return context->protect_control_groups;
|
||||
}
|
||||
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context) {
|
||||
assert(context);
|
||||
|
||||
return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
|
||||
return needs_cgroup_namespace(exec_get_protect_control_groups(context));
|
||||
}
|
||||
|
||||
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
|
||||
bool exec_needs_cgroup_mount(const ExecContext *context) {
|
||||
assert(context);
|
||||
|
||||
return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
|
||||
return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO;
|
||||
}
|
||||
|
||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
|
||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context) {
|
||||
assert(context);
|
||||
|
||||
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
|
||||
return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
|
||||
}
|
||||
|
||||
bool exec_needs_pid_namespace(const ExecContext *context) {
|
||||
@@ -331,7 +325,7 @@ bool exec_needs_mount_namespace(
|
||||
context->protect_kernel_tunables ||
|
||||
context->protect_kernel_modules ||
|
||||
context->protect_kernel_logs ||
|
||||
exec_needs_cgroup_mount(context, params) ||
|
||||
exec_needs_cgroup_mount(context) ||
|
||||
context->protect_proc != PROTECT_PROC_DEFAULT ||
|
||||
context->proc_subset != PROC_SUBSET_ALL ||
|
||||
exec_needs_ipc_namespace(context) ||
|
||||
|
||||
@@ -631,10 +631,11 @@ bool exec_needs_network_namespace(const ExecContext *context);
|
||||
bool exec_needs_ipc_namespace(const ExecContext *context);
|
||||
bool exec_needs_pid_namespace(const ExecContext *context);
|
||||
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
|
||||
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
|
||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context);
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context);
|
||||
bool exec_needs_cgroup_mount(const ExecContext *context);
|
||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context);
|
||||
|
||||
const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);
|
||||
|
||||
/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
#include "loopback-setup.h"
|
||||
#include "missing_syscall.h"
|
||||
#include "mkdir-label.h"
|
||||
#include "mount-setup.h"
|
||||
#include "mount-util.h"
|
||||
#include "mountpoint-util.h"
|
||||
#include "namespace-util.h"
|
||||
@@ -207,14 +206,14 @@ static const MountEntry protect_control_groups_yes_table[] = {
|
||||
};
|
||||
|
||||
/* ProtectControlGroups=private table. Note mount_private_apivfs() always uses MS_NOSUID|MS_NOEXEC|MS_NODEV so
|
||||
* flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */
|
||||
* flags is not set here. */
|
||||
static const MountEntry protect_control_groups_private_table[] = {
|
||||
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
|
||||
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false },
|
||||
};
|
||||
|
||||
/* ProtectControlGroups=strict table */
|
||||
static const MountEntry protect_control_groups_strict_table[] = {
|
||||
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
|
||||
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true },
|
||||
};
|
||||
|
||||
/* ProtectSystem=yes table */
|
||||
@@ -338,7 +337,7 @@ static bool mount_entry_read_only(const MountEntry *p) {
|
||||
static bool mount_entry_noexec(const MountEntry *p) {
|
||||
assert(p);
|
||||
|
||||
return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS);
|
||||
return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, MOUNT_PRIVATE_CGROUP2FS);
|
||||
}
|
||||
|
||||
static bool mount_entry_exec(const MountEntry *p) {
|
||||
@@ -1320,16 +1319,6 @@ static int mount_private_apivfs(
|
||||
return r;
|
||||
|
||||
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
|
||||
if (r == -EINVAL && opts)
|
||||
/* If this failed with EINVAL then this likely means either:
|
||||
* 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
|
||||
* per-instance hidepid= neither, which means we really don't want to use it, since it
|
||||
* would affect our host's /proc mount.
|
||||
* 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
|
||||
* is supported.
|
||||
*
|
||||
* Hence let's gracefully fallback to a classic, unrestricted version. */
|
||||
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
|
||||
if (ERRNO_IS_NEG_PRIVILEGE(r)) {
|
||||
/* When we do not have enough privileges to mount a new instance, fall back to using an
|
||||
* existing mount. */
|
||||
@@ -1348,8 +1337,8 @@ static int mount_private_apivfs(
|
||||
return r;
|
||||
|
||||
return 1;
|
||||
|
||||
} else if (r < 0)
|
||||
}
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
|
||||
@@ -1375,18 +1364,9 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
|
||||
}
|
||||
|
||||
static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
|
||||
_cleanup_free_ char *opts = NULL;
|
||||
|
||||
assert(m);
|
||||
assert(p);
|
||||
|
||||
if (cgroupfs_recursiveprot_supported()) {
|
||||
opts = strextend_with_separator(NULL, ",", mount_entry_options(m) ?: POINTER_MAX, "memory_recursiveprot");
|
||||
if (!opts)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope);
|
||||
return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope);
|
||||
}
|
||||
|
||||
static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
|
||||
@@ -1414,14 +1394,14 @@ static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
|
||||
* fsopen()/fsconfig() was also backported on some distros which allows us to detect
|
||||
* hidepid=/subset= support in even more scenarios. */
|
||||
|
||||
if (mount_option_supported("proc", "hidepid", hpv) != 0) {
|
||||
if (mount_option_supported("proc", "hidepid", hpv) > 0) {
|
||||
opts = strjoin("hidepid=", hpv);
|
||||
if (!opts)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
if (p->proc_subset == PROC_SUBSET_PID &&
|
||||
mount_option_supported("proc", "subset", "pid") != 0)
|
||||
mount_option_supported("proc", "subset", "pid") > 0)
|
||||
if (!strextend_with_separator(&opts, ",", "subset=pid"))
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ typedef struct MountPoint {
|
||||
MountMode mode;
|
||||
} MountPoint;
|
||||
|
||||
bool cgroupfs_recursiveprot_supported(void) {
|
||||
static bool cgroupfs_recursiveprot_supported(void) {
|
||||
int r;
|
||||
|
||||
/* Added in kernel 5.7 */
|
||||
|
||||
@@ -8,5 +8,3 @@ bool mount_point_ignore(const char *path);
|
||||
|
||||
int mount_setup_early(void);
|
||||
int mount_setup(bool loaded_policy, bool leave_propagation);
|
||||
|
||||
bool cgroupfs_recursiveprot_supported(void);
|
||||
|
||||
@@ -35,17 +35,12 @@ testcase_network() {
|
||||
systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
|
||||
}
|
||||
|
||||
testcase_cgroup() {
|
||||
(! systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure')
|
||||
systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private -p DelegateNamespaces=cgroup --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure'
|
||||
}
|
||||
|
||||
testcase_pid() {
|
||||
# MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only
|
||||
# so we can't write to it when running in a container.
|
||||
if ! systemd-detect-virt --container; then
|
||||
(! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
|
||||
systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
|
||||
systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user