diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index fa6b965101..bf4f223a43 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -2411,6 +2411,11 @@ RestrictNamespaces=~cgroup net
done with the namespace specific unit setting such as PrivateNetwork= or
PrivateMounts=.
+ Note that some namespace sandboxing options may imply a mount namespace in order to set up private API VFS instances,
for example PrivatePIDs=, ProtectControlGroups=private/strict, or
PrivateNetwork=. If the namespace belonging to any of these options is delegated, the mount namespace
is implicitly delegated as well.
+
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 044a1da437..d9878e6088 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -3441,7 +3441,7 @@ static int apply_mount_namespace(
/* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
* service will need to write to it in order to start the notifications. */
- if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+ if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
read_write_paths_cleanup = strv_copy(context->read_write_paths);
if (!read_write_paths_cleanup)
return -ENOMEM;
@@ -3586,7 +3586,7 @@ static int apply_mount_namespace(
* sandbox inside the mount namespace. */
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
- .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
+ .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@@ -4205,9 +4205,8 @@ static void log_command_line(
LOG_EXEC_INVOCATION_ID(params));
}
-static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
+static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
assert(context);
- assert(params);
return context->private_users != PRIVATE_USERS_NO ||
context->private_tmp != PRIVATE_TMP_NO ||
@@ -4229,7 +4228,7 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const E
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
- exec_needs_cgroup_mount(context, params) ||
+ exec_needs_cgroup_mount(context) ||
context->protect_clock ||
context->protect_hostname != PROTECT_HOSTNAME_NO ||
!strv_isempty(context->read_write_paths) ||
@@ -4270,13 +4269,23 @@ static bool exec_namespace_is_delegated(
/* If we need unprivileged private users, we've already unshared a user namespace by the time we call
* setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
* unsharing in the first call to setup_delegated_namespaces() by returning false here. */
- if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
+ if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
return false;
if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
return params->runtime_scope == RUNTIME_SCOPE_USER;
- return FLAGS_SET(context->delegate_namespaces, namespace);
+ if (FLAGS_SET(context->delegate_namespaces, namespace))
+ return true;
+
+ /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means that when
+ * those are delegated, mountns must be deferred too.
+ *
+ * The list should stay in sync with exec_needs_mount_namespace(). */
+ if (namespace == CLONE_NEWNS)
+ return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET);
+
+ return false;
}
static int setup_delegated_namespaces(
@@ -4355,7 +4364,7 @@ static int setup_delegated_namespaces(
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
}
- if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
+ if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
if (unshare(CLONE_NEWCGROUP) < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
@@ -5197,7 +5206,7 @@ int exec_invoke(
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
* pressure path environment variable or read-write mount to the unit. This is why we check if
* memory_pressure_path != NULL in the conditional below. */
- if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+ if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
memory_pressure_path = mfree(memory_pressure_path);
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
if (r < 0) {
@@ -5364,7 +5373,7 @@ int exec_invoke(
}
}
- if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
+ if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
diff --git a/src/core/execute.c b/src/core/execute.c
index e6a45e3c13..3bc80cd643 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -232,24 +232,18 @@ bool exec_needs_ipc_namespace(const ExecContext *context) {
return context->private_ipc || context->ipc_namespace_path;
}
-static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
- return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP);
-}
-
static bool needs_cgroup_namespace(ProtectControlGroups i) {
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
}
-ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) {
assert(context);
/* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
- * use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the
- * setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded
- * to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup.
- *
- * TODO: Remove fallback once cgroupv1 support is removed in v258. */
- if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) {
+ * use cgroup namespace, we ignore the setting and do not unshare the namespace.
+ * ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
+ * that strict always gets a read-only mount of /sys/fs/cgroup/. */
+ if (needs_cgroup_namespace(context->protect_control_groups) && !ns_type_supported(NAMESPACE_CGROUP)) {
if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE)
return PROTECT_CONTROL_GROUPS_NO;
if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT)
@@ -258,22 +252,22 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context,
return context->protect_control_groups;
}
-bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
+bool exec_needs_cgroup_namespace(const ExecContext *context) {
assert(context);
- return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
+ return needs_cgroup_namespace(exec_get_protect_control_groups(context));
}
-bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
+bool exec_needs_cgroup_mount(const ExecContext *context) {
assert(context);
- return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
+ return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO;
}
-bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
+bool exec_is_cgroup_mount_read_only(const ExecContext *context) {
assert(context);
- return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
+ return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
}
bool exec_needs_pid_namespace(const ExecContext *context) {
@@ -331,7 +325,7 @@ bool exec_needs_mount_namespace(
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
- exec_needs_cgroup_mount(context, params) ||
+ exec_needs_cgroup_mount(context) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
exec_needs_ipc_namespace(context) ||
diff --git a/src/core/execute.h b/src/core/execute.h
index af2d972406..78f04d5173 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -631,10 +631,11 @@ bool exec_needs_network_namespace(const ExecContext *context);
bool exec_needs_ipc_namespace(const ExecContext *context);
bool exec_needs_pid_namespace(const ExecContext *context);
-ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
-bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
-bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
-bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context);
+bool exec_needs_cgroup_namespace(const ExecContext *context);
+bool exec_needs_cgroup_mount(const ExecContext *context);
+bool exec_is_cgroup_mount_read_only(const ExecContext *context);
+
const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);
/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 56a3f93c3e..7e131b1425 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -26,7 +26,6 @@
#include "loopback-setup.h"
#include "missing_syscall.h"
#include "mkdir-label.h"
-#include "mount-setup.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
@@ -207,14 +206,14 @@ static const MountEntry protect_control_groups_yes_table[] = {
};
/* ProtectControlGroups=private table. Note mount_private_apivfs() always use MS_NOSUID|MS_NOEXEC|MS_NODEV so
- * flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */
+ * flags is not set here. */
static const MountEntry protect_control_groups_private_table[] = {
- { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
+ { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false },
};
/* ProtectControlGroups=strict table */
static const MountEntry protect_control_groups_strict_table[] = {
- { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
+ { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true },
};
/* ProtectSystem=yes table */
@@ -338,7 +337,7 @@ static bool mount_entry_read_only(const MountEntry *p) {
static bool mount_entry_noexec(const MountEntry *p) {
assert(p);
- return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS);
+ return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, MOUNT_PRIVATE_CGROUP2FS);
}
static bool mount_entry_exec(const MountEntry *p) {
@@ -1320,16 +1319,6 @@ static int mount_private_apivfs(
return r;
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
- if (r == -EINVAL && opts)
- /* If this failed with EINVAL then this likely means either:
- * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
- * per-instance hidepid= neither, which means we really don't want to use it, since it
- * would affect our host's /proc mount.
- * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
- * is supported.
- *
- * Hence let's gracefully fallback to a classic, unrestricted version. */
- r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
if (ERRNO_IS_NEG_PRIVILEGE(r)) {
/* When we do not have enough privileges to mount a new instance, fall back to use an
* existing mount. */
@@ -1348,8 +1337,8 @@ static int mount_private_apivfs(
return r;
return 1;
-
- } else if (r < 0)
+ }
+ if (r < 0)
return r;
/* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
@@ -1375,18 +1364,9 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
}
static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
- _cleanup_free_ char *opts = NULL;
-
assert(m);
assert(p);
-
- if (cgroupfs_recursiveprot_supported()) {
- opts = strextend_with_separator(NULL, ",", mount_entry_options(m) ?: POINTER_MAX, "memory_recursiveprot");
- if (!opts)
- return -ENOMEM;
- }
-
- return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope);
+ return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope);
}
static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
@@ -1414,14 +1394,14 @@ static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
* fsopen()/fsconfig() was also backported on some distros which allows us to detect
* hidepid=/subset= support in even more scenarios. */
- if (mount_option_supported("proc", "hidepid", hpv) != 0) {
+ if (mount_option_supported("proc", "hidepid", hpv) > 0) {
opts = strjoin("hidepid=", hpv);
if (!opts)
return -ENOMEM;
}
if (p->proc_subset == PROC_SUBSET_PID &&
- mount_option_supported("proc", "subset", "pid") != 0)
+ mount_option_supported("proc", "subset", "pid") > 0)
if (!strextend_with_separator(&opts, ",", "subset=pid"))
return -ENOMEM;
}
diff --git a/src/shared/mount-setup.c b/src/shared/mount-setup.c
index e7712fa274..ad2327e084 100644
--- a/src/shared/mount-setup.c
+++ b/src/shared/mount-setup.c
@@ -52,7 +52,7 @@ typedef struct MountPoint {
MountMode mode;
} MountPoint;
-bool cgroupfs_recursiveprot_supported(void) {
+static bool cgroupfs_recursiveprot_supported(void) {
int r;
/* Added in kernel 5.7 */
diff --git a/src/shared/mount-setup.h b/src/shared/mount-setup.h
index c07fe86364..34de1dad0b 100644
--- a/src/shared/mount-setup.h
+++ b/src/shared/mount-setup.h
@@ -8,5 +8,3 @@ bool mount_point_ignore(const char *path);
int mount_setup_early(void);
int mount_setup(bool loaded_policy, bool leave_propagation);
-
-bool cgroupfs_recursiveprot_supported(void);
diff --git a/test/units/TEST-07-PID1.delegate-namespaces.sh b/test/units/TEST-07-PID1.delegate-namespaces.sh
index 9bd9691197..6d8d51caff 100755
--- a/test/units/TEST-07-PID1.delegate-namespaces.sh
+++ b/test/units/TEST-07-PID1.delegate-namespaces.sh
@@ -35,17 +35,12 @@ testcase_network() {
systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
}
-testcase_cgroup() {
- (! systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure')
- systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private -p DelegateNamespaces=cgroup --wait --pipe -- sh -c 'echo 0 >/sys/fs/cgroup/cgroup.pressure'
-}
-
testcase_pid() {
# MountAPIVFS=yes always bind mounts child mounts of APIVFS filesystems, which means /proc/sys is always read-only
# so we can't write to it when running in a container.
if ! systemd-detect-virt --container; then
(! systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid')
- systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces="mnt pid" --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
+ systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes -p MountAPIVFS=yes -p DelegateNamespaces=pid --wait --pipe -- sh -c 'echo 5 >/proc/sys/kernel/ns_last_pid'
fi
}