core: Add DelegateNamespaces=

This delegates one or more namespaces to the service. Concretely,
this setting influences in which order we unshare namespaces. Delegated
namespaces are unshared *after* the user namespace is unshared. Other
namespaces are unshared *before* the user namespace is unshared.

Fixes #35369
This commit is contained in:
Daan De Meyer
2025-02-04 15:48:36 +01:00
parent 7904c1dbe6
commit 8234cd9989
12 changed files with 295 additions and 36 deletions

View File

@@ -3358,6 +3358,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t DelegateNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
@@ -3963,6 +3965,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property RestrictNamespaces is not documented!-->
<!--property DelegateNamespaces is not documented!-->
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
@@ -4685,6 +4689,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
@@ -5559,6 +5565,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t DelegateNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
@@ -6176,6 +6184,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property RestrictNamespaces is not documented!-->
<!--property DelegateNamespaces is not documented!-->
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
@@ -6870,6 +6880,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
@@ -7576,6 +7588,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t DelegateNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
@@ -8123,6 +8137,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property RestrictNamespaces is not documented!-->
<!--property DelegateNamespaces is not documented!-->
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
@@ -8733,6 +8749,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
@@ -9566,6 +9584,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t DelegateNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
@@ -10095,6 +10115,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property RestrictNamespaces is not documented!-->
<!--property DelegateNamespaces is not documented!-->
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
@@ -10687,6 +10709,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="DelegateNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
@@ -12385,7 +12409,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ProtectControlGroupsEx</varname>,
<varname>PrivateUsersEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname> and <function>RemoveSubGroup()</function> were added in version 258.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>, and
<function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>
@@ -12429,7 +12455,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>, and
<function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Mount Unit Objects</title>
@@ -12471,6 +12499,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>,
<function>RemoveSubgroup()</function>,
<varname>ReloadResult</varname>, and
<varname>CleanResult</varname> were added in version 258.</para>
@@ -12514,7 +12543,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>, and
<function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Slice Unit Objects</title>

View File

@@ -2375,6 +2375,43 @@ RestrictNamespaces=~cgroup net</programlisting>
<xi:include href="version-info.xml" xpointer="v233"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>DelegateNamespaces=</varname></term>
<listitem><para>Delegates ownership of the given namespace types to the user namespace of the
processes of this unit. For details about Linux namespaces, see <citerefentry
project='man-pages'><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>.
Either takes a boolean argument, or a space-separated list of namespace type identifiers. If false
(the default), the unit's processes' user namespace will not have ownership over any namespaces
created during setup of the unit's sandboxed environment. If true, ownership of all namespace types
(except for user namespaces, where the concept doesn't apply) created during setup of the unit's
sandboxed environment is delegated to the unit's processes' user namespace. Otherwise, a
space-separated list of namespace type identifiers must be specified, consisting of any combination
of: <constant>cgroup</constant>, <constant>ipc</constant>, <constant>net</constant>,
<constant>mnt</constant>, <constant>pid</constant>, and <constant>uts</constant>. All namespaces of
the listed types will be owned by the unit's processes' user namespace if they are created during
setup of the unit's sandboxed environment (allow-listing). By prepending the list with a single tilde
character (<literal>~</literal>) the effect may be inverted: all namespaces of types not listed and
created during setup of the unit's sandboxed environment will be owned by the unit's processes' user
namespace (deny-listing). If the empty string is assigned, the default namespace ownership is
applied, which is equivalent to false. This option may appear more than once, in which case the
namespace types are merged by <constant>OR</constant>, or by <constant>AND</constant> if the lines
are prefixed with <literal>~</literal> (see examples below). Internally, this setting controls the
order in which namespaces are unshared by systemd. Delegated namespaces, i.e. namespace types that
should be owned by the unit's processes' user namespace, will be unshared after the user namespace is
unshared. Other namespaces will be unshared before the user namespace is unshared.</para>
<para>Delegating any namespace with <varname>DelegateNamespaces=</varname> implies
<varname>PrivateUsers=self</varname> unless <varname>PrivateUsers=</varname> is explicitly enabled
already by the unit. Delegating a namespace does not imply that the namespace is unshared; unsharing
is configured with the namespace-specific unit settings such as <varname>PrivateNetwork=</varname> or
<varname>PrivateMounts=</varname>.</para>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>LockPersonality=</varname></term>

View File

@@ -1263,6 +1263,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DelegateNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, delegate_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictFileSystems", "(bas)", property_get_restrict_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@@ -2194,6 +2195,9 @@ int bus_exec_context_set_transient_property(
if (streq(name, "RestrictNamespaces"))
return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error);
if (streq(name, "DelegateNamespaces"))
return bus_set_transient_namespace_flag(u, name, &c->delegate_namespaces, message, flags, error);
if (streq(name, "RestrictFileSystems")) {
int allow_list;
_cleanup_strv_free_ char **l = NULL;

View File

@@ -4209,13 +4209,56 @@ static bool exec_context_need_unprivileged_private_users(
!strv_isempty(context->read_only_paths) ||
!strv_isempty(context->inaccessible_paths) ||
!strv_isempty(context->exec_paths) ||
!strv_isempty(context->no_exec_paths);
!strv_isempty(context->no_exec_paths) ||
context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL;
}
/* Resolves the PrivateUsers= mode that is actually applied to the unit: an explicit setting is returned
 * unchanged, otherwise PRIVATE_USERS_SELF is implied when unprivileged private users are needed or when
 * any namespace is delegated with DelegateNamespaces=, and PRIVATE_USERS_NO is returned in all other
 * cases. */
static PrivateUsers exec_context_get_effective_private_users(
                const ExecContext *context,
                const ExecParameters *params) {

        assert(context);
        assert(params);

        /* An explicitly configured PrivateUsers= always takes precedence. */
        if (context->private_users != PRIVATE_USERS_NO)
                return context->private_users;

        /* Delegating any namespace requires a user namespace to delegate to, hence it counts as an
         * implicit request as well. */
        bool implied =
                exec_context_need_unprivileged_private_users(context, params) ||
                context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL;

        return implied ? PRIVATE_USERS_SELF : PRIVATE_USERS_NO;
}
/* Tells whether the given CLONE_NEWxyz namespace type shall be unshared as a delegated namespace, i.e. in
 * the second setup_delegated_namespaces() pass. Must not be called for CLONE_NEWUSER. */
static bool exec_namespace_is_delegated(
                const ExecContext *context,
                const ExecParameters *params,
                unsigned long namespace) {

        assert(context);
        assert(params);
        assert(namespace != CLONE_NEWUSER);

        /* When unprivileged private users are needed, the user namespace has already been unshared by
         * the time setup_delegated_namespaces() is called for the first time. Report every namespace as
         * not delegated then, so that all other namespace unsharing happens in that first call. */
        if (exec_context_need_unprivileged_private_users(context, params))
                return false;

        /* An unset DelegateNamespaces= delegates nothing; otherwise consult the configured mask. */
        return context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL &&
               FLAGS_SET(context->delegate_namespaces, namespace);
}
static int setup_delegated_namespaces(
const ExecContext *context,
ExecParameters *params,
ExecRuntime *runtime,
bool delegate,
const char *memory_pressure_path,
uid_t uid,
uid_t gid,
@@ -4226,16 +4269,25 @@ static int setup_delegated_namespaces(
int r;
/* This function is called twice, once before unsharing the user namespace, and once after unsharing
* the user namespace. When called before unsharing the user namespace, "delegate" is set to "false".
* When called after unsharing the user namespace, "delegate" is set to "true". The net effect is
* that all namespaces that should not be delegated are unshared when this function is called the
* first time and all namespaces that should be delegated are unshared when this function is called
* the second time. */
assert(context);
assert(params);
assert(reterr_exit_status);
if (exec_needs_network_namespace(context) &&
exec_namespace_is_delegated(context, params, CLONE_NEWNET) == delegate &&
runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
/* Try to enable network namespacing if network namespacing is available and we have
* CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
* new network namespace. And if we don't have that, then we could only create a network
* CAP_NET_ADMIN in the current user namespace (either the system manager one or the unit's
* own user namespace). We need CAP_NET_ADMIN to be able to configure the loopback device in
* the new network namespace. And if we don't have that, then we could only create a network
* namespace without the ability to set up "lo". Hence gracefully skip things then. */
if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
@@ -4245,7 +4297,8 @@ static int setup_delegated_namespaces(
else if (r < 0) {
*reterr_exit_status = EXIT_NETWORK;
return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
}
} else
log_exec_debug(context, params, "Set up %snetwork namespace", delegate ? "delegated " : "");
} else if (context->network_namespace_path) {
*reterr_exit_status = EXIT_NETWORK;
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
@@ -4254,8 +4307,9 @@ static int setup_delegated_namespaces(
log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
}
if (exec_needs_ipc_namespace(context) && runtime &&
runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
if (exec_needs_ipc_namespace(context) &&
exec_namespace_is_delegated(context, params, CLONE_NEWIPC) == delegate &&
runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
if (ns_type_supported(NAMESPACE_IPC)) {
r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
@@ -4265,7 +4319,8 @@ static int setup_delegated_namespaces(
else if (r < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
}
} else
log_exec_debug(context, params, "Set up %sIPC namespace", delegate ? "delegated " : "");
} else if (context->ipc_namespace_path) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
@@ -4274,16 +4329,20 @@ static int setup_delegated_namespaces(
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
}
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
exec_namespace_is_delegated(context, params, CLONE_NEWCGROUP) == delegate) {
if (unshare(CLONE_NEWCGROUP) < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m");
}
log_exec_debug(context, params, "Set up %scgroup namespace", delegate ? "delegated " : "");
}
/* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
* Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
if (needs_sandboxing && exec_needs_pid_namespace(context)) {
if (needs_sandboxing && exec_needs_pid_namespace(context) &&
exec_namespace_is_delegated(context, params, CLONE_NEWPID) == delegate) {
if (params->pidref_transport_fd < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
@@ -4313,11 +4372,14 @@ static int setup_delegated_namespaces(
*reterr_exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, r, "Failed to set up pid namespace: %m");
}
log_exec_debug(context, params, "Set up %spid namespace", delegate ? "delegated " : "");
}
/* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */
if (exec_needs_mount_namespace(context, params, runtime)) {
if (exec_needs_mount_namespace(context, params, runtime) &&
exec_namespace_is_delegated(context, params, CLONE_NEWNS) == delegate) {
_cleanup_free_ char *error_path = NULL;
r = apply_mount_namespace(command->flags,
@@ -4334,12 +4396,16 @@ static int setup_delegated_namespaces(
return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
error_path ? ": " : "", strempty(error_path));
}
log_exec_debug(context, params, "Set up %smount namespace", delegate ? "delegated " : "");
}
if (needs_sandboxing) {
if (needs_sandboxing && exec_namespace_is_delegated(context, params, CLONE_NEWUTS) == delegate) {
r = apply_protect_hostname(context, params, reterr_exit_status);
if (r < 0)
return r;
log_exec_debug(context, params, "Set up %sUTS namespace", delegate ? "delegated " : "");
}
return 0;
@@ -4531,7 +4597,6 @@ int exec_invoke(
char **final_argv = NULL;
dev_t journal_stream_dev = 0;
ino_t journal_stream_ino = 0;
bool userns_set_up = false;
bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
needs_mount_namespace; /* Do we need to set up a mount namespace for this kernel? */
@@ -5264,9 +5329,7 @@ int exec_invoke(
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
PrivateUsers pu = context->private_users;
if (pu == PRIVATE_USERS_NO)
pu = PRIVATE_USERS_SELF;
PrivateUsers pu = exec_context_get_effective_private_users(context, params);
/* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
* unprivileged user namespaces. */
@@ -5281,14 +5344,16 @@ int exec_invoke(
log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
else {
assert(r > 0);
userns_set_up = true;
log_debug("Set up unprivileged user namespace");
}
}
/* Call setup_delegated_namespaces() the first time to unshare all non-delegated namespaces. */
r = setup_delegated_namespaces(
context,
params,
runtime,
/* delegate= */ false,
memory_pressure_path,
uid,
gid,
@@ -5331,15 +5396,35 @@ int exec_invoke(
* case of mount namespaces being less privileged when the mount point list is copied from a
* different user namespace). */
if (needs_sandboxing && !userns_set_up) {
r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid,
/* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL);
if (needs_sandboxing && !exec_context_need_unprivileged_private_users(context, params)) {
PrivateUsers pu = exec_context_get_effective_private_users(context, params);
r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
/* allow_setgroups= */ pu == PRIVATE_USERS_FULL);
if (r < 0) {
*exit_status = EXIT_USER;
return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
}
log_debug("Set up privileged user namespace");
}
/* Call setup_delegated_namespaces() the second time to unshare all delegated namespaces. */
r = setup_delegated_namespaces(
context,
params,
runtime,
/* delegate= */ true,
memory_pressure_path,
uid,
gid,
command,
needs_sandboxing,
has_cap_sys_admin,
exit_status);
if (r < 0)
return r;
/* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
* shall execute. */

View File

@@ -2474,6 +2474,12 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
return r;
}
if (c->delegate_namespaces != NAMESPACE_FLAGS_INITIAL) {
r = serialize_item_format(f, "exec-context-delegate-namespaces", "%lu", c->delegate_namespaces);
if (r < 0)
return r;
}
#if HAVE_LIBBPF
if (exec_context_restrict_filesystems_set(c)) {
char *fs;
@@ -3536,6 +3542,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
r = safe_atolu(val, &c->restrict_namespaces);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-delegate-namespaces="))) {
r = safe_atolu(val, &c->delegate_namespaces);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-restrict-filesystems="))) {
r = set_ensure_allocated(&c->restrict_filesystems, &string_hash_ops);
if (r < 0)

View File

@@ -611,6 +611,7 @@ void exec_context_init(ExecContext *c) {
.timeout_clean_usec = USEC_INFINITY,
.capability_bounding_set = CAP_MASK_UNSET,
.restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
.delegate_namespaces = NAMESPACE_FLAGS_INITIAL,
.log_level_max = -1,
#if HAVE_SECCOMP
.syscall_errno = SECCOMP_ERROR_NUMBER_KILL,

View File

@@ -350,6 +350,7 @@ struct ExecContext {
unsigned long personality;
unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */
unsigned long delegate_namespaces; /* The CLONE_NEWxyz flags delegated to the unit's processes */
Set *restrict_filesystems;
bool restrict_filesystems_allow_list:1;

View File

@@ -73,7 +73,8 @@
{{type}}.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof({{type}}, exec_context)
{{type}}.SystemCallLog, config_parse_syscall_log, 0, offsetof({{type}}, exec_context)
{{type}}.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof({{type}}, exec_context.memory_deny_write_execute)
{{type}}.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof({{type}}, exec_context)
{{type}}.RestrictNamespaces, config_parse_namespace_flags, 0, offsetof({{type}}, exec_context.restrict_namespaces)
{{type}}.DelegateNamespaces, config_parse_namespace_flags, 0, offsetof({{type}}, exec_context.delegate_namespaces)
{{type}}.RestrictRealtime, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_realtime)
{{type}}.RestrictSUIDSGID, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_suid_sgid)
{{type}}.RestrictAddressFamilies, config_parse_address_families, 0, offsetof({{type}}, exec_context)

View File

@@ -3566,7 +3566,7 @@ int config_parse_address_families(
}
}
int config_parse_restrict_namespaces(
int config_parse_namespace_flags(
const char *unit,
const char *filename,
unsigned line,
@@ -3578,24 +3578,25 @@ int config_parse_restrict_namespaces(
void *data,
void *userdata) {
ExecContext *c = data;
unsigned long flags;
unsigned long *flags = data;
unsigned long all = UPDATE_FLAG(NAMESPACE_FLAGS_ALL, CLONE_NEWUSER, !streq(lvalue, "DelegateNamespaces"));
unsigned long f;
bool invert = false;
int r;
if (isempty(rvalue)) {
/* Reset to the default. */
c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
*flags = NAMESPACE_FLAGS_INITIAL;
return 0;
}
/* Boolean parameter ignores the previous settings */
r = parse_boolean(rvalue);
if (r > 0) {
c->restrict_namespaces = 0;
/* RestrictNamespaces= is stored with inverse semantics (the field holds the namespace types that
 * remain permitted), so "yes" means an empty mask there. DelegateNamespaces= is stored directly,
 * so "yes" has to select every delegatable namespace type instead. */
*flags = streq(lvalue, "DelegateNamespaces") ? all : 0;
return 0;
} else if (r == 0) {
c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
/* Conversely, "no" permits everything for RestrictNamespaces= and delegates nothing for
 * DelegateNamespaces=. */
*flags = streq(lvalue, "DelegateNamespaces") ? 0 : all;
return 0;
}
@@ -3605,18 +3606,25 @@ int config_parse_restrict_namespaces(
}
/* Not a boolean argument, in this case it's a list of namespace types. */
r = namespace_flags_from_string(rvalue, &flags);
r = namespace_flags_from_string(rvalue, &f);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue);
return 0;
}
if (c->restrict_namespaces == NAMESPACE_FLAGS_INITIAL)
if (*flags == NAMESPACE_FLAGS_INITIAL)
/* Initial assignment. Just set the value. */
c->restrict_namespaces = invert ? (~flags) & NAMESPACE_FLAGS_ALL : flags;
f = invert ? (~f) & all : f;
else
/* Merge the value with the previous one. */
SET_FLAG(c->restrict_namespaces, flags, !invert);
f = UPDATE_FLAG(*flags, f, !invert);
if (FLAGS_SET(f, CLONE_NEWUSER) && streq(lvalue, "DelegateNamespaces")) {
log_syntax(unit, LOG_WARNING, filename, line, r, "The user namespace cannot be delegated with DelegateNamespaces=, ignoring: %s", rvalue);
return 0;
}
*flags = f;
return 0;
}
@@ -6359,7 +6367,7 @@ void unit_dump_config_items(FILE *f) {
{ config_parse_syscall_errno, "ERRNO" },
{ config_parse_syscall_log, "SYSCALLS" },
{ config_parse_address_families, "FAMILIES" },
{ config_parse_restrict_namespaces, "NAMESPACES" },
{ config_parse_namespace_flags, "NAMESPACES" },
#endif
{ config_parse_restrict_filesystems, "FILESYSTEMS" },
{ config_parse_cpu_shares, "SHARES" },

View File

@@ -127,7 +127,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_working_directory);
CONFIG_PARSER_PROTOTYPE(config_parse_fdname);
CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat);
CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
CONFIG_PARSER_PROTOTYPE(config_parse_namespace_flags);
CONFIG_PARSER_PROTOTYPE(config_parse_restrict_filesystems);
CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);

View File

@@ -1667,7 +1667,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
return 1;
}
if (streq(field, "RestrictNamespaces")) {
if (STR_IN_SET(field, "RestrictNamespaces",
"DelegateNamespaces")) {
bool invert = false;
unsigned long flags;

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
# shellcheck disable=SC2016
set -eux
set -o pipefail
# shellcheck source=test/units/test-control.sh
. "$(dirname "$0")"/test-control.sh
# shellcheck source=test/units/util.sh
. "$(dirname "$0")"/util.sh
testcase_mount() {
    # Bind mounting inside the private mount namespace is expected to fail unless the mount
    # namespace is delegated to the unit's user namespace.
    local -a unit=(systemd-run -p PrivateUsersEx=self -p PrivateMounts=yes)
    local -a payload=(mount --bind /usr /home)

    (! "${unit[@]}" --wait --pipe -- "${payload[@]}")
    "${unit[@]}" -p DelegateNamespaces=mnt --wait --pipe -- "${payload[@]}"
}
testcase_network() {
    # Creating a veth pair inside the private network namespace is expected to fail unless the
    # network namespace is delegated to the unit's user namespace.
    (! systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes --wait --pipe -- ip link add veth1 type veth peer name veth2)
    # The positive case must unshare and delegate the *network* namespace. Delegating only the mount
    # namespace would run "ip link add" against a network namespace the unit does not own.
    systemd-run -p PrivateUsersEx=self -p PrivateNetwork=yes -p DelegateNamespaces=net --wait --pipe -- ip link add veth1 type veth peer name veth2
}
testcase_cgroup() {
    # Writing to cgroup.pressure inside the private cgroup namespace is expected to fail unless the
    # cgroup namespace is delegated to the unit's user namespace.
    local write_pressure='echo 0 >/sys/fs/cgroup/cgroup.pressure'
    local -a unit=(systemd-run -p PrivateUsersEx=self -p ProtectControlGroupsEx=private)

    (! "${unit[@]}" --wait --pipe -- sh -c "$write_pressure")
    "${unit[@]}" -p DelegateNamespaces=cgroup --wait --pipe -- sh -c "$write_pressure"
}
testcase_pid() {
    # Writing ns_last_pid inside the private PID namespace is expected to fail unless the PID
    # namespace is delegated to the unit's user namespace.
    local write_last_pid='echo 5 >/proc/sys/kernel/ns_last_pid'
    local -a unit=(systemd-run -p PrivateUsersEx=self -p PrivatePIDs=yes)

    (! "${unit[@]}" --wait --pipe -- sh -c "$write_last_pid")
    "${unit[@]}" -p DelegateNamespaces=pid --wait --pipe -- sh -c "$write_last_pid"
}
testcase_uts() {
    # Changing the hostname inside the private UTS namespace is expected to fail unless the UTS
    # namespace is delegated to the unit's user namespace.
    local -a unit=(systemd-run -p PrivateUsersEx=self -p ProtectHostnameEx=private)

    (! "${unit[@]}" --wait --pipe -- hostname abc)
    "${unit[@]}" -p DelegateNamespaces=uts --wait --pipe -- hostname abc
}
testcase_implied_private_users_self() {
    # If PrivateUsers= is not explicitly set, DelegateNamespaces= implies PrivateUsers=self.
    systemd-run -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
    # If PrivateUsers= is explicitly set, it is not overridden.
    systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait --pipe -- mount --bind /usr /home
    # The kernel pads the columns of uid_map with spaces, so split the line into its three fields
    # instead of comparing the raw text, which would depend on the exact column widths.
    systemd-run -p PrivateUsersEx=identity -p PrivateMounts=yes -p DelegateNamespaces=mnt --wait \
        bash -c 'read -r inside outside count </proc/self/uid_map && test "$inside $outside $count" == "0 0 65536"'
}
testcase_multiple_features() {
    # Combine DelegateNamespaces=yes with a broad set of sandboxing options on top of an unpacked
    # minimal image, and check that a command can still be executed inside that root directory.
    local root=/tmp/TEST-07-PID1-delegate-namespaces-root
    local -a properties=(
        PrivatePIDs=yes
        "RootDirectory=$root"
        ProcSubset=pid
        BindReadOnlyPaths=/usr/share
        NoNewPrivileges=yes
        ProtectSystem=strict
        User=testuser
        Group=testuser
        RuntimeDirectory=abc
        StateDirectory=qed
        InaccessiblePaths=/usr/include
        TemporaryFileSystem=/home
        PrivateTmp=yes
        PrivateDevices=yes
        PrivateNetwork=yes
        PrivateUsersEx=self
        PrivateIPC=yes
        ProtectHostname=yes
        ProtectClock=yes
        ProtectKernelTunables=yes
        ProtectKernelModules=yes
        ProtectKernelLogs=yes
        ProtectControlGroupsEx=private
        LockPersonality=yes
        Environment=ABC=QED
        DelegateNamespaces=yes
    )
    local -a args=()
    local property

    # Turn every property into a "-p NAME=VALUE" pair, keeping the order of the list above.
    for property in "${properties[@]}"; do
        args+=(-p "$property")
    done

    unsquashfs -no-xattrs -d "$root" /usr/share/minimal_0.raw

    systemd-run "${args[@]}" --wait --pipe grep MARKER=1 /etc/os-release

    rm -rf "$root"
}