core: Introduce PrivateBPF= to mount a private BPFFS

Add a new option PrivateBPF= to mount a new instance of bpffs within a
namespace.
PrivateBPF= can be set to "no" to use the host bpffs in readonly mode
and "yes" to do a new mount.
The mount is done with the new fsopen()/fsmount() API because in future
we'll hook some commands between the two calls.
This commit is contained in:
Matteo Croce
2025-06-27 14:17:00 +02:00
parent 2c7dabff50
commit 3a47437fc9
14 changed files with 284 additions and 7 deletions

View File

@@ -3374,6 +3374,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivateBPF = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
@@ -3975,6 +3977,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property ProcSubset is not documented!-->
<!--property PrivateBPF is not documented!-->
<!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
@@ -4701,6 +4705,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@@ -5583,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivateBPF = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
@@ -6204,6 +6212,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property ProcSubset is not documented!-->
<!--property PrivateBPF is not documented!-->
<!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
@@ -6910,6 +6920,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@@ -7616,6 +7628,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivateBPF = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
@@ -8159,6 +8173,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property ProcSubset is not documented!-->
<!--property PrivateBPF is not documented!-->
<!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
@@ -8773,6 +8789,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@@ -9612,6 +9630,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivateBPF = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
@@ -10137,6 +10157,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property ProcSubset is not documented!-->
<!--property PrivateBPF is not documented!-->
<!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
@@ -10733,6 +10755,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@@ -12316,6 +12340,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>,
<varname>PrivateBPF</varname>,
<function>RemoveSubGroup()</function>,
<varname>StateDirectoryQuota</varname>,
<varname>StateDirectoryQuotaUsage</varname>,
@@ -12374,6 +12399,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PassPIDFD</varname>,
<varname>AcceptFileDescriptors</varname>,
<varname>DelegateNamespaces</varname>,
<varname>PrivateBPF</varname>,
<function>RemoveSubgroup()</function>,
<varname>DeferTrigger</varname>,
<varname>DeferTriggerMaxUSec</varname>,
@@ -12429,6 +12455,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>,
<varname>PrivateBPF</varname>,
<function>RemoveSubgroup()</function>,
<varname>ReloadResult</varname>,
<varname>CleanResult</varname>,
@@ -12484,6 +12511,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>,
<varname>PrivateBPF</varname>,
<function>RemoveSubgroup()</function>,
<varname>StateDirectoryQuota</varname>,
<varname>StateDirectoryQuotaUsage</varname>,

View File

@@ -2555,6 +2555,16 @@ RestrictNamespaces=~cgroup net</programlisting>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>PrivateBPF=</varname></term>
<listitem><para>Takes a boolean argument. If set, mount a private instance of the BPF filesystem
on <filename>/sys/fs/bpf/</filename>. Otherwise, if <varname>ProtectKernelTunables=</varname> is set,
the instance from the host is inherited but mounted read-only. Defaults to false.</para>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>LockPersonality=</varname></term>

View File

@@ -54,6 +54,7 @@ BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_preserve_mode, exec_preserve_
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_private_bpf, private_bpf, PrivateBPF);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
@@ -1316,6 +1317,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostname", "b", property_get_protect_hostname, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostnameEx", "(ss)", property_get_protect_hostname_ex, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateBPF", "s", property_get_private_bpf, offsetof(ExecContext, private_bpf), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1753,6 +1755,7 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_fr
static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(private_bpf, PrivateBPF, private_bpf_from_string);
BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
@@ -2279,6 +2282,9 @@ int bus_exec_context_set_transient_property(
if (streq(name, "ProcSubset"))
return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
if (streq(name, "PrivateBPF"))
return bus_set_transient_private_bpf(u, name, &c->private_bpf, message, flags, error);
if (streq(name, "RuntimeDirectoryPreserve"))
return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);

View File

@@ -2270,6 +2270,61 @@ static int setup_private_users_child(int unshare_ready_fd, const char *uid_map,
return 0;
}
static int bpffs_prepare(
PidRef *ret_pid,
int *ret_sock_fd,
int *ret_errno_pipe) {
_cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, bpffs_errno_pipe[2] = EBADF_PAIR;
int r;
assert(ret_sock_fd);
assert(ret_pid);
assert(ret_errno_pipe);
r = pipe2(bpffs_errno_pipe, O_CLOEXEC|O_NONBLOCK);
if (r < 0)
return log_debug_errno(errno, "Failed to create pipe: %m");
r = socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, socket_fds);
if (r < 0)
return log_debug_errno(errno, "Failed to create socket pair: %m");
r = pidref_safe_fork("(sd-bpffs)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, ret_pid);
if (r < 0)
return log_debug_errno(r, "Failed to fork bpffs privileged helper: %m");
if (r == 0) {
_cleanup_close_ int fs_fd = -EBADF;
bpffs_errno_pipe[0] = safe_close(bpffs_errno_pipe[0]);
socket_fds[0] = safe_close(socket_fds[0]);
fs_fd = receive_one_fd(socket_fds[1], /* flags = */ 0);
if (fs_fd < 0) {
log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m");
report_errno_and_exit(bpffs_errno_pipe[1], fs_fd);
}
r = fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0);
if (r < 0) {
log_debug_errno(errno, "Failed to create bpffs superblock: %m");
report_errno_and_exit(bpffs_errno_pipe[1], errno);
}
if (write(socket_fds[1], (uint8_t[1]) {}, 1) < 0) {
log_debug_errno(errno, "Failed to send data to child: %m");
report_errno_and_exit(bpffs_errno_pipe[1], errno);
}
_exit(EXIT_SUCCESS);
}
*ret_sock_fd = TAKE_FD(socket_fds[0]);
*ret_errno_pipe = TAKE_FD(bpffs_errno_pipe[0]);
return 0;
}
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
@@ -3600,9 +3655,10 @@ static int apply_mount_namespace(
ExecRuntime *runtime,
const char *memory_pressure_path,
bool needs_sandboxing,
char **reterr_path,
uid_t exec_directory_uid,
gid_t exec_directory_gid) {
gid_t exec_directory_gid,
int bpffs_socket_fd,
char **reterr_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
@@ -3814,6 +3870,9 @@ static int apply_mount_namespace(
.protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
.protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
.proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
.private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO,
.bpffs_socket_fd = bpffs_socket_fd,
};
r = setup_namespace(&parameters, reterr_path);
@@ -4454,6 +4513,7 @@ static int setup_delegated_namespaces(
const ExecCommand *command,
bool needs_sandboxing,
bool have_cap_sys_admin,
int bpffs_socket_fd,
int *reterr_exit_status) {
int r;
@@ -4574,9 +4634,10 @@ static int setup_delegated_namespaces(
runtime,
memory_pressure_path,
needs_sandboxing,
&error_path,
uid,
gid);
gid,
bpffs_socket_fd,
&error_path);
if (r < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
@@ -4911,7 +4972,9 @@ int exec_invoke(
_cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
int ngids = 0, ngids_after_pam = 0;
int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
_cleanup_close_ int bpffs_socket_fd = -EBADF, bpffs_errno_pipe = -EBADF;
size_t n_storage_fds, n_socket_fds, n_extra_fds;
_cleanup_(pidref_done_sigkill_wait) PidRef bpffs_pidref = PIDREF_NULL;
assert(command);
assert(context);
@@ -5627,6 +5690,26 @@ int exec_invoke(
}
}
if (context->private_bpf != PRIVATE_BPF_NO) {
/* To create a BPF token, the bpffs has to be mounted with the fsopen()/fsmount() API.
* More specifically, fsopen() must be called within the user namespace, then all the
* fsconfig() as privileged user, and finally and fsmount() and move_mount() in
* the user namespace.
* To do this, we split the code into a bpffs_prepare() and mount_bpffs() functions,
* the first runs as privileged user the second as unprivileged one, and they coordinate
* by sending messages and file descriptors via a socket pair.
* The user and mount namespaces need to be unshared in this exact order and before
* the fsopen() call for the fsopen() API to work as unprivileged.
* This is the kernel sample doing this:
* https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/prog_tests/token.c
*/
r = bpffs_prepare(&bpffs_pidref, &bpffs_socket_fd, &bpffs_errno_pipe);
if (r < 0) {
*exit_status = EXIT_BPF;
return log_error_errno(r, "Failed to mount bpffs in bpffs_prepare(): %m");
}
}
if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
@@ -5665,6 +5748,7 @@ int exec_invoke(
command,
needs_sandboxing,
have_cap_sys_admin,
bpffs_socket_fd,
exit_status);
if (r < 0)
return r;
@@ -5724,10 +5808,30 @@ int exec_invoke(
command,
needs_sandboxing,
have_cap_sys_admin,
bpffs_socket_fd,
exit_status);
if (r < 0)
return r;
if (context->private_bpf != PRIVATE_BPF_NO) {
r = pidref_wait_for_terminate_and_check("(sd-bpffs)", &bpffs_pidref, /* flags = */ 0);
if (r < 0) {
*exit_status = EXIT_BPF;
return r;
}
/* If something strange happened with the child, let's consider this fatal, too */
if (r != EXIT_SUCCESS) {
*exit_status = EXIT_BPF;
ssize_t ss = read(bpffs_errno_pipe, &r, sizeof(r));
if (ss == sizeof(r))
return log_debug_errno(r, "bpffs helper exited with error: %m");
if (ss < 0)
return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m");
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe.");
}
pidref_done(&bpffs_pidref);
}
if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
/* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
* ensures the root of the cgroup namespace is the top level service cgroup and not the

View File

@@ -1803,6 +1803,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (r < 0)
return r;
r = serialize_item(f, "exec-context-private-bpf", private_bpf_to_string(c->private_bpf));
if (r < 0)
return r;
r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
if (r < 0)
return r;
@@ -2741,6 +2745,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
c->proc_subset = proc_subset_from_string(val);
if (c->proc_subset < 0)
return -EINVAL;
} else if ((val = startswith(l, "exec-context-private-bpf="))) {
c->private_bpf = private_bpf_from_string(val);
if (c->private_bpf < 0)
return -EINVAL;
} else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) {
c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val);
if (c->runtime_directory_preserve_mode < 0)

View File

@@ -324,6 +324,7 @@ bool exec_needs_mount_namespace(
exec_needs_cgroup_mount(context) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
context->private_bpf != PRIVATE_BPF_NO ||
exec_needs_ipc_namespace(context) ||
exec_needs_pid_namespace(context, params))
return true;
@@ -1124,7 +1125,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sKeyringMode: %s\n"
"%sProtectHostname: %s%s%s\n"
"%sProtectProc: %s\n"
"%sProcSubset: %s\n",
"%sProcSubset: %s\n"
"%sPrivateBPF: %s\n",
prefix, c->umask,
prefix, empty_to_root(c->working_directory),
prefix, empty_to_root(c->root_directory),
@@ -1151,7 +1153,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, exec_keyring_mode_to_string(c->keyring_mode),
prefix, protect_hostname_to_string(c->protect_hostname), c->private_hostname ? ":" : "", strempty(c->private_hostname),
prefix, protect_proc_to_string(c->protect_proc),
prefix, proc_subset_to_string(c->proc_subset));
prefix, proc_subset_to_string(c->proc_subset),
prefix, private_bpf_to_string(c->private_bpf));
if (c->set_login_environment >= 0)
fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0));

View File

@@ -300,6 +300,8 @@ typedef struct ExecContext {
ProtectProc protect_proc; /* hidepid= */
ProcSubset proc_subset; /* subset= */
PrivateBPF private_bpf;
int private_mounts;
int mount_apivfs;
int bind_log_sockets;

View File

@@ -67,6 +67,7 @@
{{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode)
{{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc)
{{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset)
{{type}}.PrivateBPF, config_parse_private_bpf, 0, offsetof({{type}}, exec_context.private_bpf)
{% if HAVE_SECCOMP %}
{{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context)
{{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs)

View File

@@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGrou
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc);
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_bpf, private_bpf, PrivateBPF);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs);

View File

@@ -129,6 +129,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
CONFIG_PARSER_PROTOTYPE(config_parse_private_bpf);
CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);

View File

@@ -79,6 +79,7 @@ typedef enum MountMode {
MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */
MOUNT_MQUEUEFS,
MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
MOUNT_BPFFS, /* Special mount for bpffs, which is mounted with fsmount() and move_mount() */
_MOUNT_MODE_MAX,
_MOUNT_MODE_INVALID = -EINVAL,
} MountMode;
@@ -161,13 +162,17 @@ static const MountEntry protect_kernel_tunables_proc_table[] = {
static const MountEntry protect_kernel_tunables_sys_table[] = {
{ "/sys", MOUNT_READ_ONLY, false },
{ "/sys/fs/bpf", MOUNT_READ_ONLY, true },
{ "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
{ "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true },
{ "/sys/kernel/debug", MOUNT_READ_ONLY, true },
{ "/sys/kernel/tracing", MOUNT_READ_ONLY, true },
};
/* PrivateBPF= option */
static const MountEntry private_bpf_no_table[] = {
{ "/sys/fs/bpf", MOUNT_READ_ONLY, true },
};
/* ProtectKernelModules= option */
static const MountEntry protect_kernel_modules_table[] = {
{ "/usr/lib/modules", MOUNT_INACCESSIBLE, true },
@@ -927,6 +932,36 @@ static int append_protect_system(MountList *ml, ProtectSystem protect_system, bo
}
}
static int append_private_bpf(
MountList *ml,
PrivateBPF private_bpf,
bool protect_kernel_tunables,
bool ignore_protect,
const NamespaceParameters *p) {
assert(ml);
switch (private_bpf) {
case PRIVATE_BPF_NO:
if (protect_kernel_tunables)
return append_static_mounts(ml, private_bpf_no_table, ELEMENTSOF(private_bpf_no_table), ignore_protect);
return 0;
case PRIVATE_BPF_YES: {
MountEntry *me = mount_list_extend(ml);
if (!me)
return log_oom_debug();
*me = (MountEntry) {
.path_const = "/sys/fs/bpf",
.mode = MOUNT_BPFFS,
};
return 0;
}
default:
assert_not_reached();
}
}
static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
int d;
@@ -1697,6 +1732,34 @@ static int mount_overlay(const MountEntry *m) {
return 1;
}
static int mount_bpffs(const MountEntry *m, int socket_fd) {
int r;
assert(m);
assert(socket_fd >= 0);
_cleanup_close_ int fs_fd = fsopen("bpf", FSOPEN_CLOEXEC);
if (fs_fd < 0)
return log_debug_errno(errno, "Failed to fsopen: %m");
r = send_one_fd(socket_fd, fs_fd, /* flags = */ 0);
if (r < 0)
return log_debug_errno(r, "Failed to send bpffs fd to child: %m");
if (read(socket_fd, (uint8_t[1]) {}, 1) < 0)
return log_debug_errno(errno, "Failed to receive data from child: %m");
_cleanup_close_ int mnt_fd = fsmount(fs_fd, /* flags = */ 0, /* mount_attrs = */ 0);
if (mnt_fd < 0)
return log_debug_errno(errno, "Failed to fsmount bpffs: %m");
r = move_mount(mnt_fd, "", AT_FDCWD, mount_entry_path(m), MOVE_MOUNT_F_EMPTY_PATH);
if (r < 0)
return log_debug_errno(errno, "Failed to move bpffs mount to %s: %m", mount_entry_path(m));
return 1;
}
static int follow_symlink(
const char *root_directory,
MountEntry *m) {
@@ -1953,6 +2016,9 @@ static int apply_one_mount(
case MOUNT_OVERLAY:
return mount_overlay(m);
case MOUNT_BPFFS:
return mount_bpffs(m, p->bpffs_socket_fd);
default:
assert_not_reached();
}
@@ -2151,6 +2217,7 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
p->protect_kernel_tunables ||
p->protect_proc != PROTECT_PROC_DEFAULT ||
p->proc_subset != PROC_SUBSET_ALL ||
p->private_bpf != PRIVATE_BPF_NO ||
p->private_pids != PRIVATE_PIDS_NO;
}
@@ -2653,6 +2720,10 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
if (r < 0)
return r;
r = append_private_bpf(&ml, p->private_bpf, p->protect_kernel_tunables, /* ignore_protect = */ false, p);
if (r < 0)
return r;
if (namespace_parameters_mount_apivfs(p)) {
r = append_static_mounts(&ml,
apivfs_table,
@@ -3888,6 +3959,13 @@ static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
static const char* const private_bpf_table[_PRIVATE_BPF_MAX] = {
[PRIVATE_BPF_NO] = "no",
[PRIVATE_BPF_YES] = "yes",
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_bpf, PrivateBPF, PRIVATE_BPF_YES);
static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = {
[PRIVATE_TMP_NO] = "no",
[PRIVATE_TMP_CONNECTED] = "connected",

View File

@@ -51,6 +51,13 @@ typedef enum ProcSubset {
_PROC_SUBSET_INVALID = -EINVAL,
} ProcSubset;
typedef enum PrivateBPF {
PRIVATE_BPF_NO,
PRIVATE_BPF_YES,
_PRIVATE_BPF_MAX,
_PRIVATE_BPF_INVALID = -EINVAL,
} PrivateBPF;
typedef enum PrivateTmp {
PRIVATE_TMP_NO,
PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */
@@ -188,9 +195,12 @@ typedef struct NamespaceParameters {
ProtectSystem protect_system;
ProtectProc protect_proc;
ProcSubset proc_subset;
PrivateBPF private_bpf;
PrivateTmp private_tmp;
PrivateTmp private_var_tmp;
PrivatePIDs private_pids;
int bpffs_socket_fd;
} NamespaceParameters;
int setup_namespace(const NamespaceParameters *p, char **reterr_path);
@@ -223,6 +233,9 @@ ProtectProc protect_proc_from_string(const char *s) _pure_;
const char* proc_subset_to_string(ProcSubset i) _const_;
ProcSubset proc_subset_from_string(const char *s) _pure_;
const char* private_bpf_to_string(PrivateBPF i) _const_;
PrivateBPF private_bpf_from_string(const char *s) _pure_;
const char* private_tmp_to_string(PrivateTmp i) _const_;
PrivateTmp private_tmp_from_string(const char *s) _pure_;

View File

@@ -2425,6 +2425,7 @@ static const BusProperty execute_properties[] = {
{ "MountImagePolicy", bus_append_string },
{ "ExtensionImagePolicy", bus_append_string },
{ "PrivatePIDs", bus_append_string },
{ "PrivateBPF", bus_append_string },
{ "IgnoreSIGPIPE", bus_append_parse_boolean },
{ "TTYVHangup", bus_append_parse_boolean },
{ "TTYReset", bus_append_parse_boolean },

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
set -eux
set -o pipefail
# Check that with ProtectKernelTunables=yes and PrivateBPF=no, the host bpffs is remounted ro
systemd-run --wait \
-p PrivateUsers=yes \
-p PrivateMounts=yes \
-p DelegateNamespaces=mnt \
-p ProtectKernelTunables=yes \
-p PrivateBPF=no \
grep -q '/sys/fs/bpf .* ro,' /proc/mounts
# Check that with PrivateBPF=yes, a new bpffs instance is mounted
systemd-run --wait \
-p PrivateUsers=yes \
-p PrivateMounts=yes \
-p DelegateNamespaces=mnt \
-p PrivateBPF=yes \
grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts