diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 6058553aac..82a6a12a98 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2270,20 +2270,51 @@ static int setup_private_users_child(int unshare_ready_fd, const char *uid_map, return 0; } +static int bpffs_helper(const ExecContext *c, int socket_fd) { + assert(c); + assert(socket_fd >= 0); + + _cleanup_close_ int fs_fd = receive_one_fd(socket_fd, /* flags = */ 0); + if (fs_fd < 0) + return log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m"); + + char number[STRLEN("0x") + sizeof(c->bpf_delegate_commands) * 2 + 1]; + xsprintf(number, "0x%"PRIx64, c->bpf_delegate_commands); + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_cmds", number, /* aux = */ 0) < 0) + return log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m"); + + xsprintf(number, "0x%"PRIx64, c->bpf_delegate_maps); + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_maps", number, /* aux = */ 0) < 0) + return log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m"); + + xsprintf(number, "0x%"PRIx64, c->bpf_delegate_programs); + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_progs", number, /* aux = */ 0) < 0) + return log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m"); + + xsprintf(number, "0x%"PRIx64, c->bpf_delegate_attachments); + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_attachs", number, /* aux = */ 0) < 0) + return log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m"); + + if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0) < 0) + return log_debug_errno(errno, "Failed to create bpffs superblock: %m"); + + return 0; +} + static int bpffs_prepare( const ExecContext *c, PidRef *ret_pid, int *ret_sock_fd, int *ret_errno_pipe) { - _cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, bpffs_errno_pipe[2] = EBADF_PAIR; + _cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, errno_pipe[2] = EBADF_PAIR; int r; 
assert(ret_sock_fd); assert(ret_pid); assert(ret_errno_pipe); - r = pipe2(bpffs_errno_pipe, O_CLOEXEC|O_NONBLOCK); + r = pipe2(errno_pipe, O_CLOEXEC|O_NONBLOCK); if (r < 0) return log_debug_errno(errno, "Failed to create pipe: %m"); @@ -2295,67 +2326,13 @@ static int bpffs_prepare( if (r < 0) return log_debug_errno(r, "Failed to fork bpffs privileged helper: %m"); if (r == 0) { - _cleanup_close_ int fs_fd = -EBADF; - char number[STRLEN("0x") + sizeof(c->bpf_delegate_commands) * 2 + 1]; - - bpffs_errno_pipe[0] = safe_close(bpffs_errno_pipe[0]); + errno_pipe[0] = safe_close(errno_pipe[0]); socket_fds[0] = safe_close(socket_fds[0]); - - fs_fd = receive_one_fd(socket_fds[1], /* flags = */ 0); - if (fs_fd < 0) { - log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m"); - report_errno_and_exit(bpffs_errno_pipe[1], fs_fd); - } - - xsprintf(number, "0x%"PRIx64, c->bpf_delegate_commands); - - r = fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_cmds", number, /* aux = */ 0); - if (r < 0) { - log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m"); - report_errno_and_exit(bpffs_errno_pipe[1], errno); - } - - xsprintf(number, "0x%"PRIx64, c->bpf_delegate_maps); - - r = fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_maps", number, /* aux = */ 0); - if (r < 0) { - log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m"); - report_errno_and_exit(bpffs_errno_pipe[1], errno); - } - - xsprintf(number, "0x%"PRIx64, c->bpf_delegate_programs); - - r = fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_progs", number, /* aux = */ 0); - if (r < 0) { - log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m"); - report_errno_and_exit(bpffs_errno_pipe[1], errno); - } - - xsprintf(number, "0x%"PRIx64, c->bpf_delegate_attachments); - - r = fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_attachs", number, /* aux = */ 0); - if (r < 0) { - log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m"); - report_errno_and_exit(bpffs_errno_pipe[1], errno); - } - - r = 
fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0); - if (r < 0) { - log_debug_errno(errno, "Failed to create bpffs superblock: %m"); - report_errno_and_exit(bpffs_errno_pipe[1], errno); - } - - if (write(socket_fds[1], (uint8_t[1]) {}, 1) < 0) { - log_debug_errno(errno, "Failed to send data to child: %m"); - report_errno_and_exit(bpffs_errno_pipe[1], errno); - } - - _exit(EXIT_SUCCESS); + report_errno_and_exit(errno_pipe[1], bpffs_helper(c, socket_fds[1])); } *ret_sock_fd = TAKE_FD(socket_fds[0]); - *ret_errno_pipe = TAKE_FD(bpffs_errno_pipe[0]); - + *ret_errno_pipe = TAKE_FD(errno_pipe[0]); return 0; } @@ -3691,7 +3668,9 @@ static int apply_mount_namespace( bool needs_sandboxing, uid_t exec_directory_uid, gid_t exec_directory_gid, + PidRef *bpffs_pidref, int bpffs_socket_fd, + int bpffs_errno_pipe, char **reterr_path) { _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; @@ -3906,7 +3885,9 @@ static int apply_mount_namespace( .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL, .private_bpf = needs_sandboxing ? 
context->private_bpf : PRIVATE_BPF_NO, + .bpffs_pidref = bpffs_pidref, .bpffs_socket_fd = bpffs_socket_fd, + .bpffs_errno_pipe = bpffs_errno_pipe, }; r = setup_namespace(&parameters, reterr_path); @@ -4547,7 +4528,9 @@ static int setup_delegated_namespaces( const ExecCommand *command, bool needs_sandboxing, bool have_cap_sys_admin, + PidRef *bpffs_pidref, int bpffs_socket_fd, + int bpffs_errno_pipe, int *reterr_exit_status) { int r; @@ -4670,7 +4653,9 @@ static int setup_delegated_namespaces( needs_sandboxing, uid, gid, + bpffs_pidref, bpffs_socket_fd, + bpffs_errno_pipe, &error_path); if (r < 0) { *reterr_exit_status = EXIT_NAMESPACE; @@ -5782,7 +5767,9 @@ int exec_invoke( command, needs_sandboxing, have_cap_sys_admin, + &bpffs_pidref, bpffs_socket_fd, + bpffs_errno_pipe, exit_status); if (r < 0) return r; @@ -5842,29 +5829,15 @@ int exec_invoke( command, needs_sandboxing, have_cap_sys_admin, + &bpffs_pidref, bpffs_socket_fd, + bpffs_errno_pipe, exit_status); if (r < 0) return r; - if (context->private_bpf != PRIVATE_BPF_NO) { - r = pidref_wait_for_terminate_and_check("(sd-bpffs)", &bpffs_pidref, /* flags = */ 0); - if (r < 0) { - *exit_status = EXIT_BPF; - return r; - } - /* If something strange happened with the child, let's consider this fatal, too */ - if (r != EXIT_SUCCESS) { - *exit_status = EXIT_BPF; - ssize_t ss = read(bpffs_errno_pipe, &r, sizeof(r)); - if (ss == sizeof(r)) - return log_debug_errno(r, "bpffs helper exited with error: %m"); - if (ss < 0) - return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m"); - return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe."); - } - pidref_done(&bpffs_pidref); - } + /* Kill the helper process if it is no longer needed, e.g. when the bpffs mount point is hidden.
*/ + pidref_done_sigkill_wait(&bpffs_pidref); if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) { /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which diff --git a/src/core/namespace.c b/src/core/namespace.c index c384d67898..644614a184 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -957,6 +957,7 @@ static int append_private_bpf( *me = (MountEntry) { .path_const = "/sys/fs/bpf", .mode = MOUNT_BPFFS, + .ignore = !protect_kernel_tunables, /* indicate whether we should fall back to MOUNT_READ_ONLY on failure. */ }; return 0; } @@ -1735,11 +1736,13 @@ static int mount_overlay(const MountEntry *m) { return 1; } -static int mount_bpffs(const MountEntry *m, int socket_fd) { +static int mount_bpffs(const MountEntry *m, PidRef *pidref, int socket_fd, int errno_pipe) { int r; assert(m); + assert(pidref_is_set(pidref)); assert(socket_fd >= 0); + assert(errno_pipe >= 0); _cleanup_close_ int fs_fd = fsopen("bpf", FSOPEN_CLOEXEC); if (fs_fd < 0) @@ -1749,8 +1752,21 @@ static int mount_bpffs(const MountEntry *m, int socket_fd) { if (r < 0) return log_debug_errno(r, "Failed to send bpffs fd to child: %m"); - if (read(socket_fd, (uint8_t[1]) {}, 1) < 0) - return log_debug_errno(errno, "Failed to receive data from child: %m"); + r = pidref_wait_for_terminate_and_check("(sd-bpffs)", pidref, /* flags = */ 0); + if (r < 0) + return r; + + /* If something strange happened with the child, let's consider this fatal, too */ + if (r != EXIT_SUCCESS) { + ssize_t ss = read(errno_pipe, &r, sizeof(r)); + if (ss < 0) + return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m"); + if (ss != sizeof(r)) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe."); + return log_debug_errno(r, "bpffs helper exited with error: %m"); + } + + pidref_done(pidref); _cleanup_close_ int mnt_fd = fsmount(fs_fd, /* flags = */ 0, /* mount_attrs = */ 0); if 
(mnt_fd < 0) @@ -1817,6 +1833,23 @@ static int apply_one_mount( log_debug("Applying namespace mount on %s", mount_entry_path(m)); + if (m->mode == MOUNT_BPFFS) { + r = mount_bpffs(m, p->bpffs_pidref, p->bpffs_socket_fd, p->bpffs_errno_pipe); + if (r >= 0 || + (!ERRNO_IS_NEG_NOT_SUPPORTED(r) && /* old kernel? */ + !ERRNO_IS_NEG_PRIVILEGE(r))) /* ubuntu kernel bug? See issue #38225 */ + return r; + + if (m->ignore) { + log_debug_errno(r, "Failed to mount new bpffs instance, ignoring: %m"); + return 0; + } + + log_debug_errno(r, "Failed to mount new bpffs instance, fallback to making %s read-only, ignoring: %m", mount_entry_path(m)); + m->mode = MOUNT_READ_ONLY; + m->ignore = true; + } + switch (m->mode) { case MOUNT_INACCESSIBLE: { @@ -2019,9 +2052,6 @@ static int apply_one_mount( case MOUNT_OVERLAY: return mount_overlay(m); - case MOUNT_BPFFS: - return mount_bpffs(m, p->bpffs_socket_fd); - default: assert_not_reached(); } diff --git a/src/core/namespace.h b/src/core/namespace.h index 42e146a7e8..4e9d87e266 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -200,7 +200,9 @@ typedef struct NamespaceParameters { PrivateTmp private_var_tmp; PrivatePIDs private_pids; + PidRef *bpffs_pidref; int bpffs_socket_fd; + int bpffs_errno_pipe; } NamespaceParameters; int setup_namespace(const NamespaceParameters *p, char **reterr_path); diff --git a/src/test/test-bpf-token.c b/src/test/test-bpf-token.c index 23dd143082..3d6127182e 100644 --- a/src/test/test-bpf-token.c +++ b/src/test/test-bpf-token.c @@ -4,25 +4,22 @@ #include #include "fd-util.h" -#include "main-func.h" #include "tests.h" -static int run(int argc, char *argv[]) { +static int intro(void) { #if __LIBBPF_CURRENT_VERSION_GEQ(1, 5) - _cleanup_close_ int bpffs_fd = -EBADF, token_fd = -EBADF; - - bpffs_fd = open("/sys/fs/bpf", O_RDONLY); + _cleanup_close_ int bpffs_fd = open("/sys/fs/bpf", O_RDONLY); if (bpffs_fd < 0) - return -errno; + return log_error_errno(errno, "Failed to open '/sys/fs/bpf': %m"); - 
token_fd = bpf_token_create(bpffs_fd, /* opts = */ NULL); + _cleanup_close_ int token_fd = bpf_token_create(bpffs_fd, /* opts = */ NULL); if (token_fd < 0) - return -errno; + return log_error_errno(errno, "Failed to create bpf token: %m"); - return 0; + return EXIT_SUCCESS; #else - exit(77); + return log_tests_skipped("libbpf is older than v1.5"); #endif } -DEFINE_MAIN_FUNCTION(run); +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/test/units/TEST-07-PID1.private-bpf.sh b/test/units/TEST-07-PID1.private-bpf.sh index f405b08d07..d9c0218055 100755 --- a/test/units/TEST-07-PID1.private-bpf.sh +++ b/test/units/TEST-07-PID1.private-bpf.sh @@ -13,12 +13,26 @@ systemd-run --wait \ grep -q '/sys/fs/bpf .* ro,' /proc/mounts # Check that with PrivateBPF=yes, a new bpffs instance is mounted -systemd-run --wait \ +if ! systemd-run --wait \ -p PrivateUsers=yes \ -p PrivateMounts=yes \ -p DelegateNamespaces=mnt \ -p PrivateBPF=yes \ - grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts + grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts; then + + # If it does not work, maybe the kernel is old or the system has a buggy Ubuntu kernel. + # Let's check if PrivateBPF=yes is ignored gracefully in that case. + systemd-run --wait \ + -p PrivateUsers=yes \ + -p PrivateMounts=yes \ + -p DelegateNamespaces=mnt \ + -p ProtectKernelTunables=yes \ + -p PrivateBPF=yes \ + grep -q '/sys/fs/bpf .* ro,' /proc/mounts + + # Skip all remaining tests. + exit 0 +fi # Check that when specifying the delegate arguments, the mount options are set properly check_mount_opts() { @@ -63,9 +77,9 @@ systemd-run --wait \ /usr/lib/systemd/tests/unit-tests/manual/test-bpf-token # Check that without the delegates, the helper aborts trying to get a token -! systemd-run --wait \ +(! systemd-run --wait \ -p PrivateUsers=yes \ -p PrivateMounts=yes \ -p DelegateNamespaces=mnt \ -p PrivateBPF=yes \ - /usr/lib/systemd/tests/unit-tests/manual/test-bpf-token + /usr/lib/systemd/tests/unit-tests/manual/test-bpf-token)