core/bpf: drop old kernels support (#37151)

This commit is contained in:
Yu Watanabe
2025-05-10 03:13:42 +09:00
committed by GitHub
23 changed files with 159 additions and 403 deletions

View File

@@ -252,51 +252,6 @@ int bpf_devices_apply_policy(
return 0;
}
int bpf_devices_supported(void) {
const struct bpf_insn trivial[] = {
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN()
};
_cleanup_(bpf_program_freep) BPFProgram *program = NULL;
static int supported = -1;
int r;
/* Checks whether BPF device controller is supported. For this, we check two things:
*
* a) whether we are privileged
* b) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require
*/
if (supported >= 0)
return supported;
if (geteuid() != 0) {
log_debug("Not enough privileges, BPF device control is not supported.");
return supported = 0;
}
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &program);
if (r < 0) {
log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m");
return supported = 0;
}
r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
if (r < 0) {
log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m");
return supported = 0;
}
r = bpf_program_load_kernel(program, NULL, 0);
if (r < 0) {
log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m");
return supported = 0;
}
return supported = 1;
}
static int allow_list_device_pattern(
BPFProgram *prog,
const char *path,

View File

@@ -15,7 +15,6 @@ int bpf_devices_apply_policy(
const char *cgroup_path,
BPFProgram **prog_installed);
int bpf_devices_supported(void);
int bpf_devices_allow_list_device(BPFProgram *prog, const char *path, const char *node, CGroupDevicePermissions p);
int bpf_devices_allow_list_major(BPFProgram *prog, const char *path, const char *name, char type, CGroupDevicePermissions p);
int bpf_devices_allow_list_static(BPFProgram *prog, const char *path);

View File

@@ -542,7 +542,7 @@ int bpf_firewall_compile(Unit *u) {
bool ip_allow_any = false, ip_deny_any = false;
CGroupContext *cc;
CGroupRuntime *crt;
int r, supported;
int r;
assert(u);
@@ -554,27 +554,12 @@ int bpf_firewall_compile(Unit *u) {
if (!crt)
return -ENOMEM;
supported = bpf_firewall_supported();
if (supported < 0)
return supported;
if (supported == BPF_FIREWALL_UNSUPPORTED)
if (bpf_program_supported() <= 0)
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF firewalling not supported, proceeding without.");
if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
/* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
* units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
* that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
* consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
* all, either. */
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units.");
/* If BPF_F_ALLOW_MULTI flag is supported program name is also supported (both were added to v4.15
* kernel). */
if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
ingress_name = "sd_fw_ingress";
egress_name = "sd_fw_egress";
}
ingress_name = "sd_fw_ingress";
egress_name = "sd_fw_egress";
/* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
* but we reuse the accounting maps. That way the firewall in effect always maps to the actual
@@ -646,7 +631,7 @@ static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set
int bpf_firewall_load_custom(Unit *u) {
CGroupContext *cc;
CGroupRuntime *crt;
int r, supported;
int r;
assert(u);
@@ -660,13 +645,9 @@ int bpf_firewall_load_custom(Unit *u) {
if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
return 0;
supported = bpf_firewall_supported();
if (supported < 0)
return supported;
if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
if (bpf_program_supported() <= 0)
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
"bpf-firewall: BPF firewalling not supported, cannot attach custom BPF programs.");
r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &crt->ip_bpf_custom_ingress);
if (r < 0)
@@ -702,8 +683,7 @@ int bpf_firewall_install(Unit *u) {
_cleanup_free_ char *path = NULL;
CGroupContext *cc;
CGroupRuntime *crt;
int r, supported;
uint32_t flags;
int r;
assert(u);
@@ -718,43 +698,23 @@ int bpf_firewall_install(Unit *u) {
if (!crt->cgroup_realized)
return -EINVAL;
supported = bpf_firewall_supported();
if (supported < 0)
return supported;
if (supported == BPF_FIREWALL_UNSUPPORTED)
if (bpf_program_supported() <= 0)
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF firewalling not supported, proceeding without.");
if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units.");
if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
(!set_isempty(crt->ip_bpf_custom_ingress) || !set_isempty(crt->ip_bpf_custom_egress)))
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &path);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");
flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0;
if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) {
/* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only
* after attaching the new programs, so that there's no time window where neither program is
* attached. (There will be a program where both are attached, but that's OK, since this is a
* security feature where we rather want to lock down too much than too little */
ip_bpf_egress_uninstall = TAKE_PTR(crt->ip_bpf_egress_installed);
ip_bpf_ingress_uninstall = TAKE_PTR(crt->ip_bpf_ingress_installed);
} else {
/* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
* detach them) right before attaching the new program, to minimize the time window when we
* don't account for IP traffic. */
crt->ip_bpf_egress_installed = bpf_program_free(crt->ip_bpf_egress_installed);
crt->ip_bpf_ingress_installed = bpf_program_free(crt->ip_bpf_ingress_installed);
}
/* Let's clear the fields, but destroy the programs only after attaching the new programs, so that
* there's no time window where neither program is attached. (There will be a program where both are
* attached, but that's OK, since this is a security feature where we rather want to lock down too
* much than too little. */
ip_bpf_egress_uninstall = TAKE_PTR(crt->ip_bpf_egress_installed);
ip_bpf_ingress_uninstall = TAKE_PTR(crt->ip_bpf_ingress_installed);
if (crt->ip_bpf_egress) {
r = bpf_program_cgroup_attach(crt->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
r = bpf_program_cgroup_attach(crt->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, BPF_F_ALLOW_MULTI);
if (r < 0)
return log_unit_error_errno(u, r,
"bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);
@@ -764,7 +724,7 @@ int bpf_firewall_install(Unit *u) {
}
if (crt->ip_bpf_ingress) {
r = bpf_program_cgroup_attach(crt->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
r = bpf_program_cgroup_attach(crt->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, BPF_F_ALLOW_MULTI);
if (r < 0)
return log_unit_error_errno(u, r,
"bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);
@@ -830,118 +790,9 @@ int bpf_firewall_reset_accounting(int map_fd) {
return bpf_map_update_element(map_fd, &key, &value);
}
static int bpf_firewall_unsupported_reason = 0;
int bpf_firewall_supported(void) {
const struct bpf_insn trivial[] = {
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN()
};
_cleanup_(bpf_program_freep) BPFProgram *program = NULL;
static int supported = -1;
union bpf_attr attr;
int r;
/* Checks whether BPF firewalling is supported. For this, we check the following things:
*
* - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
* - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
*/
if (supported >= 0)
return supported;
/* prog_name is NULL since it is supported only starting from v4.15 kernel. */
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program);
if (r < 0) {
bpf_firewall_unsupported_reason =
log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
return supported = BPF_FIREWALL_UNSUPPORTED;
}
r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
if (r < 0) {
bpf_firewall_unsupported_reason =
log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
return supported = BPF_FIREWALL_UNSUPPORTED;
}
r = bpf_program_load_kernel(program, NULL, 0);
if (r < 0) {
bpf_firewall_unsupported_reason =
log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
return supported = BPF_FIREWALL_UNSUPPORTED;
}
/* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
* is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
* program if we can't do a thing with it later?
*
* We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
* CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
* parameters are validated however, and that'll fail with EBADF then. */
// FIXME: Clang doesn't 0-pad with structured initialization, causing
// the kernel to reject the bpf_attr as invalid. See:
// https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
// Ideally it should behave like GCC, so that we can remove these workarounds.
zero(attr);
attr.attach_type = BPF_CGROUP_INET_EGRESS;
attr.target_fd = -EBADF;
attr.attach_bpf_fd = -EBADF;
if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
if (errno != EBADF) {
bpf_firewall_unsupported_reason =
log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
return supported = BPF_FIREWALL_UNSUPPORTED;
}
/* YAY! */
} else {
bpf_firewall_unsupported_reason =
log_debug_errno(SYNTHETIC_ERRNO(EBADE),
"bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? "
"Something is weird, assuming BPF firewalling is broken and hence not supported.");
return supported = BPF_FIREWALL_UNSUPPORTED;
}
/* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
* (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
* bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
* get EINVAL if it's not supported, and EBADF as before if it is available.
* Use probe result as the indicator that program name is also supported since they both were
* added in kernel 4.15. */
zero(attr);
attr.attach_type = BPF_CGROUP_INET_EGRESS;
attr.target_fd = -EBADF;
attr.attach_bpf_fd = -EBADF;
attr.attach_flags = BPF_F_ALLOW_MULTI;
if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
if (errno == EBADF) {
log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
}
if (errno == EINVAL)
log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
else
log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
return supported = BPF_FIREWALL_SUPPORTED;
} else {
bpf_firewall_unsupported_reason =
log_debug_errno(SYNTHETIC_ERRNO(EBADE),
"bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? "
"Something is weird, assuming BPF firewalling is broken and hence not supported.");
return supported = BPF_FIREWALL_UNSUPPORTED;
}
}
void emit_bpf_firewall_warning(Unit *u) {
static bool warned = false;
int r;
assert(u);
assert(u->manager);
@@ -949,9 +800,12 @@ void emit_bpf_firewall_warning(Unit *u) {
if (warned || MANAGER_IS_TEST_RUN(u->manager))
return;
bool quiet = ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0;
r = bpf_program_supported();
assert(r < 0);
log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
bool quiet = ERRNO_IS_NEG_PRIVILEGE(r) && detect_container() > 0;
log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, r,
"unit configures an IP firewall, but %s.\n"
"(This warning is only shown for the first unit using IP firewalling.)",
getuid() != 0 ? "not running as root" :

View File

@@ -6,14 +6,6 @@
#include "cgroup.h"
#include "unit.h"
enum {
BPF_FIREWALL_UNSUPPORTED = 0,
BPF_FIREWALL_SUPPORTED = 1,
BPF_FIREWALL_SUPPORTED_WITH_MULTI = 2,
};
int bpf_firewall_supported(void);
int bpf_firewall_compile(Unit *u);
int bpf_firewall_install(Unit *u);
int bpf_firewall_load_custom(Unit *u);

View File

@@ -27,7 +27,6 @@
/* libbpf, clang and llc compile time dependencies are satisfied */
#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "bpf-util.h"
#include "bpf/restrict_fs/restrict-fs-skel.h"
#define CGROUP_HASH_SIZE_MAX 2048
@@ -102,7 +101,7 @@ bool bpf_restrict_fs_supported(bool initialize) {
if (!initialize)
return false;
if (!cgroup_bpf_supported())
if (dlopen_bpf_full(LOG_WARNING) < 0)
return (supported = false);
r = lsm_supported("bpf");

View File

@@ -10,7 +10,6 @@
#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "bpf-util.h"
#include "bpf/restrict_ifaces/restrict-ifaces-skel.h"
static struct restrict_ifaces_bpf *restrict_ifaces_bpf_free(struct restrict_ifaces_bpf *obj) {
@@ -81,14 +80,9 @@ int bpf_restrict_ifaces_supported(void) {
if (supported >= 0)
return supported;
if (!cgroup_bpf_supported())
if (dlopen_bpf_full(LOG_WARNING) < 0)
return (supported = false);
if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SKB, /*opts=*/NULL)) {
log_debug("restrict-interfaces: BPF program type cgroup_skb is not supported");
return (supported = false);
}
r = prepare_restrict_ifaces_bpf(NULL, true, NULL, &obj);
if (r < 0) {
log_debug_errno(r, "restrict-interfaces: Failed to load BPF object: %m");

View File

@@ -12,7 +12,6 @@
/* libbpf, clang, llvm and bpftool compile time dependencies are satisfied */
#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "bpf-util.h"
#include "bpf/socket_bind/socket-bind-api.bpf.h"
#include "bpf/socket_bind/socket-bind-skel.h"
@@ -127,14 +126,9 @@ int bpf_socket_bind_supported(void) {
_cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
int r;
if (!cgroup_bpf_supported())
if (dlopen_bpf_full(LOG_WARNING) < 0)
return false;
if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, /*opts=*/NULL)) {
log_debug("bpf-socket-bind: BPF program type cgroup_sock_addr is not supported");
return false;
}
r = prepare_socket_bind_bpf(/*unit=*/NULL, /*allow_rules=*/NULL, /*deny_rules=*/NULL, &obj);
if (r < 0) {
log_debug_errno(r, "bpf-socket-bind: socket bind filtering is not supported: %m");

View File

@@ -1,24 +0,0 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "bpf-dlopen.h"
#include "bpf-util.h"
#include "cgroup-util.h"
#include "initrd-util.h"
#include "log.h"
bool cgroup_bpf_supported(void) {
static int supported = -1;
int r;
if (supported >= 0)
return supported;
r = dlopen_bpf();
if (r < 0) {
log_full_errno(in_initrd() ? LOG_DEBUG : LOG_INFO,
r, "Failed to open libbpf, cgroup BPF features disabled: %m");
return (supported = false);
}
return (supported = true);
}

View File

@@ -1,5 +0,0 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <stdbool.h>
bool cgroup_bpf_supported(void);

View File

@@ -3259,22 +3259,11 @@ static int cg_bpf_mask_supported(CGroupMask *ret) {
CGroupMask mask = 0;
int r;
/* BPF-based firewall */
r = bpf_firewall_supported();
if (r < 0)
return r;
if (r > 0)
mask |= CGROUP_MASK_BPF_FIREWALL;
/* BPF-based device access control */
r = bpf_devices_supported();
if (r < 0)
return r;
if (r > 0)
mask |= CGROUP_MASK_BPF_DEVICES;
/* BPF pinned prog (always supported by cgroup v2) */
mask |= CGROUP_MASK_BPF_FOREIGN;
/* BPF-based firewall, device access control, and pinned foreign prog */
if (bpf_program_supported() > 0)
mask |= CGROUP_MASK_BPF_FIREWALL |
CGROUP_MASK_BPF_DEVICES |
CGROUP_MASK_BPF_FOREIGN;
/* BPF-based bind{4|6} hooks */
r = bpf_socket_bind_supported();

View File

@@ -635,18 +635,13 @@ static int bus_cgroup_set_transient_property(
unit_write_setting(u, flags, name, buf);
if (*filters) {
r = bpf_firewall_supported();
if (r < 0)
return r;
if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
static bool warned = false;
if (*filters && bpf_program_supported() <= 0) {
static bool warned = false;
log_full(warned ? LOG_DEBUG : LOG_WARNING,
"Transient unit %s configures an IP firewall with BPF, but the local system does not support BPF/cgroup firewalling with multiple filters.\n"
"Starting this unit will fail! (This warning is only shown for the first started transient unit using IP firewalling.)", u->id);
warned = true;
}
log_full(warned ? LOG_DEBUG : LOG_WARNING,
"Transient unit %s configures an IP firewall with BPF, but the local system does not support BPF/cgroup firewalling with multiple filters.\n"
"Starting this unit will fail! (This warning is only shown for the first started transient unit using IP firewalling.)", u->id);
warned = true;
}
}

View File

@@ -5712,15 +5712,12 @@ int config_parse_ip_filter_bpf_progs(
if (r < 0)
return log_oom();
r = bpf_firewall_supported();
if (r < 0)
return r;
if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
if (bpf_program_supported() <= 0) {
static bool warned = false;
log_full(warned ? LOG_DEBUG : LOG_WARNING,
"File %s:%u configures an IP firewall with BPF programs (%s=%s), but the local system does not support BPF/cgroup based firewalling with multiple filters.\n"
"Starting this unit will fail! (This warning is only shown for the first loaded unit using IP firewalling.)", filename, line, lvalue, rvalue);
log_syntax(unit, warned ? LOG_DEBUG : LOG_WARNING, filename, line, 0,
"Configures an IP firewall with BPF programs (%s=%s), but the local system does not support BPF/cgroup based firewalling with multiple filters. "
"Starting this unit will fail! (This warning is only shown for the first loaded unit using IP firewalling.)", lvalue, rvalue);
warned = true;
}

View File

@@ -65,12 +65,6 @@ libcore_sources = files(
'varlink.c',
)
if conf.get('BPF_FRAMEWORK') == 1
libcore_sources += files(
'bpf-util.c',
)
endif
subdir('bpf/socket_bind')
subdir('bpf/restrict_fs')
subdir('bpf/restrict_ifaces')

View File

@@ -1477,9 +1477,7 @@ static int socket_address_listen_do(
log_unit_error_errno(u, error, fmt, strna(_t)); \
})
static int fork_needed(const SocketAddress *address, Socket *s) {
int r;
static bool fork_needed(const SocketAddress *address, Socket *s) {
assert(address);
assert(s);
@@ -1493,13 +1491,9 @@ static int fork_needed(const SocketAddress *address, Socket *s) {
if (nft_set->source == NFT_SET_SOURCE_CGROUP)
return true;
if (IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) {
r = bpf_firewall_supported();
if (r < 0)
return r;
if (r != BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */
return true;
}
if (IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6) &&
bpf_program_supported() > 0) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */
return true;
return exec_needs_network_namespace(&s->exec_context);
}
@@ -1521,10 +1515,7 @@ static int socket_address_listen_in_cgroup(
* the socket is actually properly attached to the unit's cgroup for the purpose of BPF filtering and
* such. */
r = fork_needed(address, s);
if (r < 0)
return r;
if (r == 0) {
if (!fork_needed(address, s)) {
/* Shortcut things... */
fd = socket_address_listen_do(s, address, label);
if (fd < 0)
@@ -3030,10 +3021,7 @@ static int socket_accept_in_cgroup(Socket *s, SocketPort *p, int fd) {
if (!IN_SET(p->address.sockaddr.sa.sa_family, AF_INET, AF_INET6))
goto shortcut;
r = bpf_firewall_supported();
if (r < 0)
return r;
if (r == BPF_FIREWALL_UNSUPPORTED)
if (bpf_program_supported() <= 0)
goto shortcut;
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0)

View File

@@ -26,12 +26,10 @@ struct bpf_map_create_opts;
* - before the compat static inline helpers that use them.
* When removing this file move these back to bpf-dlopen.h */
extern int (*sym_bpf_map_create)(enum bpf_map_type, const char *, __u32, __u32, __u32, const struct bpf_map_create_opts *);
extern int (*sym_libbpf_probe_bpf_prog_type)(enum bpf_prog_type, const void *);
extern struct bpf_map* (*sym_bpf_object__next_map)(const struct bpf_object *obj, const struct bpf_map *map);
/* compat symbols removed in libbpf 1.0 */
extern int (*sym_bpf_create_map)(enum bpf_map_type, int key_size, int value_size, int max_entries, __u32 map_flags);
extern bool (*sym_bpf_probe_prog_type)(enum bpf_prog_type, __u32);
/* helpers to use the available variant behind new API */
static inline int compat_bpf_map_create(enum bpf_map_type map_type,
@@ -47,10 +45,3 @@ static inline int compat_bpf_map_create(enum bpf_map_type map_type,
return sym_bpf_create_map(map_type, key_size, value_size, max_entries,
0 /* opts->map_flags, but opts is always NULL for us so skip build dependency on the type */);
}
static inline int compat_libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) {
if (sym_libbpf_probe_bpf_prog_type)
return sym_libbpf_probe_bpf_prog_type(prog_type, opts);
return sym_bpf_probe_prog_type(prog_type, 0);
}

View File

@@ -51,12 +51,10 @@ DLSYM_PROTOTYPE(ring_buffer__poll) = NULL;
/* new symbols available from libbpf 0.7.0 */
int (*sym_bpf_map_create)(enum bpf_map_type, const char *, __u32, __u32, __u32, const struct bpf_map_create_opts *);
int (*sym_libbpf_probe_bpf_prog_type)(enum bpf_prog_type, const void *);
struct bpf_map* (*sym_bpf_object__next_map)(const struct bpf_object *obj, const struct bpf_map *map);
/* compat symbols removed in libbpf 1.0 */
int (*sym_bpf_create_map)(enum bpf_map_type, int key_size, int value_size, int max_entries, __u32 map_flags);
bool (*sym_bpf_probe_prog_type)(enum bpf_prog_type, __u32);
_printf_(2,0)
static int bpf_print_func(enum libbpf_print_level level, const char *fmt, va_list ap) {
@@ -72,10 +70,14 @@ static int bpf_print_func(enum libbpf_print_level level, const char *fmt, va_lis
return log_internalv(LOG_DEBUG, errno, NULL, 0, NULL, fmt, ap);
}
int dlopen_bpf(void) {
int dlopen_bpf_full(int log_level) {
static int cached = 0;
void *dl;
int r;
if (cached != 0)
return cached;
ELF_NOTE_DLOPEN("bpf",
"Support firewalling and sandboxing with BPF",
ELF_NOTE_DLOPEN_PRIORITY_SUGGESTED,
@@ -91,8 +93,9 @@ int dlopen_bpf(void) {
* list for both files, and when we assume 1.0+ is present we can remove this dlopen */
dl = dlopen("libbpf.so.0", RTLD_NOW|RTLD_NODELETE);
if (!dl)
return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
"neither libbpf.so.1 nor libbpf.so.0 are installed: %s", dlerror());
return cached = log_full_errno(log_level, SYNTHETIC_ERRNO(EOPNOTSUPP),
"Neither libbpf.so.1 nor libbpf.so.0 are installed, cgroup BPF features disabled: %s",
dlerror());
log_debug("Loaded 'libbpf.so.0' via dlopen()");
@@ -101,11 +104,9 @@ int dlopen_bpf(void) {
dl, LOG_DEBUG,
#if MODERN_LIBBPF
/* Don't exist anymore in new libbpf, hence cannot type check them */
DLSYM_ARG_FORCE(bpf_create_map),
DLSYM_ARG_FORCE(bpf_probe_prog_type)
DLSYM_ARG_FORCE(bpf_create_map)
#else
DLSYM_ARG(bpf_create_map),
DLSYM_ARG(bpf_probe_prog_type)
DLSYM_ARG(bpf_create_map)
#endif
);
@@ -118,18 +119,16 @@ int dlopen_bpf(void) {
dl, LOG_DEBUG,
#if MODERN_LIBBPF
DLSYM_ARG(bpf_map_create),
DLSYM_ARG(libbpf_probe_bpf_prog_type),
DLSYM_ARG(bpf_object__next_map)
#else
/* These symbols did not exist in old libbpf, hence we cannot type check them */
DLSYM_ARG_FORCE(bpf_map_create),
DLSYM_ARG_FORCE(libbpf_probe_bpf_prog_type),
DLSYM_ARG_FORCE(bpf_object__next_map)
#endif
);
}
if (r < 0)
return r;
return cached = log_full_errno(log_level, r, "Failed to load libbpf symbols, cgroup BPF features disabled: %m");
r = dlsym_many_or_warn(
dl, LOG_DEBUG,
@@ -171,14 +170,14 @@ int dlopen_bpf(void) {
DLSYM_ARG(ring_buffer__new),
DLSYM_ARG(ring_buffer__poll));
if (r < 0)
return r;
return cached = log_full_errno(log_level, r, "Failed to load libbpf symbols, cgroup BPF features disabled: %m");
/* We set the print helper unconditionally. Otherwise libbpf will emit not useful log messages. */
(void) sym_libbpf_set_print(bpf_print_func);
REENABLE_WARNING;
return r;
return cached = true;
}
int bpf_get_error_translated(const void *ptr) {
@@ -200,8 +199,8 @@ int bpf_get_error_translated(const void *ptr) {
#else
int dlopen_bpf(void) {
return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
"libbpf support is not compiled in.");
int dlopen_bpf_full(int log_level) {
return log_once_errno(log_level, SYNTHETIC_ERRNO(EOPNOTSUPP),
"libbpf support is not compiled in, cgroup BPF features disabled.");
}
#endif

View File

@@ -1,6 +1,9 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include <stdbool.h>
#include <syslog.h>
#if HAVE_LIBBPF
#include <bpf/bpf.h>
@@ -48,4 +51,7 @@ int bpf_get_error_translated(const void *ptr);
#endif
int dlopen_bpf(void);
int dlopen_bpf_full(int log_level);
static inline int dlopen_bpf(void) {
return dlopen_bpf_full(LOG_DEBUG);
}

View File

@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <fcntl.h>
#include <linux/bpf_insn.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
@@ -42,6 +43,52 @@ DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int);
DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_free);
int bpf_program_supported(void) {
static int cached = 0;
if (cached != 0)
return cached;
/* Currently, we only use the following three types:
* - BPF_PROG_TYPE_CGROUP_SKB, supported since kernel v4.10 (0e33661de493db325435d565a4a722120ae4cbf3),
* - BPF_PROG_TYPE_CGROUP_DEVICE, supported since kernel v4.15 (ebc614f687369f9df99828572b1d85a7c2de3d92),
* - BPF_PROG_TYPE_CGROUP_SOCK_ADDR, supported since kernel v4.17 (4fbac77d2d092b475dda9eea66da674369665427).
* Hence, as our baseline on the kernel is v5.4, it is not necessary to check if we can create BPF
* programs of hthese types.
*
* However, unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB (maybe also other types)
* programs even when CONFIG_CGROUP_BPF is turned off at kernel compilation time. This sucks of course:
* why does it allow us to create a cgroup BPF program if we can't do a thing with it later?
*
* We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
* CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
* parameters are validated however, and that'll fail with EBADF then.
*
* The check seems also important when we are running with sanitizers. With sanitizers (at least with
* LLVM v20), the following check and other bpf() calls fails even if the kernel supports BPF. To
* avoid unexpected fail when running with sanitizers, let's explicitly check if bpf() syscall works. */
/* Clang and GCC (>=15) do not 0-pad with structured initialization, causing the kernel to reject the
* bpf_attr as invalid. See: https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
* Hence, we cannot use structured initialization here, and need to clear the structure with zero
* explicitly before use. */
union bpf_attr attr;
zero(attr);
attr.attach_type = BPF_CGROUP_INET_EGRESS; /* since kernel v4.10 (0e33661de493db325435d565a4a722120ae4cbf3) */
attr.target_fd = -EBADF;
attr.attach_bpf_fd = -EBADF;
if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
if (errno == EBADF) /* YAY! */
return cached = true;
return cached = log_debug_errno(errno, "Didn't get EBADF from invalid BPF_PROG_DETACH call: %m");
}
return cached = log_debug_errno(SYNTHETIC_ERRNO(EBADE),
"Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF is broken and hence not supported.");
}
BPFProgram *bpf_program_free(BPFProgram *p) {
if (!p)
return NULL;

View File

@@ -33,6 +33,8 @@ struct BPFProgram {
uint32_t attached_flags;
};
int bpf_program_supported(void);
int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret);
int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret);
BPFProgram *bpf_program_free(BPFProgram *p);

View File

@@ -219,7 +219,10 @@ static int allocate_scope(void) {
return 0;
}
r = sd_bus_default_system(&bus);
if (geteuid() == 0)
r = sd_bus_default_system(&bus);
else
r = sd_bus_default_user(&bus);
if (r < 0)
return log_error_errno(r, "Failed to connect to system bus: %m");

View File

@@ -257,6 +257,11 @@ int main(int argc, char *argv[]) {
test_setup_logging(LOG_DEBUG);
r = bpf_program_supported();
if (r < 0)
return log_tests_skipped_errno(r, "BPF device filter not supported");
ASSERT_TRUE(r);
ASSERT_OK(getrlimit(RLIMIT_MEMLOCK, &rl));
rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE);
(void) setrlimit(RLIMIT_MEMLOCK, &rl);
@@ -274,11 +279,6 @@ int main(int argc, char *argv[]) {
if (r < 0)
return log_tests_skipped_errno(r, "Failed to prepare cgroup subtree");
r = bpf_devices_supported();
if (r == 0)
return log_tests_skipped("BPF device filter not supported");
ASSERT_EQ(r, 1);
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &controller_path);
ASSERT_OK(r);

View File

@@ -31,7 +31,6 @@ int main(int argc, char *argv[]) {
struct rlimit rl;
int r;
union bpf_attr attr;
bool test_custom_filter = false;
const char *test_prog = "/sys/fs/bpf/test-dropper";
test_setup_logging(LOG_DEBUG);
@@ -39,6 +38,11 @@ int main(int argc, char *argv[]) {
if (detect_container() > 0)
return log_tests_skipped("test-bpf-firewall fails inside LXC and Docker containers: https://github.com/systemd/systemd/issues/9666");
r = bpf_program_supported();
if (r < 0)
return log_tests_skipped_errno(r, "BPF firewalling not supported");
ASSERT_TRUE(r);
ASSERT_OK(getrlimit(RLIMIT_MEMLOCK, &rl));
rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE);
(void) setrlimit(RLIMIT_MEMLOCK, &rl);
@@ -65,34 +69,17 @@ int main(int argc, char *argv[]) {
r = bpf_program_add_instructions(p, exit_insn, ELEMENTSOF(exit_insn));
ASSERT_EQ(r, 0);
r = bpf_firewall_supported();
if (r == BPF_FIREWALL_UNSUPPORTED)
return log_tests_skipped("BPF firewalling not supported");
ASSERT_GT(r, 0);
if (r == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
log_notice("BPF firewalling with BPF_F_ALLOW_MULTI supported. Yay!");
test_custom_filter = true;
} else
log_notice("BPF firewalling (though without BPF_F_ALLOW_MULTI) supported. Good.");
r = bpf_program_load_kernel(p, log_buf, ELEMENTSOF(log_buf));
ASSERT_OK(r);
if (test_custom_filter) {
zero(attr);
attr.pathname = PTR_TO_UINT64(test_prog);
attr.bpf_fd = p->kernel_fd;
attr.file_flags = 0;
zero(attr);
attr.pathname = PTR_TO_UINT64(test_prog);
attr.bpf_fd = p->kernel_fd;
attr.file_flags = 0;
(void) unlink(test_prog);
(void) unlink(test_prog);
r = bpf(BPF_OBJ_PIN, &attr, sizeof(attr));
if (r < 0) {
log_warning_errno(errno, "BPF object pinning failed, will not run custom filter test: %m");
test_custom_filter = false;
}
}
ASSERT_OK(bpf(BPF_OBJ_PIN, &attr, sizeof(attr)));
p = bpf_program_free(p);
@@ -192,31 +179,30 @@ int main(int argc, char *argv[]) {
assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next->exec_status.code != CLD_EXITED ||
SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next->exec_status.status != EXIT_SUCCESS);
if (test_custom_filter) {
assert_se(u = unit_new(m, sizeof(Service)));
assert_se(unit_add_name(u, "custom-filter.service") == 0);
assert_se(cc = unit_get_cgroup_context(u));
u->perpetual = true;
/* testing custom filter */
assert_se(u = unit_new(m, sizeof(Service)));
assert_se(unit_add_name(u, "custom-filter.service") == 0);
assert_se(cc = unit_get_cgroup_context(u));
u->perpetual = true;
cc->ip_accounting = true;
cc->ip_accounting = true;
assert_se(config_parse_ip_filter_bpf_progs(u->id, "filename", 1, "Service", 1, "IPIngressFilterPath", 0, test_prog, &cc->ip_filters_ingress, u) == 0);
assert_se(config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart", SERVICE_EXEC_START, "-/bin/ping -c 1 127.0.0.1 -W 5", SERVICE(u)->exec_command, u) == 0);
assert_se(config_parse_ip_filter_bpf_progs(u->id, "filename", 1, "Service", 1, "IPIngressFilterPath", 0, test_prog, &cc->ip_filters_ingress, u) == 0);
assert_se(config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart", SERVICE_EXEC_START, "-/bin/ping -c 1 127.0.0.1 -W 5", SERVICE(u)->exec_command, u) == 0);
SERVICE(u)->type = SERVICE_ONESHOT;
u->load_state = UNIT_LOADED;
SERVICE(u)->type = SERVICE_ONESHOT;
u->load_state = UNIT_LOADED;
ASSERT_OK(unit_start(u, NULL));
ASSERT_OK(unit_start(u, NULL));
while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED))
assert_se(sd_event_run(m->event, UINT64_MAX) >= 0);
while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED))
assert_se(sd_event_run(m->event, UINT64_MAX) >= 0);
assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code != CLD_EXITED ||
SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.status != EXIT_SUCCESS);
assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code != CLD_EXITED ||
SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.status != EXIT_SUCCESS);
(void) unlink(test_prog);
assert_se(SERVICE(u)->state == SERVICE_DEAD);
}
(void) unlink(test_prog);
assert_se(SERVICE(u)->state == SERVICE_DEAD);
return 0;
}

View File

@@ -279,8 +279,9 @@ int main(int argc, char *argv[]) {
if (detect_container() > 0)
return log_tests_skipped("test-bpf fails inside LXC and Docker containers: https://github.com/systemd/systemd/issues/9666");
if (getuid() != 0)
return log_tests_skipped("not running as root");
r = bpf_program_supported();
if (r < 0)
return log_tests_skipped_errno(r, "not running as root");
ASSERT_OK(getrlimit(RLIMIT_MEMLOCK, &rl));
rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE);