Files
systemd/src/shared/bpf-program.c
Yu Watanabe b6278c1937 bpf-program: check if a trivial BPF program can be created and loaded
Re-introduce the check dropped by
ec3c5cfac7,
ad446c8ceb.

For some reason, when running on LXC, bpf_program_load_kernel() failed
even though bpf_program_supported() returned true:
```
Attaching device control BPF program to cgroup /system.slice/test-bpf-devices-875b406d56ac7bc3.scope/186c411f6e991777 failed: Operation not permitted
src/test/test-bpf-devices.c:31: Assertion failed: Expected "r" to succeed, but got error: Operation not permitted
```
2025-05-27 17:24:33 +01:00

582 lines
20 KiB
C

/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <fcntl.h>
#include <linux/bpf.h>
#include <linux/bpf_insn.h>
#include <unistd.h>
#include "alloc-util.h"
#include "bpf-program.h"
#include "errno-util.h"
#include "escape.h"
#include "extract-word.h"
#include "fd-util.h"
#include "fdset.h"
#include "log.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "parse-util.h"
#include "path-util.h"
#include "serialize.h"
#include "set.h"
#include "string-table.h"
#include "string-util.h"
/* Maps cgroup BPF attach types to short string names. The array is indexed by the BPF_CGROUP_*
 * constant; slots that are not listed here are left NULL by the designated initializer. The names
 * are what bpf_program_serialize_attachment() writes out and what
 * bpf_program_deserialize_attachment() parses back. */
static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = {
[BPF_CGROUP_INET_INGRESS] = "ingress",
[BPF_CGROUP_INET_EGRESS] = "egress",
[BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
[BPF_CGROUP_SOCK_OPS] = "sock_ops",
[BPF_CGROUP_DEVICE] = "device",
[BPF_CGROUP_INET4_BIND] = "bind4",
[BPF_CGROUP_INET6_BIND] = "bind6",
[BPF_CGROUP_INET4_CONNECT] = "connect4",
[BPF_CGROUP_INET6_CONNECT] = "connect6",
[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
[BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
[BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
[BPF_CGROUP_SYSCTL] = "sysctl",
[BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
[BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
[BPF_CGROUP_GETSOCKOPT] = "getsockopt",
[BPF_CGROUP_SETSOCKOPT] = "setsockopt",
};
/* Generates the lookup helpers for the table above; this file relies on
 * bpf_cgroup_attach_type_to_string() and bpf_cgroup_attach_type_from_string(). */
DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int);
/* Hash operations for sets whose keys are BPFProgram pointers, as used by
 * bpf_program_deserialize_attachment_set(). Keys are hashed and compared with the trivial_*
 * helpers, and bpf_program_free() is registered as the key destructor.
 * NOTE(review): presumably the destructor runs when an entry is dropped or the set is freed —
 * confirm against the hashmap implementation. */
DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_free);
int bpf_program_supported(void) {
static int cached = 0;
int r;
if (cached != 0)
return cached;
/* Currently, we only use the following three types:
* - BPF_PROG_TYPE_CGROUP_SKB, supported since kernel v4.10 (0e33661de493db325435d565a4a722120ae4cbf3),
* - BPF_PROG_TYPE_CGROUP_DEVICE, supported since kernel v4.15 (ebc614f687369f9df99828572b1d85a7c2de3d92),
* - BPF_PROG_TYPE_CGROUP_SOCK_ADDR, supported since kernel v4.17 (4fbac77d2d092b475dda9eea66da674369665427).
* As our baseline on the kernel is v5.4, it is enough to check if one BPF program can be created and loaded. */
_cleanup_(bpf_program_freep) BPFProgram *program = NULL;
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, /* prog_name = */ NULL, &program);
if (r < 0)
return cached = log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, assuming BPF is not supported: %m");
static const struct bpf_insn trivial[] = {
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN()
};
r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
if (r < 0)
return cached = log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, assuming BPF is not supported: %m");
r = bpf_program_load_kernel(program, /* log_buf = */ NULL, /* log_size = */ 0);
if (r < 0)
return cached = log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, assuming BPF is not supported: %m");
/* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB (maybe also other types)
* programs even when CONFIG_CGROUP_BPF is turned off at kernel compilation time. This sucks of course:
* why does it allow us to create a cgroup BPF program if we can't do a thing with it later?
*
* We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
* CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
* parameters are validated however, and that'll fail with EBADF then.
*
* The check seems also important when we are running with sanitizers. With sanitizers (at least with
* LLVM v20), the following check and other bpf() calls fails even if the kernel supports BPF. To
* avoid unexpected fail when running with sanitizers, let's explicitly check if bpf() syscall works. */
/* Clang and GCC (>=15) do not 0-pad with structured initialization, causing the kernel to reject the
* bpf_attr as invalid. See: https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
* Hence, we cannot use structured initialization here, and need to clear the structure with zero
* explicitly before use. */
union bpf_attr attr;
zero(attr);
attr.attach_type = BPF_CGROUP_INET_EGRESS; /* since kernel v4.10 (0e33661de493db325435d565a4a722120ae4cbf3) */
attr.target_fd = -EBADF;
attr.attach_bpf_fd = -EBADF;
if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
if (errno == EBADF) /* YAY! */
return cached = true;
return cached = log_debug_errno(errno, "Didn't get EBADF from invalid BPF_PROG_DETACH call: %m");
}
return cached = log_debug_errno(SYNTHETIC_ERRNO(EBADE),
"Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF is broken and hence not supported.");
}
/* Releases a BPFProgram and everything it owns. Accepts NULL. Always returns NULL so callers can
 * write "p = bpf_program_free(p)". */
BPFProgram *bpf_program_free(BPFProgram *p) {
        if (p) {
                /* The kernel does not implicitly detach a BPF program from its cgroup when the
                 * last fd referring to it is closed. A program that was attached by a process
                 * which then terminated abnormally therefore stays pinned until the cgroup goes
                 * away — which for the root cgroup (or the cgroup of a service that cannot be
                 * restarted at runtime, such as dbus) means until reboot. To counter this we
                 * track which cgroup the program is attached to and detach it ourselves before
                 * closing the fd. */
                (void) bpf_program_cgroup_detach(p);

                safe_close(p->kernel_fd);

                free(p->attached_path);
                free(p->instructions);
                free(p->prog_name);
                free(p);
        }

        return NULL;
}
/* struct bpf_prog_info info must be initialized since its value is both input and output
* for BPF_OBJ_GET_INFO_BY_FD syscall. */
static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) {
union bpf_attr attr;
/* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when
* structured initialization is used.
* Refer to https://github.com/systemd/systemd/issues/18164
*/
zero(attr);
attr.info.bpf_fd = prog_fd;
attr.info.info_len = info_len;
attr.info.info = PTR_TO_UINT64(info);
return RET_NERRNO(bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)));
}
/* Allocates a new, empty, not-yet-loaded BPFProgram of the given type.
 *
 * prog_name is optional; if given it is copied and must be shorter than BPF_OBJ_NAME_LEN.
 * On success the new object is stored in *ret and 0 is returned. Returns -ENAMETOOLONG if the
 * name does not fit and -ENOMEM on allocation failure. */
int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret) {
        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
        _cleanup_free_ char *name_copy = NULL;

        if (prog_name && strlen(prog_name) >= BPF_OBJ_NAME_LEN)
                return -ENAMETOOLONG;

        if (prog_name) {
                name_copy = strdup(prog_name);
                if (!name_copy)
                        return -ENOMEM;
        }

        prog = new(BPFProgram, 1);
        if (!prog)
                return -ENOMEM;

        /* kernel_fd stays invalid until the program is loaded into the kernel. */
        *prog = (BPFProgram) {
                .kernel_fd = -EBADF,
                .prog_name = TAKE_PTR(name_copy),
                .prog_type = prog_type,
        };

        *ret = TAKE_PTR(prog);
        return 0;
}
/* Creates a BPFProgram object from a program found at the given bpffs path.
 *
 * The program is opened via bpf_program_load_from_bpf_fs() and its type is then read back from
 * the kernel. On success the object is stored in *ret and 0 is returned; otherwise a negative
 * errno-style error is returned and *ret is left untouched. */
int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) {
        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
        struct bpf_prog_info prog_info = {};
        int r;

        assert(path);
        assert(ret);

        prog = new(BPFProgram, 1);
        if (!prog)
                return -ENOMEM;

        /* The real type is only known once the kernel has been queried below. */
        *prog = (BPFProgram) {
                .kernel_fd = -EBADF,
                .prog_type = BPF_PROG_TYPE_UNSPEC,
        };

        r = bpf_program_load_from_bpf_fs(prog, path);
        if (r >= 0)
                r = bpf_program_get_info_by_fd(prog->kernel_fd, &prog_info, sizeof(prog_info));
        if (r < 0)
                return r;

        prog->prog_type = prog_info.type;

        *ret = TAKE_PTR(prog);
        return 0;
}
/* Appends count instructions to the program's in-memory instruction buffer.
 *
 * Returns 0 on success, -EBUSY if the program has already been uploaded to the kernel (it must
 * not be modified afterwards), or -ENOMEM if the buffer could not be grown. */
int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) {
        size_t total;

        assert(p);

        if (p->kernel_fd >= 0)
                return -EBUSY;

        total = p->n_instructions + count;
        if (!GREEDY_REALLOC(p->instructions, total))
                return -ENOMEM;

        /* Copy the new instructions right behind the ones already stored. */
        memcpy(&p->instructions[p->n_instructions], instructions, count * sizeof(struct bpf_insn));
        p->n_instructions = total;

        return 0;
}
/* Uploads the assembled program to the kernel with BPF_PROG_LOAD and stores the resulting fd in
 * p->kernel_fd.
 *
 * log_buf/log_size optionally name a buffer handed to the kernel as its log buffer; logging is
 * requested only if log_buf is non-NULL. If the program was loaded before, nothing is uploaded
 * again: the log buffer is zeroed and 0 is returned. Returns 0 on success, negative errno on
 * failure. */
int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
        union bpf_attr load;

        assert(p);

        /* Idempotent: a program that already has a kernel fd is left alone. */
        if (p->kernel_fd >= 0) {
                memzero(log_buf, log_size);
                return 0;
        }

        // FIXME: Clang doesn't 0-pad with structured initialization, which makes the kernel
        // reject the bpf_attr as invalid. See:
        // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
        // Ideally it would behave like GCC, so that this workaround could be dropped.
        zero(load);
        load.prog_type = p->prog_type;
        load.insn_cnt = p->n_instructions;
        load.insns = PTR_TO_UINT64(p->instructions);
        load.license = PTR_TO_UINT64("GPL");
        load.log_level = log_buf ? 1 : 0;
        load.log_buf = PTR_TO_UINT64(log_buf);
        load.log_size = log_size;

        /* The union was zeroed above, so copying at most BPF_OBJ_NAME_LEN - 1 bytes always
         * leaves the name NUL-terminated. */
        if (p->prog_name)
                strncpy(load.prog_name, p->prog_name, BPF_OBJ_NAME_LEN - 1);

        p->kernel_fd = bpf(BPF_PROG_LOAD, &load, sizeof(load));
        return p->kernel_fd < 0 ? -errno : 0;
}
/* Opens the BPF object at the given bpffs path with BPF_OBJ_GET and stores the resulting fd in
 * p->kernel_fd.
 *
 * Returns 0 on success, -EBUSY if p already refers to a loaded program (which is never
 * overwritten), or negative errno if the syscall fails. */
int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) {
        union bpf_attr get;

        assert(p);

        if (p->kernel_fd >= 0)
                return -EBUSY;

        zero(get);
        get.pathname = PTR_TO_UINT64(path);

        p->kernel_fd = bpf(BPF_OBJ_GET, &get, sizeof(get));
        return p->kernel_fd < 0 ? -errno : 0;
}
/* Attaches the program to the cgroup directory at path for the given attach type, loading it into
 * the kernel first if necessary.
 *
 * flags must be 0, BPF_F_ALLOW_OVERRIDE or BPF_F_ALLOW_MULTI, otherwise -EINVAL is returned.
 * Only a single attachment per program is tracked: asking for a different path, type or flags
 * while the program is attached yields -EBUSY. Returns 0 on success, negative errno-style error
 * otherwise. */
int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
        _cleanup_free_ char *path_copy = NULL;
        _cleanup_close_ int cgroup_fd = -EBADF;
        union bpf_attr attach;
        int r;

        assert(p);
        assert(type >= 0);
        assert(path);

        if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
                return -EINVAL;

        /* The program remembers exactly one attachment, so refuse anything that does not match
         * the one already recorded. */
        if (p->attached_path) {
                bool same_attachment =
                        path_equal(p->attached_path, path) &&
                        p->attached_type == type &&
                        p->attached_flags == flags;

                if (!same_attachment)
                        return -EBUSY;

                /* Shortcut: an identical attachment normally needs no work. The exception is
                 * BPF_F_ALLOW_OVERRIDE, where somebody else might have replaced our program in
                 * the meantime, so reattach to be safe. With flags == 0 nobody else can replace
                 * our program, and with BPF_F_ALLOW_MULTI other programs are installed in
                 * addition to ours, so ours stays in effect either way. */
                if (flags != BPF_F_ALLOW_OVERRIDE)
                        return 0;
        }

        /* Make sure a kernel object exists for this program. */
        r = bpf_program_load_kernel(p, NULL, 0);
        if (r < 0)
                return r;

        path_copy = strdup(path);
        if (!path_copy)
                return -ENOMEM;

        cgroup_fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (cgroup_fd < 0)
                return -errno;

        zero(attach);
        attach.attach_bpf_fd = p->kernel_fd;
        attach.target_fd = cgroup_fd;
        attach.attach_type = type;
        attach.attach_flags = flags;

        if (bpf(BPF_PROG_ATTACH, &attach, sizeof(attach)) < 0)
                return -errno;

        /* Record the attachment only now that the kernel has accepted it. */
        free_and_replace(p->attached_path, path_copy);
        p->attached_type = type;
        p->attached_flags = flags;

        return 0;
}
/* Detaches the program from the cgroup it was recorded as attached to and forgets the attachment.
 *
 * Returns 0 on success, -EUNATCH if no attachment is recorded, or negative errno if opening the
 * cgroup (for a reason other than ENOENT) or the detach syscall fails. */
int bpf_program_cgroup_detach(BPFProgram *p) {
        _cleanup_close_ int cgroup_fd = -EBADF;

        assert(p);

        if (!p->attached_path)
                return -EUNATCH;

        cgroup_fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (cgroup_fd < 0 && errno != ENOENT)
                return -errno;

        /* If the cgroup is gone (ENOENT) there is nothing to detach explicitly: removing the
         * cgroup detached the program implicitly, so that is not an error. */
        if (cgroup_fd >= 0) {
                union bpf_attr detach;

                zero(detach);
                detach.attach_bpf_fd = p->kernel_fd;
                detach.target_fd = cgroup_fd;
                detach.attach_type = p->attached_type;

                if (bpf(BPF_PROG_DETACH, &detach, sizeof(detach)) < 0)
                        return -errno;
        }

        p->attached_path = mfree(p->attached_path);
        return 0;
}
int bpf_map_new(
const char *name,
enum bpf_map_type type,
size_t key_size,
size_t value_size,
size_t max_entries,
uint32_t flags) {
union bpf_attr attr;
const char *n = name;
zero(attr);
attr.map_type = type;
attr.key_size = key_size;
attr.value_size = value_size;
attr.max_entries = max_entries;
attr.map_flags = flags;
/* The map name is primarily informational for debugging purposes, and typically too short
* to carry the full unit name, hence we employ a trivial lossy escaping to make it fit
* (truncation + only alphanumerical, "." and "_" are allowed as per
* https://docs.kernel.org/bpf/maps.html#usage-notes) */
for (size_t i = 0; i < sizeof(attr.map_name) - 1 && *n; i++, n++)
attr.map_name[i] = strchr(ALPHANUMERICAL ".", *n) ? *n : '_';
return RET_NERRNO(bpf(BPF_MAP_CREATE, &attr, sizeof(attr)));
}
int bpf_map_update_element(int fd, const void *key, void *value) {
union bpf_attr attr;
zero(attr);
attr.map_fd = fd;
attr.key = PTR_TO_UINT64(key);
attr.value = PTR_TO_UINT64(value);
return RET_NERRNO(bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)));
}
int bpf_map_lookup_element(int fd, const void *key, void *value) {
union bpf_attr attr;
zero(attr);
attr.map_fd = fd;
attr.key = PTR_TO_UINT64(key);
attr.value = PTR_TO_UINT64(value);
return RET_NERRNO(bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)));
}
int bpf_program_pin(int prog_fd, const char *bpffs_path) {
union bpf_attr attr;
zero(attr);
attr.pathname = PTR_TO_UINT64((void *) bpffs_path);
attr.bpf_fd = prog_fd;
return RET_NERRNO(bpf(BPF_OBJ_PIN, &attr, sizeof(attr)));
}
/* Looks up the kernel-assigned ID of the BPF program behind prog_fd.
 *
 * On success the ID is stored in *ret_id and 0 is returned. On failure the negative errno-style
 * error from the info query is returned and *ret_id is left untouched. */
int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) {
        /* Must be zero-initialized: the kernel reads this structure as input as well. */
        struct bpf_prog_info info = {};
        int r;

        assert(ret_id);

        r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info));
        if (r < 0)
                return r;

        *ret_id = info.id;
        return 0;
}
/* Serializes the cgroup attachment of p under the given key as "<fd> <attach-type> <escaped-path>".
 *
 * A duplicate of the program's kernel fd is added to fds, and the number of that duplicate is
 * what gets written. Does nothing and returns 0 if p is NULL or not attached. Returns 0 on
 * success, negative errno-style error on failure. */
int bpf_program_serialize_attachment(
                FILE *f,
                FDSet *fds,
                const char *key,
                BPFProgram *p) {

        _cleanup_free_ char *path_escaped = NULL;
        int dup_fd, r;

        if (!p)
                return 0;
        if (!p->attached_path)
                return 0;

        assert(p->kernel_fd >= 0);

        path_escaped = cescape(p->attached_path);
        if (!path_escaped)
                return -ENOMEM;

        dup_fd = fdset_put_dup(fds, p->kernel_fd);
        if (dup_fd < 0)
                return log_error_errno(dup_fd, "Failed to add BPF kernel fd to serialize: %m");

        r = serialize_item_format(
                        f,
                        key,
                        "%i %s %s",
                        dup_fd,
                        bpf_cgroup_attach_type_to_string(p->attached_type),
                        path_escaped);
        if (r < 0)
                return r;

        /* From here on the attachment is 'owned' by the serialization rather than by this
         * object, so forget about it. This matters because BPF programs are not detached from
         * their cgroup implicitly on close(); it has to be done explicitly, and
         * bpf_program_free() does exactly that whenever attached_path is non-NULL. Since the
         * attachment is supposed to survive until it is deserialized again, clear attached_path
         * so that freeing this object leaves the attachment in place. */
        p->attached_path = mfree(p->attached_path);
        return 0;
}
/* Serializes the attachment of every program in set under the same key, stopping at the first
 * failure. Returns 0 if all entries were serialized, or the first negative error encountered. */
int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) {
        BPFProgram *prog;

        SET_FOREACH(prog, set) {
                int r = bpf_program_serialize_attachment(f, fds, key, prog);

                if (r < 0)
                        return r;
        }

        return 0;
}
/* Parses one serialized attachment record of the form "<fd> <attach-type> <escaped-path>" — the
 * format written by bpf_program_serialize_attachment() — and rebuilds a BPFProgram from it.
 *
 * v:    the serialized value
 * fds:  fd set from which the program's kernel fd is removed; the new object takes ownership
 * bpfp: on success, a program previously stored here is freed and replaced by the new one
 *
 * The rebuilt program has prog_type BPF_PROG_TYPE_UNSPEC. Its attach flags are not part of the
 * serialized record and are therefore left zero-initialized.
 *
 * Returns 0 on success, or a negative errno-style error (-EINVAL for a truncated record, or the
 * error of the failing parse/lookup step). *bpfp is left untouched on failure. */
int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) {
        _cleanup_free_ char *sfd = NULL, *sat = NULL, *unescaped = NULL;
        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        _cleanup_close_ int fd = -EBADF;
        ssize_t l;
        int ifd, at, r;

        assert(v);
        assert(bpfp);

        /* Extract first word: the fd number */
        r = extract_first_word(&v, &sfd, NULL, 0);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        /* Propagate the parse error itself. 'r' is positive at this point (the word was
         * extracted successfully), so returning it would report success to the caller. */
        ifd = parse_fd(sfd);
        if (ifd < 0)
                return ifd;

        /* Extract second word: the attach type */
        r = extract_first_word(&v, &sat, NULL, 0);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        at = bpf_cgroup_attach_type_from_string(sat);
        if (at < 0)
                return at;

        /* The rest is the path */
        if (isempty(v))
                return -EINVAL;

        l = cunescape(v, 0, &unescaped);
        if (l < 0)
                return l;

        /* Take the fd out of the set; from here on it is owned by this function (and is closed
         * automatically on the error path below). */
        fd = fdset_remove(fds, ifd);
        if (fd < 0)
                return fd;

        p = new(BPFProgram, 1);
        if (!p)
                return -ENOMEM;

        *p = (BPFProgram) {
                .kernel_fd = TAKE_FD(fd),
                .prog_type = BPF_PROG_TYPE_UNSPEC,
                .attached_path = TAKE_PTR(unescaped),
                .attached_type = at,
        };

        /* Replace whatever program the caller had stored before. */
        if (*bpfp)
                bpf_program_free(*bpfp);

        *bpfp = TAKE_PTR(p);
        return 0;
}
/* Deserializes one attachment record and adds the resulting program to *bpfsetp via
 * set_ensure_consume(), using bpf_program_hash_ops. Returns 0 on success, negative errno-style
 * error if parsing or insertion fails. */
int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) {
        BPFProgram *prog = NULL;
        int r;

        assert(v);
        assert(bpfsetp);

        r = bpf_program_deserialize_attachment(v, fds, &prog);
        if (r < 0)
                return r;

        /* The set call takes over 'prog' from here, so it is not freed in this function. */
        r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, prog);
        return r < 0 ? r : 0;
}