Files
systemd/src/shared/bpf-program.c
Yu Watanabe 543a48b653 libc-wrapper: introduce a tiny libc wrapper
Then, move the syscall definitions to the wrapper, and move their prototypes
to the relevant headers.

This also adds checks for add_key() and request_key(), as glibc may one day
add some of them separately.

The check for fspick in meson.build is dropped, as it is currently
unused in our code.

This also moves
- basic/missing_bpf.h -> include/override/linux/bpf.h,
- basic/missing_keyctl.h -> include/override/linux/keyctl.h.
2025-07-11 13:05:46 +09:00

582 lines
20 KiB
C

/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <fcntl.h>
#include <linux/bpf.h>
#include <linux/bpf_insn.h>
#include <sys/bpf.h>
#include <unistd.h>
#include "alloc-util.h"
#include "bpf-program.h"
#include "errno-util.h"
#include "escape.h"
#include "extract-word.h"
#include "fd-util.h"
#include "fdset.h"
#include "log.h"
#include "memory-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "serialize.h"
#include "set.h"
#include "string-table.h"
#include "string-util.h"
/* Short string names for the cgroup BPF attach types, indexed by the attach type constant. Slots that have
 * no designated initializer below are implicitly NULL. */
static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = {
[BPF_CGROUP_INET_INGRESS] = "ingress",
[BPF_CGROUP_INET_EGRESS] = "egress",
[BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
[BPF_CGROUP_SOCK_OPS] = "sock_ops",
[BPF_CGROUP_DEVICE] = "device",
[BPF_CGROUP_INET4_BIND] = "bind4",
[BPF_CGROUP_INET6_BIND] = "bind6",
[BPF_CGROUP_INET4_CONNECT] = "connect4",
[BPF_CGROUP_INET6_CONNECT] = "connect6",
[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
[BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
[BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
[BPF_CGROUP_SYSCTL] = "sysctl",
[BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
[BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
[BPF_CGROUP_GETSOCKOPT] = "getsockopt",
[BPF_CGROUP_SETSOCKOPT] = "setsockopt",
};
/* NOTE(review): presumably expands to bpf_cgroup_attach_type_to_string() and
 * bpf_cgroup_attach_type_from_string(), which the attachment (de)serialization code in this file calls —
 * confirm against string-table.h. */
DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int);
/* Defines bpf_program_hash_ops, the hash ops used for the BPFProgram set filled during deserialization;
 * bpf_program_free is named as the key destructor. */
DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_free);
int bpf_program_supported(void) {
static int cached = 0;
int r;
if (cached != 0)
return cached;
/* Currently, we only use the following three types:
* - BPF_PROG_TYPE_CGROUP_SKB, supported since kernel v4.10 (0e33661de493db325435d565a4a722120ae4cbf3),
* - BPF_PROG_TYPE_CGROUP_DEVICE, supported since kernel v4.15 (ebc614f687369f9df99828572b1d85a7c2de3d92),
* - BPF_PROG_TYPE_CGROUP_SOCK_ADDR, supported since kernel v4.17 (4fbac77d2d092b475dda9eea66da674369665427).
* As our baseline on the kernel is v5.4, it is enough to check if one BPF program can be created and loaded. */
_cleanup_(bpf_program_freep) BPFProgram *program = NULL;
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, /* prog_name = */ NULL, &program);
if (r < 0)
return cached = log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, assuming BPF is not supported: %m");
static const struct bpf_insn trivial[] = {
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN()
};
r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
if (r < 0)
return cached = log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, assuming BPF is not supported: %m");
r = bpf_program_load_kernel(program, /* log_buf = */ NULL, /* log_size = */ 0);
if (r < 0)
return cached = log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, assuming BPF is not supported: %m");
/* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB (maybe also other types)
* programs even when CONFIG_CGROUP_BPF is turned off at kernel compilation time. This sucks of course:
* why does it allow us to create a cgroup BPF program if we can't do a thing with it later?
*
* We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
* CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
* parameters are validated however, and that'll fail with EBADF then.
*
* The check seems also important when we are running with sanitizers. With sanitizers (at least with
* LLVM v20), the following check and other bpf() calls fails even if the kernel supports BPF. To
* avoid unexpected fail when running with sanitizers, let's explicitly check if bpf() syscall works. */
/* Clang and GCC (>=15) do not 0-pad with structured initialization, causing the kernel to reject the
* bpf_attr as invalid. See: https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
* Hence, we cannot use structured initialization here, and need to clear the structure with zero
* explicitly before use. */
union bpf_attr attr;
zero(attr);
attr.attach_type = BPF_CGROUP_INET_EGRESS; /* since kernel v4.10 (0e33661de493db325435d565a4a722120ae4cbf3) */
attr.target_fd = -EBADF;
attr.attach_bpf_fd = -EBADF;
if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
if (errno == EBADF) /* YAY! */
return cached = true;
return cached = log_debug_errno(errno, "Didn't get EBADF from invalid BPF_PROG_DETACH call: %m");
}
return cached = log_debug_errno(SYNTHETIC_ERRNO(EBADE),
"Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF is broken and hence not supported.");
}
/* Releases a BPFProgram and everything it owns. Accepts NULL. Always returns NULL. */
BPFProgram *bpf_program_free(BPFProgram *p) {
        if (p) {
                /* The kernel does not detach a BPF program from its cgroup when the last fd referring to the
                 * program is closed. A program that was attached and then dropped without detaching stays
                 * pinned until the cgroup itself goes away, which for the root cgroup (or the cgroup of a
                 * service that cannot be restarted, such as dbus) means until reboot. Hence we remember
                 * where the program was attached and detach it ourselves before closing the fd. */
                (void) bpf_program_cgroup_detach(p);

                safe_close(p->kernel_fd);
                free(p->attached_path);
                free(p->instructions);
                free(p->prog_name);
                free(p);
        }

        return NULL;
}
/* struct bpf_prog_info info must be initialized since its value is both input and output
* for BPF_OBJ_GET_INFO_BY_FD syscall. */
static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) {
union bpf_attr attr;
/* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when
* structured initialization is used.
* Refer to https://github.com/systemd/systemd/issues/18164
*/
zero(attr);
attr.info.bpf_fd = prog_fd;
attr.info.info_len = info_len;
attr.info.info = PTR_TO_UINT64(info);
return RET_NERRNO(bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)));
}
/* Allocates a new, empty BPFProgram of the given type that has not been loaded into the kernel yet
 * (kernel_fd is initialized to -EBADF).
 *
 * prog_type  program type stored in the new object
 * prog_name  optional name; copied if non-NULL, and must be shorter than BPF_OBJ_NAME_LEN
 * ret        receives the new object on success; must not be NULL
 *
 * Returns 0 on success, -ENAMETOOLONG if prog_name is too long, -ENOMEM on allocation failure. */
int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret) {
        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        _cleanup_free_ char *name = NULL;

        /* Catch a missing output pointer up front, as bpf_program_new_from_bpffs_path() does, instead of
         * crashing on the final store. */
        assert(ret);

        if (prog_name) {
                if (strlen(prog_name) >= BPF_OBJ_NAME_LEN)
                        return -ENAMETOOLONG;

                name = strdup(prog_name);
                if (!name)
                        return -ENOMEM;
        }

        p = new(BPFProgram, 1);
        if (!p)
                return -ENOMEM;

        /* Fields not listed here are zero-initialized by the compound literal. */
        *p = (BPFProgram) {
                .prog_type = prog_type,
                .kernel_fd = -EBADF,
                .prog_name = TAKE_PTR(name),
        };

        *ret = TAKE_PTR(p);

        return 0;
}
/* Creates a BPFProgram object for a BPF object referenced by a path.
 *
 * The fd is obtained through bpf_program_load_from_bpf_fs(); the program type is then read back from the
 * kernel with BPF_OBJ_GET_INFO_BY_FD. On success the new object is stored in *ret and 0 is returned; on
 * failure a negative errno-style value is returned and *ret is left untouched. */
int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) {
        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
        struct bpf_prog_info prog_info = {};
        int r;

        assert(path);
        assert(ret);

        prog = new(BPFProgram, 1);
        if (!prog)
                return -ENOMEM;

        /* The real type is not known until the kernel has been queried below. */
        *prog = (BPFProgram) {
                .kernel_fd = -EBADF,
                .prog_type = BPF_PROG_TYPE_UNSPEC,
        };

        r = bpf_program_load_from_bpf_fs(prog, path);
        if (r < 0)
                return r;

        r = bpf_program_get_info_by_fd(prog->kernel_fd, &prog_info, sizeof(prog_info));
        if (r < 0)
                return r;

        prog->prog_type = prog_info.type;

        *ret = TAKE_PTR(prog);
        return 0;
}
/* Appends count instructions to the not-yet-loaded program p.
 *
 * Returns 0 on success, -EBUSY if p already has a kernel fd, -ENOMEM if the instruction array cannot be
 * grown. */
int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) {
        size_t total;

        assert(p);

        /* Once the program has been uploaded to the kernel it must not be modified anymore. */
        if (p->kernel_fd >= 0)
                return -EBUSY;

        total = p->n_instructions + count;
        if (!GREEDY_REALLOC(p->instructions, total))
                return -ENOMEM;

        memcpy(&p->instructions[p->n_instructions], instructions, count * sizeof(struct bpf_insn));
        p->n_instructions = total;

        return 0;
}
/* Uploads the instructions collected in p to the kernel with BPF_PROG_LOAD and stores the resulting fd in
 * p->kernel_fd.
 *
 * If log_buf is non-NULL it is passed to the kernel as a log buffer of log_size bytes and log level 1 is
 * requested. Calling this for a program that already has a kernel fd uploads nothing and only zeroes the
 * log buffer. Returns 0 on success, -errno if the bpf() call fails. */
int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
        union bpf_attr load_attr;

        assert(p);

        if (p->kernel_fd >= 0) {
                /* Already loaded; keep the call idempotent and hand back an empty log. */
                memzero(log_buf, log_size);
                return 0;
        }

        /* FIXME: Clang doesn't 0-pad with structured initialization, causing the kernel to reject the
         * bpf_attr as invalid. See:
         * https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
         * Ideally it should behave like GCC, so that we can remove these workarounds. */
        zero(load_attr);
        load_attr.license = PTR_TO_UINT64("GPL");
        load_attr.prog_type = p->prog_type;
        load_attr.insn_cnt = p->n_instructions;
        load_attr.insns = PTR_TO_UINT64(p->instructions);
        load_attr.log_size = log_size;
        load_attr.log_buf = PTR_TO_UINT64(log_buf);
        load_attr.log_level = log_buf ? 1 : 0;

        /* The union was zeroed above, so copying at most BPF_OBJ_NAME_LEN - 1 bytes keeps the name
         * NUL-terminated (assuming prog_name holds BPF_OBJ_NAME_LEN bytes). */
        if (p->prog_name)
                strncpy(load_attr.prog_name, p->prog_name, BPF_OBJ_NAME_LEN - 1);

        p->kernel_fd = bpf(BPF_PROG_LOAD, &load_attr, sizeof(load_attr));
        return p->kernel_fd < 0 ? -errno : 0;
}
/* Obtains an fd for the BPF object at path via BPF_OBJ_GET and stores it in p->kernel_fd.
 *
 * Returns 0 on success, -EBUSY if p already has a kernel fd, -errno if the bpf() call fails. */
int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) {
        union bpf_attr get_attr;

        assert(p);

        /* Refuse to replace a program that was already assembled or loaded. */
        if (p->kernel_fd >= 0)
                return -EBUSY;

        zero(get_attr);
        get_attr.pathname = PTR_TO_UINT64(path);

        p->kernel_fd = bpf(BPF_OBJ_GET, &get_attr, sizeof(get_attr));
        return p->kernel_fd < 0 ? -errno : 0;
}
/* Attaches program p to the cgroup directory at path, loading the program into the kernel first if
 * necessary.
 *
 * type   attach type, must be >= 0
 * flags  one of 0, BPF_F_ALLOW_OVERRIDE or BPF_F_ALLOW_MULTI
 *
 * Returns 0 on success, -EINVAL for any other flags value, -EBUSY if p is already attached with a
 * different path, type or flags, -ENOMEM on allocation failure, or the negative error of the failing
 * load, open() or bpf() call. */
int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
        _cleanup_free_ char *path_copy = NULL;
        _cleanup_close_ int cgroup_fd = -EBADF;
        union bpf_attr attach_attr;
        int r;

        assert(p);
        assert(type >= 0);
        assert(path);

        if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
                return -EINVAL;

        if (p->attached_path) {
                /* Only a single attachment can be tracked per program, hence refuse early if the request
                 * differs in any way from the attachment already recorded. */
                if (!path_equal(p->attached_path, path) ||
                    p->attached_type != type ||
                    p->attached_flags != flags)
                        return -EBUSY;

                /* Same attachment as before: normally there is nothing left to do. The exception is
                 * BPF_F_ALLOW_OVERRIDE mode, where somebody else might have replaced our program in the
                 * meantime, so attach again to be safe. With flags == 0 nobody else can replace our
                 * program, and with BPF_F_ALLOW_MULTI other programs are installed in addition to ours, so
                 * in both of these cases ours is still in effect. */
                if (flags != BPF_F_ALLOW_OVERRIDE)
                        return 0;
        }

        /* Make sure a kernel object exists for this program. */
        r = bpf_program_load_kernel(p, NULL, 0);
        if (r < 0)
                return r;

        path_copy = strdup(path);
        if (!path_copy)
                return -ENOMEM;

        cgroup_fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (cgroup_fd < 0)
                return -errno;

        zero(attach_attr);
        attach_attr.attach_flags = flags;
        attach_attr.attach_bpf_fd = p->kernel_fd;
        attach_attr.target_fd = cgroup_fd;
        attach_attr.attach_type = type;

        if (bpf(BPF_PROG_ATTACH, &attach_attr, sizeof(attach_attr)) < 0)
                return -errno;

        /* Record the attachment only once the kernel has accepted it. */
        free_and_replace(p->attached_path, path_copy);
        p->attached_flags = flags;
        p->attached_type = type;

        return 0;
}
/* Detaches p from the cgroup recorded in p->attached_path and forgets the attachment.
 *
 * Returns 0 on success (including the case where the cgroup directory no longer exists), -EUNATCH if no
 * attachment is recorded, -errno if open() fails for a reason other than ENOENT or if the bpf() call
 * fails. On error the recorded attachment is kept. */
int bpf_program_cgroup_detach(BPFProgram *p) {
        _cleanup_close_ int cgroup_fd = -EBADF;

        assert(p);

        if (!p->attached_path)
                return -EUNATCH;

        cgroup_fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (cgroup_fd < 0 && errno != ENOENT)
                return -errno;

        /* A cgroup that is already gone needs no explicit detach: removing it detached the program
         * implicitly, so that case is not treated as an error. */
        if (cgroup_fd >= 0) {
                union bpf_attr detach_attr;

                zero(detach_attr);
                detach_attr.attach_bpf_fd = p->kernel_fd;
                detach_attr.target_fd = cgroup_fd;
                detach_attr.attach_type = p->attached_type;

                if (bpf(BPF_PROG_DETACH, &detach_attr, sizeof(detach_attr)) < 0)
                        return -errno;
        }

        p->attached_path = mfree(p->attached_path);

        return 0;
}
int bpf_map_new(
const char *name,
enum bpf_map_type type,
size_t key_size,
size_t value_size,
size_t max_entries,
uint32_t flags) {
union bpf_attr attr;
const char *n = name;
zero(attr);
attr.map_type = type;
attr.key_size = key_size;
attr.value_size = value_size;
attr.max_entries = max_entries;
attr.map_flags = flags;
/* The map name is primarily informational for debugging purposes, and typically too short
* to carry the full unit name, hence we employ a trivial lossy escaping to make it fit
* (truncation + only alphanumerical, "." and "_" are allowed as per
* https://docs.kernel.org/bpf/maps.html#usage-notes) */
for (size_t i = 0; i < sizeof(attr.map_name) - 1 && *n; i++, n++)
attr.map_name[i] = strchr(ALPHANUMERICAL ".", *n) ? *n : '_';
return RET_NERRNO(bpf(BPF_MAP_CREATE, &attr, sizeof(attr)));
}
int bpf_map_update_element(int fd, const void *key, void *value) {
union bpf_attr attr;
zero(attr);
attr.map_fd = fd;
attr.key = PTR_TO_UINT64(key);
attr.value = PTR_TO_UINT64(value);
return RET_NERRNO(bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)));
}
int bpf_map_lookup_element(int fd, const void *key, void *value) {
union bpf_attr attr;
zero(attr);
attr.map_fd = fd;
attr.key = PTR_TO_UINT64(key);
attr.value = PTR_TO_UINT64(value);
return RET_NERRNO(bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)));
}
int bpf_program_pin(int prog_fd, const char *bpffs_path) {
union bpf_attr attr;
zero(attr);
attr.pathname = PTR_TO_UINT64((void *) bpffs_path);
attr.bpf_fd = prog_fd;
return RET_NERRNO(bpf(BPF_OBJ_PIN, &attr, sizeof(attr)));
}
/* Queries the kernel for the program info of prog_fd and stores its id field in *ret_id.
 *
 * Returns 0 on success, or the negative value returned by the info query on failure, in which case
 * *ret_id is not written. */
int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) {
        /* Must start out zeroed: the info structure is both input and output of the query. */
        struct bpf_prog_info info = {};
        int r;

        assert(ret_id);

        r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info));
        if (r < 0)
                return r;

        *ret_id = info.id;

        return 0;
}
/* Writes the attachment of p to the serialization stream f under key, as
 * "<fd> <attach type name> <escaped cgroup path>", where <fd> is a duplicate of p->kernel_fd that was
 * added to fds.
 *
 * Does nothing and returns 0 if p is NULL or not attached. Returns 0 on success and a negative
 * errno-style value on failure. On success the attachment is no longer recorded in p. */
int bpf_program_serialize_attachment(
                FILE *f,
                FDSet *fds,
                const char *key,
                BPFProgram *p) {

        _cleanup_free_ char *path_escaped = NULL;
        int dup_fd, r;

        if (!p)
                return 0;
        if (!p->attached_path)
                return 0;

        assert(p->kernel_fd >= 0);

        path_escaped = cescape(p->attached_path);
        if (!path_escaped)
                return -ENOMEM;

        dup_fd = fdset_put_dup(fds, p->kernel_fd);
        if (dup_fd < 0)
                return log_error_errno(dup_fd, "Failed to add BPF kernel fd to serialize: %m");

        r = serialize_item_format(
                        f,
                        key,
                        "%i %s %s",
                        dup_fd,
                        bpf_cgroup_attach_type_to_string(p->attached_type),
                        path_escaped);
        if (r < 0)
                return r;

        /* From here on the serialization "owns" the attachment, not this object. This matters because a
         * BPF program is not detached from its cgroup on close(); it has to be detached explicitly, and
         * bpf_program_free() does exactly that whenever attached_path is non-NULL. We want the attachment
         * to survive until it is deserialized again, hence forget the path here so that freeing the
         * object does not detach the program. */
        p->attached_path = mfree(p->attached_path);

        return 0;
}
/* Serializes the attachment of every BPFProgram in set under the same key. Stops at the first failure and
 * returns its negative value; returns 0 if all entries were processed. */
int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) {
        BPFProgram *prog;

        SET_FOREACH(prog, set) {
                int r = bpf_program_serialize_attachment(f, fds, key, prog);
                if (r < 0)
                        return r;
        }

        return 0;
}
/* Parses one serialized attachment record and turns it back into a BPFProgram object.
 *
 * The record has the form "<fd> <attach type name> <escaped cgroup path>", which is what
 * bpf_program_serialize_attachment() writes.
 *
 * v     the serialized record
 * fds   fd set the referenced fd is taken out of
 * bpfp  in/out: a program already stored here is freed and replaced by the new one
 *
 * Returns 0 on success and a negative errno-style value on failure. On failure *bpfp is left untouched.
 * Note that if allocating the object fails, the fd has already been removed from fds and is closed. */
int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) {
        _cleanup_free_ char *sfd = NULL, *sat = NULL, *unescaped = NULL;
        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        _cleanup_close_ int fd = -EBADF;
        ssize_t l;
        int ifd, at, r;

        assert(v);
        assert(bpfp);

        /* Extract first word: the fd number */
        r = extract_first_word(&v, &sfd, NULL, 0);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        ifd = parse_fd(sfd);
        if (ifd < 0)
                /* Return the parse error itself. r is positive at this point (the word was extracted
                 * successfully), so returning it would look like success to callers checking for < 0. */
                return ifd;

        /* Extract second word: the attach type */
        r = extract_first_word(&v, &sat, NULL, 0);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        at = bpf_cgroup_attach_type_from_string(sat);
        if (at < 0)
                return at;

        /* The rest is the path */
        if (isempty(v))
                return -EINVAL;

        l = cunescape(v, 0, &unescaped);
        if (l < 0)
                return l;

        /* Take the fd out of the set only after the whole record has been parsed successfully. */
        fd = fdset_remove(fds, ifd);
        if (fd < 0)
                return fd;

        p = new(BPFProgram, 1);
        if (!p)
                return -ENOMEM;

        /* The record carries neither the program type nor the attach flags, hence the type is left
         * unspecified and attached_flags stays zero. */
        *p = (BPFProgram) {
                .kernel_fd = TAKE_FD(fd),
                .prog_type = BPF_PROG_TYPE_UNSPEC,
                .attached_path = TAKE_PTR(unescaped),
                .attached_type = at,
        };

        /* Drop whatever program was stored before; bpf_program_free() accepts NULL. */
        bpf_program_free(*bpfp);

        *bpfp = TAKE_PTR(p);
        return 0;
}
/* Deserializes one attachment record into a new BPFProgram and hands it to the set at *bpfsetp via
 * set_ensure_consume() with bpf_program_hash_ops. Returns 0 on success and a negative errno-style value
 * if either step fails. */
int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) {
        BPFProgram *prog = NULL;
        int r;

        assert(v);
        assert(bpfsetp);

        r = bpf_program_deserialize_attachment(v, fds, &prog);
        if (r < 0)
                return r;

        /* Non-negative results of the set insertion are all reported as plain success. */
        r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, prog);
        return r < 0 ? r : 0;
}