tree-wide: drop support for kernels without pidfd_open() and pidfd_send_signal() (#35971)

This commit is contained in:
Lennart Poettering
2025-01-16 11:37:17 +01:00
committed by GitHub
7 changed files with 71 additions and 163 deletions

View File

@@ -137,12 +137,7 @@ static inline int missing_name_to_handle_at(int fd, const char *name, struct fil
#if !HAVE_SETNS
/* Fallback wrapper for setns(), compiled only inside the surrounding "#if !HAVE_SETNS" guard.
 *
 * Forwards fd and nstype unchanged to the raw system call when a syscall number is known at build time.
 * Otherwise it fails the same way an unimplemented system call would: errno = ENOSYS, return value -1. */
static inline int missing_setns(int fd, int nstype) {
# ifdef __NR_setns
return syscall(__NR_setns, fd, nstype);
# else
/* No syscall number available in the build environment: report "not implemented" */
errno = ENOSYS;
return -1;
# endif
}
# define setns missing_setns
@@ -162,12 +157,7 @@ static inline pid_t raw_getpid(void) {
#if !HAVE_RENAMEAT2
/* Fallback wrapper for renameat2(), compiled only inside the surrounding "#if !HAVE_RENAMEAT2" guard.
 *
 * Passes all five arguments through to the raw system call when __NR_renameat2 is defined.
 * Otherwise sets errno = ENOSYS and returns -1. */
static inline int missing_renameat2(int oldfd, const char *oldname, int newfd, const char *newname, unsigned flags) {
# ifdef __NR_renameat2
return syscall(__NR_renameat2, oldfd, oldname, newfd, newname, flags);
# else
/* No syscall number available in the build environment: report "not implemented" */
errno = ENOSYS;
return -1;
# endif
}
# define renameat2 missing_renameat2
@@ -177,12 +167,7 @@ static inline int missing_renameat2(int oldfd, const char *oldname, int newfd, c
#if !HAVE_KCMP
/* Fallback wrapper for kcmp(), compiled only inside the surrounding "#if !HAVE_KCMP" guard.
 *
 * Passes pid1, pid2, type, idx1 and idx2 through to the raw system call.
 * If no usable syscall number exists it sets errno = ENOSYS and returns -1. */
static inline int missing_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) {
/* The raw syscall is used only if the number is both defined and non-negative */
# if defined __NR_kcmp && __NR_kcmp >= 0
return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
# else
errno = ENOSYS;
return -1;
# endif
}
# define kcmp missing_kcmp
@@ -192,34 +177,19 @@ static inline int missing_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long i
#if !HAVE_KEYCTL
/* Fallback wrapper for keyctl(), compiled only inside the surrounding "#if !HAVE_KEYCTL" guard.
 *
 * Passes cmd and the four unsigned long arguments through to the raw system call.
 * If no usable syscall number exists it sets errno = ENOSYS and returns -1. */
static inline long missing_keyctl(int cmd, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) {
/* The raw syscall is used only if the number is both defined and non-negative */
# if defined __NR_keyctl && __NR_keyctl >= 0
return syscall(__NR_keyctl, cmd, arg2, arg3, arg4, arg5);
# else
errno = ENOSYS;
return -1;
# endif
/* Preprocessor directives ignore braces: although placed before the closing brace, this alias applies to
 * all text that follows, redirecting later uses of keyctl to the wrapper above. */
# define keyctl missing_keyctl
}
/* Fallback wrapper for add_key().
 *
 * Passes type, description, payload, plen and ringid through to the raw system call.
 * If no usable syscall number exists it sets errno = ENOSYS and returns -1. */
static inline key_serial_t missing_add_key(const char *type, const char *description, const void *payload, size_t plen, key_serial_t ringid) {
/* The raw syscall is used only if the number is both defined and non-negative */
# if defined __NR_add_key && __NR_add_key >= 0
return syscall(__NR_add_key, type, description, payload, plen, ringid);
# else
errno = ENOSYS;
return -1;
# endif
/* Preprocessor directives ignore braces: although placed before the closing brace, this alias applies to
 * all text that follows, redirecting later uses of add_key to the wrapper above. */
# define add_key missing_add_key
}
/* Fallback wrapper for request_key().
 *
 * Passes type, description, callout_info and destringid through to the raw system call.
 * If no usable syscall number exists it sets errno = ENOSYS and returns -1. */
static inline key_serial_t missing_request_key(const char *type, const char *description, const char * callout_info, key_serial_t destringid) {
/* The raw syscall is used only if the number is both defined and non-negative */
# if defined __NR_request_key && __NR_request_key >= 0
return syscall(__NR_request_key, type, description, callout_info, destringid);
# else
errno = ENOSYS;
return -1;
# endif
/* Preprocessor directives ignore braces: although placed before the closing brace, this alias applies to
 * all text that follows, redirecting later uses of request_key to the wrapper above. */
# define request_key missing_request_key
}
@@ -329,12 +299,7 @@ static inline long missing_get_mempolicy(int *mode, unsigned long *nodemask,
#if !HAVE_PIDFD_SEND_SIGNAL
/* Fallback wrapper for pidfd_send_signal(), compiled only inside the surrounding
 * "#if !HAVE_PIDFD_SEND_SIGNAL" guard.
 *
 * Passes fd, sig, info and flags through to the raw system call when __NR_pidfd_send_signal is defined.
 * Otherwise sets errno = ENOSYS and returns -1. */
static inline int missing_pidfd_send_signal(int fd, int sig, siginfo_t *info, unsigned flags) {
# ifdef __NR_pidfd_send_signal
return syscall(__NR_pidfd_send_signal, fd, sig, info, flags);
# else
/* No syscall number available in the build environment: report "not implemented" */
errno = ENOSYS;
return -1;
# endif
}
# define pidfd_send_signal missing_pidfd_send_signal
@@ -342,12 +307,7 @@ static inline int missing_pidfd_send_signal(int fd, int sig, siginfo_t *info, un
#if !HAVE_PIDFD_OPEN
/* Fallback wrapper for pidfd_open(), compiled only inside the surrounding "#if !HAVE_PIDFD_OPEN" guard.
 *
 * Passes pid and flags through to the raw system call when __NR_pidfd_open is defined.
 * Otherwise sets errno = ENOSYS and returns -1. */
static inline int missing_pidfd_open(pid_t pid, unsigned flags) {
# ifdef __NR_pidfd_open
return syscall(__NR_pidfd_open, pid, flags);
# else
/* No syscall number available in the build environment: report "not implemented" */
errno = ENOSYS;
return -1;
# endif
}
# define pidfd_open missing_pidfd_open
@@ -661,12 +621,7 @@ static inline ssize_t missing_getdents64(int fd, void *buffer, size_t length) {
#if !HAVE_SCHED_SETATTR
/* Fallback wrapper for sched_setattr(), compiled only inside the surrounding "#if !HAVE_SCHED_SETATTR" guard.
 *
 * Passes pid, attr and flags through to the raw system call when __NR_sched_setattr is defined.
 * Otherwise sets errno = ENOSYS and returns -1.
 *
 * NOTE(review): the return type here is ssize_t, while e.g. missing_pidfd_open() in this header returns
 * int — confirm the wider type is intentional. */
static inline ssize_t missing_sched_setattr(pid_t pid, struct sched_attr *attr, unsigned int flags) {
# if defined __NR_sched_setattr
return syscall(__NR_sched_setattr, pid, attr, flags);
# else
/* No syscall number available in the build environment: report "not implemented" */
errno = ENOSYS;
return -1;
# endif
}
# define sched_setattr missing_sched_setattr

View File

@@ -24,12 +24,8 @@ static int pidfd_check_pidfs(void) {
return have_pidfs;
_cleanup_close_ int fd = pidfd_open(getpid_cached(), 0);
if (fd < 0) {
if (ERRNO_IS_NOT_SUPPORTED(errno))
return (have_pidfs = false);
if (fd < 0)
return -errno;
}
return (have_pidfs = fd_is_fs_type(fd, PID_FS_MAGIC));
}

View File

@@ -84,8 +84,8 @@ int pidref_set_pid(PidRef *pidref, pid_t pid) {
fd = pidfd_open(pid, 0);
if (fd < 0) {
/* Graceful fallback in case the kernel doesn't support pidfds or is out of fds */
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno) && !ERRNO_IS_RESOURCE(errno))
/* Graceful fallback in case the kernel is out of fds */
if (!ERRNO_IS_RESOURCE(errno))
return log_debug_errno(errno, "Failed to open pidfd for pid " PID_FMT ": %m", pid);
fd = -EBADF;

View File

@@ -2042,7 +2042,7 @@ int posix_spawn_wrapper(
* issues.
*
* Also, move the newly-created process into 'cgroup' through POSIX_SPAWN_SETCGROUP (clone3())
* if available. Note that CLONE_INTO_CGROUP is only supported on cgroup v2.
* if available.
* returns 1: We're already in the right cgroup
* 0: 'cgroup' not specified or POSIX_SPAWN_SETCGROUP is not supported. The caller
* needs to call 'cg_attach' on their own */
@@ -2061,14 +2061,10 @@ int posix_spawn_wrapper(
_unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr;
#if HAVE_PIDFD_SPAWN
static enum {
CLONE_ONLY_PID,
CLONE_CAN_PIDFD, /* 5.2 */
CLONE_CAN_CGROUP, /* 5.7 */
} clone_support = CLONE_CAN_CGROUP;
static bool have_clone_into_cgroup = true; /* kernel 5.7+ */
_cleanup_close_ int cgroup_fd = -EBADF;
if (cgroup && clone_support >= CLONE_CAN_CGROUP) {
if (cgroup && have_clone_into_cgroup) {
_cleanup_free_ char *resolved_cgroup = NULL;
r = cg_get_path_and_check(
@@ -2099,47 +2095,41 @@ int posix_spawn_wrapper(
return -r;
#if HAVE_PIDFD_SPAWN
if (clone_support >= CLONE_CAN_PIDFD) {
_cleanup_close_ int pidfd = -EBADF;
_cleanup_close_ int pidfd = -EBADF;
r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
if (ERRNO_IS_NOT_SUPPORTED(r) && FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP) &&
cg_is_threaded(cgroup) > 0) /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode. */
return -EUCLEAN;
if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || r == E2BIG) &&
FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP)) {
/* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
* need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
* Note that we might get E2BIG here since some kernels (e.g. 5.4) support clone3()
* but not CLONE_INTO_CGROUP. */
r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
if (ERRNO_IS_NOT_SUPPORTED(r) && FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP) && cg_is_threaded(cgroup) > 0)
return -EUCLEAN; /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode,
turn that into something recognizable */
if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || r == E2BIG) &&
FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP)) {
/* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
* need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
* Note that we might get E2BIG here since some kernels (e.g. 5.4) support clone3()
* but not CLONE_INTO_CGROUP. */
/* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
* retry every time. */
assert(clone_support >= CLONE_CAN_CGROUP);
clone_support = CLONE_CAN_PIDFD;
/* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
* retry every time. */
have_clone_into_cgroup = false;
flags &= ~POSIX_SPAWN_SETCGROUP;
r = posix_spawnattr_setflags(&attr, flags);
if (r != 0)
return -r;
r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
}
if (r == 0) {
r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
if (r < 0)
return r;
return FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP);
}
if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
flags &= ~POSIX_SPAWN_SETCGROUP;
r = posix_spawnattr_setflags(&attr, flags);
if (r != 0)
return -r;
clone_support = CLONE_ONLY_PID; /* No CLONE_PIDFD either? */
r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
}
#endif
if (r != 0)
return -r;
r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
if (r < 0)
return r;
return FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP);
#else
pid_t pid;
r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
if (r != 0)
return -r;
@@ -2149,6 +2139,7 @@ int posix_spawn_wrapper(
return r;
return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
#endif
}
int proc_dir_open(DIR **ret) {

View File

@@ -25,6 +25,7 @@
#include "missing_magic.h"
#include "missing_syscall.h"
#include "missing_threads.h"
#include "missing_wait.h"
#include "origin-id.h"
#include "path-util.h"
#include "prioq.h"
@@ -1074,6 +1075,8 @@ static void source_disconnect(sd_event_source *s) {
}
static sd_event_source* source_free(sd_event_source *s) {
int r;
assert(s);
source_disconnect(s);
@@ -1087,31 +1090,23 @@ static sd_event_source* source_free(sd_event_source *s) {
if (s->child.process_owned) {
if (!s->child.exited) {
bool sent = false;
if (s->child.pidfd >= 0) {
if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
if (errno == ESRCH) /* Already dead */
sent = true;
else if (!ERRNO_IS_NOT_SUPPORTED(errno))
log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
s->child.pid);
} else
sent = true;
}
if (!sent)
if (kill(s->child.pid, SIGKILL) < 0)
if (errno != ESRCH) /* Already dead */
log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
s->child.pid);
if (s->child.pidfd >= 0)
r = RET_NERRNO(pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0));
else
r = RET_NERRNO(kill(s->child.pid, SIGKILL));
if (r < 0 && r != -ESRCH)
log_debug_errno(r, "Failed to kill process " PID_FMT ", ignoring: %m",
s->child.pid);
}
if (!s->child.waited) {
siginfo_t si = {};
/* Reap the child if we can */
(void) waitid(P_PID, s->child.pid, &si, WEXITED);
if (s->child.pidfd >= 0)
(void) waitid(P_PIDFD, s->child.pidfd, &si, WEXITED);
else
(void) waitid(P_PID, s->child.pid, &si, WEXITED);
}
}
@@ -1578,11 +1573,6 @@ static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *us
return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
/* Reports whether child event sources should try to use pidfds, as controlled by $SYSTEMD_PIDFD.
 *
 * Returns false only when secure_getenv_bool() yields exactly 0; every other result (any non-zero value,
 * positive or negative) yields true.
 * NOTE(review): presumably a negative result means the variable is unset or unparsable, making pidfd use
 * the default — confirm against secure_getenv_bool(). */
static bool shall_use_pidfd(void) {
/* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
return secure_getenv_bool("SYSTEMD_PIDFD") != 0;
}
_public_ int sd_event_add_child(
sd_event *e,
sd_event_source **ret,
@@ -1630,34 +1620,29 @@ _public_ int sd_event_add_child(
if (!s)
return -ENOMEM;
/* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
* pin the PID, and make regular waitid() handling race-free. */
s->child.pidfd = pidfd_open(pid, 0);
if (s->child.pidfd < 0)
return -errno;
s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
s->wakeup = WAKEUP_EVENT_SOURCE;
s->child.options = options;
s->child.callback = callback;
s->userdata = userdata;
s->enabled = SD_EVENT_ONESHOT;
/* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
* pin the PID, and make regular waitid() handling race-free. */
if (shall_use_pidfd()) {
s->child.pidfd = pidfd_open(pid, 0);
if (s->child.pidfd < 0) {
/* Propagate errors unless the syscall is not supported or blocked */
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
} else
s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
} else
s->child.pidfd = -EBADF;
if (EVENT_SOURCE_WATCH_PIDFD(s)) {
/* We have a pidfd and we only want to watch for exit */
/* We only want to watch for exit */
r = source_child_pidfd_register(s, s->enabled);
if (r < 0)
return r;
} else {
/* We have no pidfd or we shall wait for some other event than WEXITED */
/* We shall wait for some other event than WEXITED */
r = event_make_signal_data(e, SIGCHLD, NULL);
if (r < 0)
return r;
@@ -1727,17 +1712,12 @@ _public_ int sd_event_add_child_pidfd(
s->wakeup = WAKEUP_EVENT_SOURCE;
s->child.pidfd = pidfd;
s->child.pid = pid;
s->child.options = options;
s->child.callback = callback;
s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
s->userdata = userdata;
s->enabled = SD_EVENT_ONESHOT;
r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
if (r < 0)
return r;
if (EVENT_SOURCE_WATCH_PIDFD(s)) {
/* We only want to watch for WEXITED */
r = source_child_pidfd_register(s, s->enabled);
@@ -1752,6 +1732,11 @@ _public_ int sd_event_add_child_pidfd(
e->need_process_child = true;
}
r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
if (r < 0)
return r;
s->child.pid = pid;
e->n_online_child_sources++;
if (ret)
@@ -3239,12 +3224,10 @@ _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, cons
if (si)
copy = *si;
if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
/* Let's propagate the error only if the system call is not implemented or prohibited */
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
} else
return 0;
if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0)
return -errno;
return 0;
}
/* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse

View File

@@ -198,7 +198,7 @@ static int post_handler(sd_event_source *s, void *userdata) {
return 2;
}
static void test_basic_one(bool with_pidfd) {
TEST(basic) {
sd_event *e = NULL;
sd_event_source *w = NULL, *x = NULL, *y = NULL, *z = NULL, *q = NULL, *t = NULL;
static const char ch = 'x';
@@ -207,10 +207,6 @@ static void test_basic_one(bool with_pidfd) {
uint64_t event_now;
int64_t priority;
log_info("/* %s(pidfd=%s) */", __func__, yes_no(with_pidfd));
assert_se(setenv("SYSTEMD_PIDFD", yes_no(with_pidfd), 1) >= 0);
assert_se(pipe(a) >= 0);
assert_se(pipe(b) >= 0);
assert_se(pipe(d) >= 0);
@@ -301,13 +297,6 @@ static void test_basic_one(bool with_pidfd) {
safe_close_pair(b);
safe_close_pair(d);
safe_close_pair(k);
assert_se(unsetenv("SYSTEMD_PIDFD") >= 0);
}
TEST(basic) {
test_basic_one(true); /* test with pidfd */
test_basic_one(false); /* test without pidfd */
}
TEST(sd_event_now) {
@@ -583,13 +572,7 @@ TEST(pidfd) {
assert_se(pid > 1);
pidfd = pidfd_open(pid, 0);
if (pidfd < 0) {
/* No pidfd_open() supported or blocked? */
assert_se(ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno));
(void) wait_for_terminate(pid, NULL);
return;
}
ASSERT_OK(pidfd = pidfd_open(pid, 0));
pid2 = fork();
if (pid2 == 0)

View File

@@ -879,7 +879,7 @@ static int create_session_message(
if (!avoid_pidfd) {
pidfd = pidfd_open(getpid_cached(), 0);
if (pidfd < 0 && !ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
if (pidfd < 0)
return -errno;
}