Merge pull request #28123 from poettering/threads-no-more

get rid of any threads in PID 1
This commit is contained in:
Luca Boccassi
2023-06-23 13:24:41 +01:00
committed by GitHub
17 changed files with 484 additions and 212 deletions

View File

@@ -750,3 +750,32 @@ SPDX-License-Identifier: LGPL-2.1-or-later
- Reasonable use of non-ASCII Unicode UTF-8 characters in code comments is
welcome. If your code comment contains an emoji or two this will certainly
brighten the day of the occasional reviewer of your code. Really! 😊
## Threading
- We generally avoid using threads, to the level this is possible. In
particular in the service manager/PID 1 threads are not OK to use. This is
because you cannot mix memory allocation in threads with use of glibc's
`clone()` call, or manual `clone()`/`clone3()` system call wrappers. Only
glibc's own `fork()` call will properly synchronize the memory allocation
locks around the process clone operation. This means that if a process is
cloned via `clone()`/`clone3()` and another thread currently has the
`malloc()` lock taken, it will be cloned in locked state to the child, and
thus can never be acquired in the child, leading to deadlocks. Hence, when
using `clone()`/`clone3()` there are only two ways out: never use threads in the
parent, or never do memory allocation in the child. For our uses we need
`clone()`/`clone3()` and hence decided to avoid threads. Of course, sometimes the
concurrency threads allow is beneficial, however we suggest forking off
worker *processes* rather than worker *threads* for this purpose, ideally
even with an `execve()` to remove the CoW trap situation `fork()` easily
triggers.
- A corollary of the above is: never use `clone()` where a `fork()` would do
too. Also consider using `posix_spawn()` which combines `clone()` +
`execve()` into one and has nice properties since it avoids becoming a CoW
trap by using `CLONE_VORK` and `CLONE_VM` together.
- While we avoid forking off threads on our own, writing thread-safe code is a
good idea where it might end up running inside of libsystemd.so or
similar. Hence, use TLS (i.e. `thread_local`) where appropriate, and maybe
the occasional `pthread_once()`.

View File

@@ -1,107 +0,0 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <errno.h>
#include <pthread.h>
#include <stddef.h>
#include <unistd.h>
#include "async.h"
#include "errno-util.h"
#include "fd-util.h"
#include "log.h"
#include "macro.h"
#include "process-util.h"
#include "signal-util.h"
int asynchronous_job(void* (*func)(void *p), void *arg) {
sigset_t ss, saved_ss;
pthread_attr_t a;
pthread_t t;
int r, k;
/* It kinda sucks that we have to resort to threads to implement an asynchronous close(), but well, such is
* life. */
r = pthread_attr_init(&a);
if (r > 0)
return -r;
r = pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
if (r > 0) {
r = -r;
goto finish;
}
assert_se(sigfillset(&ss) >= 0);
/* Block all signals before forking off the thread, so that the new thread is started with all signals
* blocked. This way the existence of the new thread won't affect signal handling in other threads. */
r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
if (r > 0) {
r = -r;
goto finish;
}
r = pthread_create(&t, &a, func, arg);
k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
if (r > 0)
r = -r;
else if (k > 0)
r = -k;
else
r = 0;
finish:
pthread_attr_destroy(&a);
return r;
}
int asynchronous_sync(pid_t *ret_pid) {
int r;
/* This forks off an invocation of fork() as a child process, in order to initiate synchronization to
* disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our
* original process ever, and a thread would do that as the process can't exit with threads hanging in blocking
* syscalls. */
r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS, ret_pid);
if (r < 0)
return r;
if (r == 0) {
/* Child process */
sync();
_exit(EXIT_SUCCESS);
}
return 0;
}
static void *close_thread(void *p) {
(void) pthread_setname_np(pthread_self(), "close");
assert_se(close_nointr(PTR_TO_FD(p)) != -EBADF);
return NULL;
}
int asynchronous_close(int fd) {
int r;
/* This is supposed to behave similar to safe_close(), but
* actually invoke close() asynchronously, so that it will
* never block. Ideally the kernel would have an API for this,
* but it doesn't, so we work around it, and hide this as a
* far away as we can. */
if (fd >= 0) {
PROTECT_ERRNO;
r = asynchronous_job(close_thread, FD_TO_PTR(fd));
if (r < 0)
assert_se(close_nointr(fd) != -EBADF);
}
return -EBADF;
}

View File

@@ -1,13 +0,0 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include <sys/types.h>
#include "macro.h"
int asynchronous_job(void* (*func)(void *p), void *arg);
int asynchronous_sync(pid_t *ret_pid);
int asynchronous_close(int fd);
DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close);

View File

@@ -7,7 +7,6 @@ basic_sources = files(
'architecture.c',
'argv-util.c',
'arphrd-util.c',
'async.c',
'audit-util.c',
'build.c',
'bus-label.c',

View File

@@ -1149,6 +1149,41 @@ static void restore_sigsetp(sigset_t **ssp) {
(void) sigprocmask(SIG_SETMASK, *ssp, NULL);
}
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) {
size_t ps;
pid_t pid;
void *mystack;
/* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
* invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
*
* Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
* is threaded these locks will be in an undefined state in the child, and hence memory allocations
* are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
* strictly single-threaded or your child never calls malloc(). */
assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0);
/* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
* the net effect is that the child will have the start of its stack inside the stack of the parent,
* but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
* since we don't want to deal with differences between systems where the stack grows backwards or
* forwards we'll allocate one more and place the stack address in the middle. Except that we also
* want it page aligned, hence we'll allocate one page more. Makes 3. */
ps = page_size();
mystack = alloca(ps*3);
mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */
mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */
pid = clone(fn, mystack, flags, userdata);
if (pid < 0)
return -errno;
return pid;
}
int safe_fork_full(
const char *name,
const int stdio_fds[3],
@@ -1160,9 +1195,12 @@ int safe_fork_full(
pid_t original_pid, pid;
sigset_t saved_ss, ss;
_unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL;
bool block_signals = false, block_all = false;
bool block_signals = false, block_all = false, intermediary = false;
int prio, r;
assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid);
assert(!FLAGS_SET(flags, FORK_DETACH|FORK_WAIT));
/* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always
* returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */
@@ -1196,6 +1234,31 @@ int safe_fork_full(
saved_ssp = &saved_ss;
}
if (FLAGS_SET(flags, FORK_DETACH)) {
assert(!FLAGS_SET(flags, FORK_WAIT));
assert(!ret_pid);
/* Fork off intermediary child if needed */
r = is_reaper_process();
if (r < 0)
return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m");
if (!r) {
/* Not a reaper process, hence do a double fork() so we are reparented to one */
pid = fork();
if (pid < 0)
return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
if (pid > 0) {
log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid);
return 1; /* return in the parent */
}
intermediary = true;
}
}
if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS)) != 0)
pid = raw_clone(SIGCHLD|
(FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
@@ -1205,8 +1268,12 @@ int safe_fork_full(
if (pid < 0)
return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
if (pid > 0) {
/* We are in the parent process */
/* If we are in the intermediary process, exit now */
if (intermediary)
_exit(EXIT_SUCCESS);
/* We are in the parent process */
log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);
if (flags & FORK_WAIT) {
@@ -1623,6 +1690,39 @@ int get_process_threads(pid_t pid) {
return n;
}
int is_reaper_process(void) {
int b = 0;
/* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
* reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */
if (getpid_cached() == 1)
return true;
if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0)
return -errno;
return b != 0;
}
int make_reaper_process(bool b) {
if (getpid_cached() == 1) {
if (!b)
return -EINVAL;
return 0;
}
/* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
* to avoid any ambiguities */
if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
return -errno;
return 0;
}
static const char *const sigchld_code_table[] = {
[CLD_EXITED] = "exited",
[CLD_KILLED] = "killed",

View File

@@ -139,6 +139,13 @@ void reset_cached_pid(void);
int must_be_root(void);
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata);
/* 💣 Note that FORK_NEW_USERNS + FORK_NEW_MOUNTNS should not be called in threaded programs, because they
* cause us to use raw_clone() which does not synchronize the glibc malloc() locks, and thus will cause
* deadlocks if the parent uses threads and the child does memory allocations. Hence: if the parent is
* threaded these flags may not be used. These flags cannot be used if the parent uses threads or the child
* uses malloc(). 💣 */
typedef enum ForkFlags {
FORK_RESET_SIGNALS = 1 << 0, /* Reset all signal handlers and signal mask */
FORK_CLOSE_ALL_FDS = 1 << 1, /* Close all open file descriptors in the child, except for 0,1,2 */
@@ -148,15 +155,16 @@ typedef enum ForkFlags {
FORK_REOPEN_LOG = 1 << 5, /* Reopen log connection */
FORK_LOG = 1 << 6, /* Log above LOG_DEBUG log level about failures */
FORK_WAIT = 1 << 7, /* Wait until child exited */
FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace */
FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */
FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */
FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */
FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace */
FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
FORK_CLOEXEC_OFF = 1 << 15, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
FORK_KEEP_NOTIFY_SOCKET = 1 << 16, /* Unless this specified, $NOTIFY_SOCKET will be unset. */
FORK_DETACH = 1 << 17, /* Double fork if needed to ensure PID1/subreaper is parent */
} ForkFlags;
int safe_fork_full(
@@ -199,3 +207,6 @@ int setpriority_closest(int priority);
_noreturn_ void freeze(void);
int get_process_threads(pid_t pid);
int is_reaper_process(void);
int make_reaper_process(bool b);

View File

@@ -4,7 +4,6 @@
#include <errno.h>
#include <fcntl.h>
#include <linux/random.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

View File

@@ -27,6 +27,11 @@
* Additionally, as this function does not pass the ptid, newtls and ctid parameters to the kernel, flags must not
* contain CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID or CLONE_SETTLS.
*
* WARNING: 💣 this call (just like glibc's own clone() wrapper) will not synchronize on glibc's malloc
* locks, which means they will be in an undefined state in the child if the parent is
* threaded. This means: the parent must either never use threads, or the child cannot use memory
* allocation itself. This is a major pitfall, hence be careful! 💣
*
* Returns: 0 in the child process and the child process id in the parent.
*/
static inline pid_t raw_clone(unsigned long flags) {

View File

@@ -44,22 +44,6 @@ static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = {
[AUTOMOUNT_FAILED] = UNIT_FAILED
};
struct expire_data {
int dev_autofs_fd;
int ioctl_fd;
};
static struct expire_data* expire_data_free(struct expire_data *data) {
if (!data)
return NULL;
safe_close(data->dev_autofs_fd);
safe_close(data->ioctl_fd);
return mfree(data);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(struct expire_data*, expire_data_free);
static int open_dev_autofs(Manager *m);
static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata);
static int automount_start_expire(Automount *a);
@@ -674,55 +658,57 @@ fail:
automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
}
static void *expire_thread(void *p) {
struct autofs_dev_ioctl param;
_cleanup_(expire_data_freep) struct expire_data *data = p;
static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) {
int r;
assert(data->dev_autofs_fd >= 0);
assert(data->ioctl_fd >= 0);
assert(dev_autofs_fd >= 0);
assert(ioctl_fd >= 0);
init_autofs_dev_ioctl(&param);
param.ioctlfd = data->ioctl_fd;
/* Issue AUTOFS_DEV_IOCTL_EXPIRE in subprocess, asynchronously. Note that we don't keep track of the
* child's PID, we are PID1/autoreaper after all, hence when it dies we'll automatically clean it up
* anyway. */
do {
r = ioctl(data->dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, &param);
} while (r >= 0);
r = safe_fork_full("(sd-expire)",
/* stdio_fds= */ NULL,
(int[]) { dev_autofs_fd, ioctl_fd },
/* n_except_fds= */ 2,
FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG,
/* pid= */ NULL);
if (r != 0)
return r;
/* Child */
for (;;) {
struct autofs_dev_ioctl param;
init_autofs_dev_ioctl(&param);
param.ioctlfd = ioctl_fd;
if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, &param) < 0)
break;
}
if (errno != EAGAIN)
log_warning_errno(errno, "Failed to expire automount, ignoring: %m");
return NULL;
_exit(EXIT_SUCCESS);
}
static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) {
_cleanup_close_ int ioctl_fd = -EBADF;
Automount *a = AUTOMOUNT(userdata);
_cleanup_(expire_data_freep) struct expire_data *data = NULL;
int r;
assert(a);
assert(source == a->expire_event_source);
data = new0(struct expire_data, 1);
if (!data)
return log_oom();
ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
if (ioctl_fd < 0)
return log_unit_error_errno(UNIT(a), ioctl_fd, "Couldn't open autofs ioctl fd: %m");
data->ioctl_fd = -EBADF;
data->dev_autofs_fd = fcntl(UNIT(a)->manager->dev_autofs_fd, F_DUPFD_CLOEXEC, 3);
if (data->dev_autofs_fd < 0)
return log_unit_error_errno(UNIT(a), errno, "Failed to duplicate autofs fd: %m");
data->ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
if (data->ioctl_fd < 0)
return log_unit_error_errno(UNIT(a), data->ioctl_fd, "Couldn't open autofs ioctl fd: %m");
r = asynchronous_job(expire_thread, data);
r = asynchronous_expire(UNIT(a)->manager->dev_autofs_fd, ioctl_fd);
if (r < 0)
return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m");
data = NULL;
return automount_start_expire(a);
}

View File

@@ -7318,28 +7318,17 @@ int exec_command_append(ExecCommand *c, const char *path, ...) {
return 0;
}
static void *rm_rf_thread(void *p) {
_cleanup_free_ char *path = p;
static char *destroy_tree(char *path) {
if (!path)
return NULL;
(void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
return NULL;
}
if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
log_debug("Spawning process to nuke '%s'", path);
static void asynchronous_rm_rf(char **path) {
int r;
(void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
}
assert(path);
if (!*path || streq(*path, RUN_SYSTEMD_EMPTY))
return;
log_debug("Spawning thread to nuke %s", *path);
r = asynchronous_job(rm_rf_thread, *path);
if (r < 0)
log_warning_errno(r, "Failed to nuke %s: %m", *path);
else
*path = NULL;
return mfree(path);
}
static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
@@ -7370,8 +7359,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
if (rt->n_ref > 0)
return NULL;
asynchronous_rm_rf(&rt->tmp_dir);
asynchronous_rm_rf(&rt->var_tmp_dir);
rt->tmp_dir = destroy_tree(rt->tmp_dir);
rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
return exec_shared_runtime_free(rt);
}
@@ -7862,9 +7851,8 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
exec_shared_runtime_unref(rt->shared);
dynamic_creds_unref(rt->dynamic_creds);
asynchronous_rm_rf(&rt->ephemeral_copy);
rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
free(rt->ephemeral_copy);
safe_close_pair(rt->ephemeral_storage_socket);
return mfree(rt);
}

View File

@@ -2294,10 +2294,9 @@ static int initialize_runtime(
}
}
if (arg_runtime_scope == RUNTIME_SCOPE_USER)
/* Become reaper of our children */
if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0)
log_warning_errno(errno, "Failed to make us a subreaper, ignoring: %m");
r = make_reaper_process(true);
if (r < 0)
log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m");
/* Bump up RLIMIT_NOFILE for systemd itself */
(void) bump_rlimit_nofile(saved_rlimit_nofile);

View File

@@ -5826,8 +5826,9 @@ static int run(int argc, char *argv[]) {
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
r = log_error_errno(errno, "Failed to become subreaper: %m");
r = make_reaper_process(true);
if (r < 0) {
log_error_errno(r, "Failed to become subreaper: %m");
goto finish;
}

138
src/shared/async.c Normal file
View File

@@ -0,0 +1,138 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <errno.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include "async.h"
#include "errno-util.h"
#include "fd-util.h"
#include "log.h"
#include "macro.h"
#include "process-util.h"
#include "signal-util.h"
int asynchronous_sync(pid_t *ret_pid) {
int r;
/* This forks off an invocation of fork() as a child process, in order to initiate synchronization to
* disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our
* original process ever, and a thread would do that as the process can't exit with threads hanging in blocking
* syscalls. */
r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|(ret_pid ? 0 : FORK_DETACH), ret_pid);
if (r < 0)
return r;
if (r == 0) {
/* Child process */
sync();
_exit(EXIT_SUCCESS);
}
return 0;
}
/* We encode the fd to close in the userdata pointer as an unsigned value. The highest bit indicates whether
* we need to fork again */
#define NEED_DOUBLE_FORK (1U << (sizeof(unsigned) * 8 - 1))
static int close_func(void *p) {
unsigned v = PTR_TO_UINT(p);
(void) prctl(PR_SET_NAME, (unsigned long*) "(close)");
/* Note: 💣 This function is invoked in a child process created via glibc's clone() wrapper. In such
* children memory allocation is not allowed, since glibc does not release malloc mutexes in
* clone() 💣 */
if (v & NEED_DOUBLE_FORK) {
pid_t pid;
v &= ~NEED_DOUBLE_FORK;
/* This inner child will be reparented to the subreaper/PID 1. Here we turn on SIGCHLD, so
* that the reaper knows when it's time to reap. */
pid = clone_with_nested_stack(close_func, SIGCHLD|CLONE_FILES, UINT_TO_PTR(v));
if (pid >= 0)
return 0;
}
close((int) v); /* no assert() here, we are in the child and the result would be eaten up anyway */
return 0;
}
int asynchronous_close(int fd) {
unsigned v;
pid_t pid;
int r;
/* This is supposed to behave similar to safe_close(), but actually invoke close() asynchronously, so
* that it will never block. Ideally the kernel would have an API for this, but it doesn't, so we
* work around it, and hide this as a far away as we can.
*
* It is important to us that we don't use threads (via glibc pthread) in PID 1, hence we'll do a
* minimal subprocess instead which shares our fd table via CLONE_FILES. */
if (fd < 0)
return -EBADF; /* already invalid */
PROTECT_ERRNO;
v = (unsigned) fd;
/* We want to fork off a process that is automatically reaped. For that we'd usually double-fork. But
* we can optimize this a bit: if we are PID 1 or a subreaper anyway (the systemd service manager
* process qualifies as this), we can avoid the double forking, since the double forked process would
* be reparented back to us anyway. */
r = is_reaper_process();
if (r < 0)
log_debug_errno(r, "Cannot determine if we are a reaper process, assuming we are not: %m");
if (r <= 0)
v |= NEED_DOUBLE_FORK;
pid = clone_with_nested_stack(close_func, CLONE_FILES | ((v & NEED_DOUBLE_FORK) ? 0 : SIGCHLD), UINT_TO_PTR(v));
if (pid < 0)
assert_se(close_nointr(fd) != -EBADF); /* local fallback */
else if (v & NEED_DOUBLE_FORK) {
/* Reap the intermediate child. Key here is that we specify __WCLONE, since we didn't ask for
* any signal to be sent to us on process exit, and otherwise waitid() would refuse waiting
* then.
*
* We usually prefer calling waitid(), but before kernel 4.7 it didn't support __WCLONE while
* waitpid() did. Hence let's use waitpid() here, it's good enough for our purposes here. */
for (;;) {
if (waitpid(pid, NULL, WEXITED|__WCLONE) >= 0 || errno != EINTR)
break;
}
}
return -EBADF; /* return an invalidated fd */
}
int asynchronous_rm_rf(const char *p, RemoveFlags flags) {
int r;
assert(p);
/* Forks off a child that destroys the specified path. This will be best effort only, i.e. the child
* will attempt to do its thing, but we won't wait for it or check its success. */
r = safe_fork("(sd-rmrf)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DETACH, NULL);
if (r != 0)
return r;
/* Child */
r = rm_rf(p, flags);
if (r < 0) {
log_debug_errno(r, "Failed to rm -rf '%s', ignoring: %m", p);
_exit(EXIT_FAILURE); /* This is a detached process, hence noone really cares, but who knows
* maybe it's good for debugging/tracing to return an exit code
* indicative of our failure here. */
}
_exit(EXIT_SUCCESS);
}

26
src/shared/async.h Normal file
View File

@@ -0,0 +1,26 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include <sys/types.h>
#include "macro.h"
#include "rm-rf.h"
/* These functions implement various potentially slow operations that are executed asynchronously. They are
* carefully written to not use pthreads, but use fork() or clone() (without CLONE_VM) so that the child does
* not share any memory with the parent process, and thus cannot possibly interfere with the malloc()
* synchronization locks.
*
* Background: glibc only synchronizes malloc() locks when doing fork(), but not when doing clone()
* (regardless if through glibc's own wrapper or ours). This means if another thread in the parent has the
* malloc() lock taken while a thread is cloning, the mutex will remain locked in the child (but the other
* thread won't exist there), with no chance to ever be unlocked again. This will result in deadlocks. Hence
* one has to make the choice: either never use threads in the parent, or never do memory allocation in the
* child, or never use clone()/clone3() and stick to fork() only. Because we need clone()/clone3() we opted
* for avoiding threads. */
int asynchronous_sync(pid_t *ret_pid);
int asynchronous_close(int fd);
int asynchronous_rm_rf(const char *p, RemoveFlags flags);
DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close);

View File

@@ -4,6 +4,7 @@ shared_sources = files(
'acpi-fpdt.c',
'apparmor-util.c',
'ask-password-api.c',
'async.c',
'barrier.c',
'base-filesystem.c',
'battery-util.c',

View File

@@ -1,37 +1,100 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <fcntl.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include "async.h"
#include "fs-util.h"
#include "tmpfile-util.h"
#include "path-util.h"
#include "process-util.h"
#include "signal-util.h"
#include "tests.h"
#include "tmpfile-util.h"
static bool test_async = false;
static void *async_func(void *arg) {
test_async = true;
return NULL;
TEST(test_asynchronous_sync) {
assert_se(asynchronous_sync(NULL) >= 0);
}
TEST(test_async) {
TEST(asynchronous_close) {
_cleanup_(unlink_tempfilep) char name[] = "/tmp/test-asynchronous_close.XXXXXX";
int fd;
int fd, r;
fd = mkostemp_safe(name);
assert_se(fd >= 0);
asynchronous_close(fd);
assert_se(asynchronous_job(async_func, NULL) >= 0);
assert_se(asynchronous_sync(NULL) >= 0);
sleep(1);
assert_se(fcntl(fd, F_GETFD) == -1);
assert_se(errno == EBADF);
assert_se(test_async);
r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
assert(r >= 0);
if (r == 0) {
/* child */
assert(make_reaper_process(true) >= 0);
fd = open("/dev/null", O_RDONLY|O_CLOEXEC);
assert_se(fd >= 0);
asynchronous_close(fd);
sleep(1);
assert_se(fcntl(fd, F_GETFD) == -1);
assert_se(errno == EBADF);
_exit(EXIT_SUCCESS);
}
}
TEST(asynchronous_rm_rf) {
_cleanup_free_ char *t = NULL, *k = NULL;
int r;
assert_se(mkdtemp_malloc(NULL, &t) >= 0);
assert_se(k = path_join(t, "somefile"));
assert_se(touch(k) >= 0);
assert_se(asynchronous_rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
/* Do this once more, form a subreaper. Which is nice, because we can watch the async child even
* though detached */
r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
assert_se(r >= 0);
if (r == 0) {
_cleanup_free_ char *tt = NULL, *kk = NULL;
/* child */
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0);
assert_se(make_reaper_process(true) >= 0);
assert_se(mkdtemp_malloc(NULL, &tt) >= 0);
assert_se(kk = path_join(tt, "somefile"));
assert_se(touch(kk) >= 0);
assert_se(asynchronous_rm_rf(tt, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
for (;;) {
siginfo_t si = {};
assert_se(waitid(P_ALL, 0, &si, WEXITED) >= 0);
if (access(tt, F_OK) < 0) {
assert_se(errno == ENOENT);
break;
}
/* wasn't the rm_rf() call. let's wait longer */
}
_exit(EXIT_SUCCESS);
}
}
DEFINE_TEST_MAIN(LOG_DEBUG);

View File

@@ -891,6 +891,53 @@ TEST(get_process_threads) {
}
}
TEST(is_reaper_process) {
int r;
r = safe_fork("(regular)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
assert_se(r >= 0);
if (r == 0) {
/* child */
assert_se(is_reaper_process() == 0);
_exit(EXIT_SUCCESS);
}
r = safe_fork("(newpid)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
assert_se(r >= 0);
if (r == 0) {
/* child */
if (unshare(CLONE_NEWPID) < 0) {
if (ERRNO_IS_PRIVILEGE(errno) || ERRNO_IS_NOT_SUPPORTED(errno)) {
log_notice("Skipping CLONE_NEWPID reaper check, lacking privileges/support");
_exit(EXIT_SUCCESS);
}
}
r = safe_fork("(newpid1)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
assert_se(r >= 0);
if (r == 0) {
/* grandchild, which is PID1 in a pidns */
assert_se(getpid_cached() == 1);
assert_se(is_reaper_process() > 0);
_exit(EXIT_SUCCESS);
}
_exit(EXIT_SUCCESS);
}
r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
assert_se(r >= 0);
if (r == 0) {
/* child */
assert_se(make_reaper_process(true) >= 0);
assert_se(is_reaper_process() > 0);
_exit(EXIT_SUCCESS);
}
}
static int intro(void) {
log_show_color(true);
return EXIT_SUCCESS;