mirror of
https://github.com/morgan9e/systemd
synced 2026-04-14 00:14:32 +09:00
Merge pull request #28123 from poettering/threads-no-more
get rid of any threads in PID 1
This commit is contained in:
@@ -750,3 +750,32 @@ SPDX-License-Identifier: LGPL-2.1-or-later
|
||||
- Reasonable use of non-ASCII Unicode UTF-8 characters in code comments is
|
||||
welcome. If your code comment contains an emoji or two this will certainly
|
||||
brighten the day of the occasional reviewer of your code. Really! 😊
|
||||
|
||||
## Threading
|
||||
|
||||
- We generally avoid using threads, to the level this is possible. In
|
||||
particular in the service manager/PID 1 threads are not OK to use. This is
|
||||
because you cannot mix memory allocation in threads with use of glibc's
|
||||
`clone()` call, or manual `clone()`/`clone3()` system call wrappers. Only
|
||||
glibc's own `fork()` call will properly synchronize the memory allocation
|
||||
locks around the process clone operation. This means that if a process is
|
||||
cloned via `clone()`/`clone3()` and another thread currently has the
|
||||
`malloc()` lock taken, it will be cloned in locked state to the child, and
|
||||
thus can never be acquired in the child, leading to deadlocks. Hence, when
|
||||
using `clone()`/`clone3()` there are only two ways out: never use threads in the
|
||||
parent, or never do memory allocation in the child. For our uses we need
|
||||
`clone()`/`clone3()` and hence decided to avoid threads. Of course, sometimes the
|
||||
concurrency threads allow is beneficial, however we suggest forking off
|
||||
worker *processes* rather than worker *threads* for this purpose, ideally
|
||||
even with an `execve()` to remove the CoW trap situation `fork()` easily
|
||||
triggers.
|
||||
|
||||
- A corollary of the above is: never use `clone()` where a `fork()` would do
|
||||
too. Also consider using `posix_spawn()` which combines `clone()` +
|
||||
`execve()` into one and has nice properties since it avoids becoming a CoW
|
||||
trap by using `CLONE_VORK` and `CLONE_VM` together.
|
||||
|
||||
- While we avoid forking off threads on our own, writing thread-safe code is a
|
||||
good idea where it might end up running inside of libsystemd.so or
|
||||
similar. Hence, use TLS (i.e. `thread_local`) where appropriate, and maybe
|
||||
the occasional `pthread_once()`.
|
||||
|
||||
@@ -1,107 +0,0 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
||||
|
||||
#include <errno.h>
|
||||
#include <pthread.h>
|
||||
#include <stddef.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "async.h"
|
||||
#include "errno-util.h"
|
||||
#include "fd-util.h"
|
||||
#include "log.h"
|
||||
#include "macro.h"
|
||||
#include "process-util.h"
|
||||
#include "signal-util.h"
|
||||
|
||||
int asynchronous_job(void* (*func)(void *p), void *arg) {
|
||||
sigset_t ss, saved_ss;
|
||||
pthread_attr_t a;
|
||||
pthread_t t;
|
||||
int r, k;
|
||||
|
||||
/* It kinda sucks that we have to resort to threads to implement an asynchronous close(), but well, such is
|
||||
* life. */
|
||||
|
||||
r = pthread_attr_init(&a);
|
||||
if (r > 0)
|
||||
return -r;
|
||||
|
||||
r = pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
|
||||
if (r > 0) {
|
||||
r = -r;
|
||||
goto finish;
|
||||
}
|
||||
|
||||
assert_se(sigfillset(&ss) >= 0);
|
||||
|
||||
/* Block all signals before forking off the thread, so that the new thread is started with all signals
|
||||
* blocked. This way the existence of the new thread won't affect signal handling in other threads. */
|
||||
|
||||
r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
|
||||
if (r > 0) {
|
||||
r = -r;
|
||||
goto finish;
|
||||
}
|
||||
|
||||
r = pthread_create(&t, &a, func, arg);
|
||||
|
||||
k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
|
||||
|
||||
if (r > 0)
|
||||
r = -r;
|
||||
else if (k > 0)
|
||||
r = -k;
|
||||
else
|
||||
r = 0;
|
||||
|
||||
finish:
|
||||
pthread_attr_destroy(&a);
|
||||
return r;
|
||||
}
|
||||
|
||||
int asynchronous_sync(pid_t *ret_pid) {
|
||||
int r;
|
||||
|
||||
/* This forks off an invocation of fork() as a child process, in order to initiate synchronization to
|
||||
* disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our
|
||||
* original process ever, and a thread would do that as the process can't exit with threads hanging in blocking
|
||||
* syscalls. */
|
||||
|
||||
r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS, ret_pid);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0) {
|
||||
/* Child process */
|
||||
sync();
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *close_thread(void *p) {
|
||||
(void) pthread_setname_np(pthread_self(), "close");
|
||||
|
||||
assert_se(close_nointr(PTR_TO_FD(p)) != -EBADF);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int asynchronous_close(int fd) {
|
||||
int r;
|
||||
|
||||
/* This is supposed to behave similar to safe_close(), but
|
||||
* actually invoke close() asynchronously, so that it will
|
||||
* never block. Ideally the kernel would have an API for this,
|
||||
* but it doesn't, so we work around it, and hide this as a
|
||||
* far away as we can. */
|
||||
|
||||
if (fd >= 0) {
|
||||
PROTECT_ERRNO;
|
||||
|
||||
r = asynchronous_job(close_thread, FD_TO_PTR(fd));
|
||||
if (r < 0)
|
||||
assert_se(close_nointr(fd) != -EBADF);
|
||||
}
|
||||
|
||||
return -EBADF;
|
||||
}
|
||||
@@ -1,13 +0,0 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
||||
#pragma once
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "macro.h"
|
||||
|
||||
int asynchronous_job(void* (*func)(void *p), void *arg);
|
||||
|
||||
int asynchronous_sync(pid_t *ret_pid);
|
||||
int asynchronous_close(int fd);
|
||||
|
||||
DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close);
|
||||
@@ -7,7 +7,6 @@ basic_sources = files(
|
||||
'architecture.c',
|
||||
'argv-util.c',
|
||||
'arphrd-util.c',
|
||||
'async.c',
|
||||
'audit-util.c',
|
||||
'build.c',
|
||||
'bus-label.c',
|
||||
|
||||
@@ -1149,6 +1149,41 @@ static void restore_sigsetp(sigset_t **ssp) {
|
||||
(void) sigprocmask(SIG_SETMASK, *ssp, NULL);
|
||||
}
|
||||
|
||||
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) {
|
||||
size_t ps;
|
||||
pid_t pid;
|
||||
void *mystack;
|
||||
|
||||
/* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
|
||||
* invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
|
||||
*
|
||||
* Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
|
||||
* is threaded these locks will be in an undefined state in the child, and hence memory allocations
|
||||
* are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
|
||||
* strictly single-threaded or your child never calls malloc(). */
|
||||
|
||||
assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
|
||||
CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0);
|
||||
|
||||
/* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
|
||||
* the net effect is that the child will have the start of its stack inside the stack of the parent,
|
||||
* but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
|
||||
* since we don't want to deal with differences between systems where the stack grows backwards or
|
||||
* forwards we'll allocate one more and place the stack address in the middle. Except that we also
|
||||
* want it page aligned, hence we'll allocate one page more. Makes 3. */
|
||||
|
||||
ps = page_size();
|
||||
mystack = alloca(ps*3);
|
||||
mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */
|
||||
mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */
|
||||
|
||||
pid = clone(fn, mystack, flags, userdata);
|
||||
if (pid < 0)
|
||||
return -errno;
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
int safe_fork_full(
|
||||
const char *name,
|
||||
const int stdio_fds[3],
|
||||
@@ -1160,9 +1195,12 @@ int safe_fork_full(
|
||||
pid_t original_pid, pid;
|
||||
sigset_t saved_ss, ss;
|
||||
_unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL;
|
||||
bool block_signals = false, block_all = false;
|
||||
bool block_signals = false, block_all = false, intermediary = false;
|
||||
int prio, r;
|
||||
|
||||
assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid);
|
||||
assert(!FLAGS_SET(flags, FORK_DETACH|FORK_WAIT));
|
||||
|
||||
/* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always
|
||||
* returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */
|
||||
|
||||
@@ -1196,6 +1234,31 @@ int safe_fork_full(
|
||||
saved_ssp = &saved_ss;
|
||||
}
|
||||
|
||||
if (FLAGS_SET(flags, FORK_DETACH)) {
|
||||
assert(!FLAGS_SET(flags, FORK_WAIT));
|
||||
assert(!ret_pid);
|
||||
|
||||
/* Fork off intermediary child if needed */
|
||||
|
||||
r = is_reaper_process();
|
||||
if (r < 0)
|
||||
return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m");
|
||||
|
||||
if (!r) {
|
||||
/* Not a reaper process, hence do a double fork() so we are reparented to one */
|
||||
|
||||
pid = fork();
|
||||
if (pid < 0)
|
||||
return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
|
||||
if (pid > 0) {
|
||||
log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid);
|
||||
return 1; /* return in the parent */
|
||||
}
|
||||
|
||||
intermediary = true;
|
||||
}
|
||||
}
|
||||
|
||||
if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS)) != 0)
|
||||
pid = raw_clone(SIGCHLD|
|
||||
(FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
|
||||
@@ -1205,8 +1268,12 @@ int safe_fork_full(
|
||||
if (pid < 0)
|
||||
return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
|
||||
if (pid > 0) {
|
||||
/* We are in the parent process */
|
||||
|
||||
/* If we are in the intermediary process, exit now */
|
||||
if (intermediary)
|
||||
_exit(EXIT_SUCCESS);
|
||||
|
||||
/* We are in the parent process */
|
||||
log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);
|
||||
|
||||
if (flags & FORK_WAIT) {
|
||||
@@ -1623,6 +1690,39 @@ int get_process_threads(pid_t pid) {
|
||||
return n;
|
||||
}
|
||||
|
||||
int is_reaper_process(void) {
|
||||
int b = 0;
|
||||
|
||||
/* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
|
||||
* reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */
|
||||
|
||||
if (getpid_cached() == 1)
|
||||
return true;
|
||||
|
||||
if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0)
|
||||
return -errno;
|
||||
|
||||
return b != 0;
|
||||
}
|
||||
|
||||
int make_reaper_process(bool b) {
|
||||
|
||||
if (getpid_cached() == 1) {
|
||||
|
||||
if (!b)
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
|
||||
* to avoid any ambiguities */
|
||||
if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
|
||||
return -errno;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const char *const sigchld_code_table[] = {
|
||||
[CLD_EXITED] = "exited",
|
||||
[CLD_KILLED] = "killed",
|
||||
|
||||
@@ -139,6 +139,13 @@ void reset_cached_pid(void);
|
||||
|
||||
int must_be_root(void);
|
||||
|
||||
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata);
|
||||
|
||||
/* 💣 Note that FORK_NEW_USERNS + FORK_NEW_MOUNTNS should not be called in threaded programs, because they
|
||||
* cause us to use raw_clone() which does not synchronize the glibc malloc() locks, and thus will cause
|
||||
* deadlocks if the parent uses threads and the child does memory allocations. Hence: if the parent is
|
||||
* threaded these flags may not be used. These flags cannot be used if the parent uses threads or the child
|
||||
* uses malloc(). 💣 */
|
||||
typedef enum ForkFlags {
|
||||
FORK_RESET_SIGNALS = 1 << 0, /* Reset all signal handlers and signal mask */
|
||||
FORK_CLOSE_ALL_FDS = 1 << 1, /* Close all open file descriptors in the child, except for 0,1,2 */
|
||||
@@ -148,15 +155,16 @@ typedef enum ForkFlags {
|
||||
FORK_REOPEN_LOG = 1 << 5, /* Reopen log connection */
|
||||
FORK_LOG = 1 << 6, /* Log above LOG_DEBUG log level about failures */
|
||||
FORK_WAIT = 1 << 7, /* Wait until child exited */
|
||||
FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace */
|
||||
FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */
|
||||
FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
|
||||
FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
|
||||
FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */
|
||||
FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */
|
||||
FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace */
|
||||
FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
FORK_CLOEXEC_OFF = 1 << 15, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
|
||||
FORK_KEEP_NOTIFY_SOCKET = 1 << 16, /* Unless this specified, $NOTIFY_SOCKET will be unset. */
|
||||
FORK_DETACH = 1 << 17, /* Double fork if needed to ensure PID1/subreaper is parent */
|
||||
} ForkFlags;
|
||||
|
||||
int safe_fork_full(
|
||||
@@ -199,3 +207,6 @@ int setpriority_closest(int priority);
|
||||
_noreturn_ void freeze(void);
|
||||
|
||||
int get_process_threads(pid_t pid);
|
||||
|
||||
int is_reaper_process(void);
|
||||
int make_reaper_process(bool b);
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <linux/random.h>
|
||||
#include <pthread.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
@@ -27,6 +27,11 @@
|
||||
* Additionally, as this function does not pass the ptid, newtls and ctid parameters to the kernel, flags must not
|
||||
* contain CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID or CLONE_SETTLS.
|
||||
*
|
||||
* WARNING: 💣 this call (just like glibc's own clone() wrapper) will not synchronize on glibc's malloc
|
||||
* locks, which means they will be in an undefined state in the child if the parent is
|
||||
* threaded. This means: the parent must either never use threads, or the child cannot use memory
|
||||
* allocation itself. This is a major pitfall, hence be careful! 💣
|
||||
*
|
||||
* Returns: 0 in the child process and the child process id in the parent.
|
||||
*/
|
||||
static inline pid_t raw_clone(unsigned long flags) {
|
||||
|
||||
@@ -44,22 +44,6 @@ static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = {
|
||||
[AUTOMOUNT_FAILED] = UNIT_FAILED
|
||||
};
|
||||
|
||||
struct expire_data {
|
||||
int dev_autofs_fd;
|
||||
int ioctl_fd;
|
||||
};
|
||||
|
||||
static struct expire_data* expire_data_free(struct expire_data *data) {
|
||||
if (!data)
|
||||
return NULL;
|
||||
|
||||
safe_close(data->dev_autofs_fd);
|
||||
safe_close(data->ioctl_fd);
|
||||
return mfree(data);
|
||||
}
|
||||
|
||||
DEFINE_TRIVIAL_CLEANUP_FUNC(struct expire_data*, expire_data_free);
|
||||
|
||||
static int open_dev_autofs(Manager *m);
|
||||
static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata);
|
||||
static int automount_start_expire(Automount *a);
|
||||
@@ -674,55 +658,57 @@ fail:
|
||||
automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
|
||||
}
|
||||
|
||||
static void *expire_thread(void *p) {
|
||||
struct autofs_dev_ioctl param;
|
||||
_cleanup_(expire_data_freep) struct expire_data *data = p;
|
||||
static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) {
|
||||
int r;
|
||||
|
||||
assert(data->dev_autofs_fd >= 0);
|
||||
assert(data->ioctl_fd >= 0);
|
||||
assert(dev_autofs_fd >= 0);
|
||||
assert(ioctl_fd >= 0);
|
||||
|
||||
init_autofs_dev_ioctl(¶m);
|
||||
param.ioctlfd = data->ioctl_fd;
|
||||
/* Issue AUTOFS_DEV_IOCTL_EXPIRE in subprocess, asynchronously. Note that we don't keep track of the
|
||||
* child's PID, we are PID1/autoreaper after all, hence when it dies we'll automatically clean it up
|
||||
* anyway. */
|
||||
|
||||
do {
|
||||
r = ioctl(data->dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, ¶m);
|
||||
} while (r >= 0);
|
||||
r = safe_fork_full("(sd-expire)",
|
||||
/* stdio_fds= */ NULL,
|
||||
(int[]) { dev_autofs_fd, ioctl_fd },
|
||||
/* n_except_fds= */ 2,
|
||||
FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG,
|
||||
/* pid= */ NULL);
|
||||
if (r != 0)
|
||||
return r;
|
||||
|
||||
/* Child */
|
||||
for (;;) {
|
||||
struct autofs_dev_ioctl param;
|
||||
init_autofs_dev_ioctl(¶m);
|
||||
param.ioctlfd = ioctl_fd;
|
||||
|
||||
if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, ¶m) < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
if (errno != EAGAIN)
|
||||
log_warning_errno(errno, "Failed to expire automount, ignoring: %m");
|
||||
|
||||
return NULL;
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) {
|
||||
_cleanup_close_ int ioctl_fd = -EBADF;
|
||||
Automount *a = AUTOMOUNT(userdata);
|
||||
_cleanup_(expire_data_freep) struct expire_data *data = NULL;
|
||||
int r;
|
||||
|
||||
assert(a);
|
||||
assert(source == a->expire_event_source);
|
||||
|
||||
data = new0(struct expire_data, 1);
|
||||
if (!data)
|
||||
return log_oom();
|
||||
ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
|
||||
if (ioctl_fd < 0)
|
||||
return log_unit_error_errno(UNIT(a), ioctl_fd, "Couldn't open autofs ioctl fd: %m");
|
||||
|
||||
data->ioctl_fd = -EBADF;
|
||||
|
||||
data->dev_autofs_fd = fcntl(UNIT(a)->manager->dev_autofs_fd, F_DUPFD_CLOEXEC, 3);
|
||||
if (data->dev_autofs_fd < 0)
|
||||
return log_unit_error_errno(UNIT(a), errno, "Failed to duplicate autofs fd: %m");
|
||||
|
||||
data->ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
|
||||
if (data->ioctl_fd < 0)
|
||||
return log_unit_error_errno(UNIT(a), data->ioctl_fd, "Couldn't open autofs ioctl fd: %m");
|
||||
|
||||
r = asynchronous_job(expire_thread, data);
|
||||
r = asynchronous_expire(UNIT(a)->manager->dev_autofs_fd, ioctl_fd);
|
||||
if (r < 0)
|
||||
return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m");
|
||||
|
||||
data = NULL;
|
||||
|
||||
return automount_start_expire(a);
|
||||
}
|
||||
|
||||
|
||||
@@ -7318,28 +7318,17 @@ int exec_command_append(ExecCommand *c, const char *path, ...) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *rm_rf_thread(void *p) {
|
||||
_cleanup_free_ char *path = p;
|
||||
static char *destroy_tree(char *path) {
|
||||
if (!path)
|
||||
return NULL;
|
||||
|
||||
(void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
|
||||
return NULL;
|
||||
}
|
||||
if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
|
||||
log_debug("Spawning process to nuke '%s'", path);
|
||||
|
||||
static void asynchronous_rm_rf(char **path) {
|
||||
int r;
|
||||
(void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
|
||||
}
|
||||
|
||||
assert(path);
|
||||
|
||||
if (!*path || streq(*path, RUN_SYSTEMD_EMPTY))
|
||||
return;
|
||||
|
||||
log_debug("Spawning thread to nuke %s", *path);
|
||||
|
||||
r = asynchronous_job(rm_rf_thread, *path);
|
||||
if (r < 0)
|
||||
log_warning_errno(r, "Failed to nuke %s: %m", *path);
|
||||
else
|
||||
*path = NULL;
|
||||
return mfree(path);
|
||||
}
|
||||
|
||||
static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
|
||||
@@ -7370,8 +7359,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
|
||||
if (rt->n_ref > 0)
|
||||
return NULL;
|
||||
|
||||
asynchronous_rm_rf(&rt->tmp_dir);
|
||||
asynchronous_rm_rf(&rt->var_tmp_dir);
|
||||
rt->tmp_dir = destroy_tree(rt->tmp_dir);
|
||||
rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
|
||||
|
||||
return exec_shared_runtime_free(rt);
|
||||
}
|
||||
@@ -7862,9 +7851,8 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
|
||||
exec_shared_runtime_unref(rt->shared);
|
||||
dynamic_creds_unref(rt->dynamic_creds);
|
||||
|
||||
asynchronous_rm_rf(&rt->ephemeral_copy);
|
||||
rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
|
||||
|
||||
free(rt->ephemeral_copy);
|
||||
safe_close_pair(rt->ephemeral_storage_socket);
|
||||
return mfree(rt);
|
||||
}
|
||||
|
||||
@@ -2294,10 +2294,9 @@ static int initialize_runtime(
|
||||
}
|
||||
}
|
||||
|
||||
if (arg_runtime_scope == RUNTIME_SCOPE_USER)
|
||||
/* Become reaper of our children */
|
||||
if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0)
|
||||
log_warning_errno(errno, "Failed to make us a subreaper, ignoring: %m");
|
||||
r = make_reaper_process(true);
|
||||
if (r < 0)
|
||||
log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m");
|
||||
|
||||
/* Bump up RLIMIT_NOFILE for systemd itself */
|
||||
(void) bump_rlimit_nofile(saved_rlimit_nofile);
|
||||
|
||||
@@ -5826,8 +5826,9 @@ static int run(int argc, char *argv[]) {
|
||||
|
||||
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
|
||||
|
||||
if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
|
||||
r = log_error_errno(errno, "Failed to become subreaper: %m");
|
||||
r = make_reaper_process(true);
|
||||
if (r < 0) {
|
||||
log_error_errno(r, "Failed to become subreaper: %m");
|
||||
goto finish;
|
||||
}
|
||||
|
||||
|
||||
138
src/shared/async.c
Normal file
138
src/shared/async.c
Normal file
@@ -0,0 +1,138 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
||||
|
||||
#include <errno.h>
|
||||
#include <stddef.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "async.h"
|
||||
#include "errno-util.h"
|
||||
#include "fd-util.h"
|
||||
#include "log.h"
|
||||
#include "macro.h"
|
||||
#include "process-util.h"
|
||||
#include "signal-util.h"
|
||||
|
||||
int asynchronous_sync(pid_t *ret_pid) {
|
||||
int r;
|
||||
|
||||
/* This forks off an invocation of fork() as a child process, in order to initiate synchronization to
|
||||
* disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our
|
||||
* original process ever, and a thread would do that as the process can't exit with threads hanging in blocking
|
||||
* syscalls. */
|
||||
|
||||
r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|(ret_pid ? 0 : FORK_DETACH), ret_pid);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0) {
|
||||
/* Child process */
|
||||
sync();
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* We encode the fd to close in the userdata pointer as an unsigned value. The highest bit indicates whether
|
||||
* we need to fork again */
|
||||
#define NEED_DOUBLE_FORK (1U << (sizeof(unsigned) * 8 - 1))
|
||||
|
||||
static int close_func(void *p) {
|
||||
unsigned v = PTR_TO_UINT(p);
|
||||
|
||||
(void) prctl(PR_SET_NAME, (unsigned long*) "(close)");
|
||||
|
||||
/* Note: 💣 This function is invoked in a child process created via glibc's clone() wrapper. In such
|
||||
* children memory allocation is not allowed, since glibc does not release malloc mutexes in
|
||||
* clone() 💣 */
|
||||
|
||||
if (v & NEED_DOUBLE_FORK) {
|
||||
pid_t pid;
|
||||
|
||||
v &= ~NEED_DOUBLE_FORK;
|
||||
|
||||
/* This inner child will be reparented to the subreaper/PID 1. Here we turn on SIGCHLD, so
|
||||
* that the reaper knows when it's time to reap. */
|
||||
pid = clone_with_nested_stack(close_func, SIGCHLD|CLONE_FILES, UINT_TO_PTR(v));
|
||||
if (pid >= 0)
|
||||
return 0;
|
||||
}
|
||||
|
||||
close((int) v); /* no assert() here, we are in the child and the result would be eaten up anyway */
|
||||
return 0;
|
||||
}
|
||||
|
||||
int asynchronous_close(int fd) {
|
||||
unsigned v;
|
||||
pid_t pid;
|
||||
int r;
|
||||
|
||||
/* This is supposed to behave similar to safe_close(), but actually invoke close() asynchronously, so
|
||||
* that it will never block. Ideally the kernel would have an API for this, but it doesn't, so we
|
||||
* work around it, and hide this as a far away as we can.
|
||||
*
|
||||
* It is important to us that we don't use threads (via glibc pthread) in PID 1, hence we'll do a
|
||||
* minimal subprocess instead which shares our fd table via CLONE_FILES. */
|
||||
|
||||
if (fd < 0)
|
||||
return -EBADF; /* already invalid */
|
||||
|
||||
PROTECT_ERRNO;
|
||||
|
||||
v = (unsigned) fd;
|
||||
|
||||
/* We want to fork off a process that is automatically reaped. For that we'd usually double-fork. But
|
||||
* we can optimize this a bit: if we are PID 1 or a subreaper anyway (the systemd service manager
|
||||
* process qualifies as this), we can avoid the double forking, since the double forked process would
|
||||
* be reparented back to us anyway. */
|
||||
r = is_reaper_process();
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Cannot determine if we are a reaper process, assuming we are not: %m");
|
||||
if (r <= 0)
|
||||
v |= NEED_DOUBLE_FORK;
|
||||
|
||||
pid = clone_with_nested_stack(close_func, CLONE_FILES | ((v & NEED_DOUBLE_FORK) ? 0 : SIGCHLD), UINT_TO_PTR(v));
|
||||
if (pid < 0)
|
||||
assert_se(close_nointr(fd) != -EBADF); /* local fallback */
|
||||
else if (v & NEED_DOUBLE_FORK) {
|
||||
|
||||
/* Reap the intermediate child. Key here is that we specify __WCLONE, since we didn't ask for
|
||||
* any signal to be sent to us on process exit, and otherwise waitid() would refuse waiting
|
||||
* then.
|
||||
*
|
||||
* We usually prefer calling waitid(), but before kernel 4.7 it didn't support __WCLONE while
|
||||
* waitpid() did. Hence let's use waitpid() here, it's good enough for our purposes here. */
|
||||
for (;;) {
|
||||
if (waitpid(pid, NULL, WEXITED|__WCLONE) >= 0 || errno != EINTR)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return -EBADF; /* return an invalidated fd */
|
||||
}
|
||||
|
||||
int asynchronous_rm_rf(const char *p, RemoveFlags flags) {
|
||||
int r;
|
||||
|
||||
assert(p);
|
||||
|
||||
/* Forks off a child that destroys the specified path. This will be best effort only, i.e. the child
|
||||
* will attempt to do its thing, but we won't wait for it or check its success. */
|
||||
|
||||
r = safe_fork("(sd-rmrf)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DETACH, NULL);
|
||||
if (r != 0)
|
||||
return r;
|
||||
|
||||
/* Child */
|
||||
|
||||
r = rm_rf(p, flags);
|
||||
if (r < 0) {
|
||||
log_debug_errno(r, "Failed to rm -rf '%s', ignoring: %m", p);
|
||||
_exit(EXIT_FAILURE); /* This is a detached process, hence noone really cares, but who knows
|
||||
* maybe it's good for debugging/tracing to return an exit code
|
||||
* indicative of our failure here. */
|
||||
}
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
26
src/shared/async.h
Normal file
26
src/shared/async.h
Normal file
@@ -0,0 +1,26 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
||||
#pragma once
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "macro.h"
|
||||
#include "rm-rf.h"
|
||||
|
||||
/* These functions implement various potentially slow operations that are executed asynchronously. They are
|
||||
* carefully written to not use pthreads, but use fork() or clone() (without CLONE_VM) so that the child does
|
||||
* not share any memory with the parent process, and thus cannot possibly interfere with the malloc()
|
||||
* synchronization locks.
|
||||
*
|
||||
* Background: glibc only synchronizes malloc() locks when doing fork(), but not when doing clone()
|
||||
* (regardless if through glibc's own wrapper or ours). This means if another thread in the parent has the
|
||||
* malloc() lock taken while a thread is cloning, the mutex will remain locked in the child (but the other
|
||||
* thread won't exist there), with no chance to ever be unlocked again. This will result in deadlocks. Hence
|
||||
* one has to make the choice: either never use threads in the parent, or never do memory allocation in the
|
||||
* child, or never use clone()/clone3() and stick to fork() only. Because we need clone()/clone3() we opted
|
||||
* for avoiding threads. */
|
||||
|
||||
int asynchronous_sync(pid_t *ret_pid);
|
||||
int asynchronous_close(int fd);
|
||||
int asynchronous_rm_rf(const char *p, RemoveFlags flags);
|
||||
|
||||
DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close);
|
||||
@@ -4,6 +4,7 @@ shared_sources = files(
|
||||
'acpi-fpdt.c',
|
||||
'apparmor-util.c',
|
||||
'ask-password-api.c',
|
||||
'async.c',
|
||||
'barrier.c',
|
||||
'base-filesystem.c',
|
||||
'battery-util.c',
|
||||
|
||||
@@ -1,37 +1,100 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "async.h"
|
||||
#include "fs-util.h"
|
||||
#include "tmpfile-util.h"
|
||||
#include "path-util.h"
|
||||
#include "process-util.h"
|
||||
#include "signal-util.h"
|
||||
#include "tests.h"
|
||||
#include "tmpfile-util.h"
|
||||
|
||||
static bool test_async = false;
|
||||
|
||||
static void *async_func(void *arg) {
|
||||
test_async = true;
|
||||
|
||||
return NULL;
|
||||
TEST(test_asynchronous_sync) {
|
||||
assert_se(asynchronous_sync(NULL) >= 0);
|
||||
}
|
||||
|
||||
TEST(test_async) {
|
||||
TEST(asynchronous_close) {
|
||||
_cleanup_(unlink_tempfilep) char name[] = "/tmp/test-asynchronous_close.XXXXXX";
|
||||
int fd;
|
||||
int fd, r;
|
||||
|
||||
fd = mkostemp_safe(name);
|
||||
assert_se(fd >= 0);
|
||||
asynchronous_close(fd);
|
||||
|
||||
assert_se(asynchronous_job(async_func, NULL) >= 0);
|
||||
assert_se(asynchronous_sync(NULL) >= 0);
|
||||
|
||||
sleep(1);
|
||||
|
||||
assert_se(fcntl(fd, F_GETFD) == -1);
|
||||
assert_se(errno == EBADF);
|
||||
assert_se(test_async);
|
||||
|
||||
r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
|
||||
assert(r >= 0);
|
||||
|
||||
if (r == 0) {
|
||||
/* child */
|
||||
|
||||
assert(make_reaper_process(true) >= 0);
|
||||
|
||||
fd = open("/dev/null", O_RDONLY|O_CLOEXEC);
|
||||
assert_se(fd >= 0);
|
||||
asynchronous_close(fd);
|
||||
|
||||
sleep(1);
|
||||
|
||||
assert_se(fcntl(fd, F_GETFD) == -1);
|
||||
assert_se(errno == EBADF);
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(asynchronous_rm_rf) {
|
||||
_cleanup_free_ char *t = NULL, *k = NULL;
|
||||
int r;
|
||||
|
||||
assert_se(mkdtemp_malloc(NULL, &t) >= 0);
|
||||
assert_se(k = path_join(t, "somefile"));
|
||||
assert_se(touch(k) >= 0);
|
||||
assert_se(asynchronous_rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
|
||||
|
||||
/* Do this once more, form a subreaper. Which is nice, because we can watch the async child even
|
||||
* though detached */
|
||||
|
||||
r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
|
||||
assert_se(r >= 0);
|
||||
|
||||
if (r == 0) {
|
||||
_cleanup_free_ char *tt = NULL, *kk = NULL;
|
||||
|
||||
/* child */
|
||||
|
||||
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0);
|
||||
assert_se(make_reaper_process(true) >= 0);
|
||||
|
||||
assert_se(mkdtemp_malloc(NULL, &tt) >= 0);
|
||||
assert_se(kk = path_join(tt, "somefile"));
|
||||
assert_se(touch(kk) >= 0);
|
||||
assert_se(asynchronous_rm_rf(tt, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
|
||||
|
||||
for (;;) {
|
||||
siginfo_t si = {};
|
||||
|
||||
assert_se(waitid(P_ALL, 0, &si, WEXITED) >= 0);
|
||||
|
||||
if (access(tt, F_OK) < 0) {
|
||||
assert_se(errno == ENOENT);
|
||||
break;
|
||||
}
|
||||
|
||||
/* wasn't the rm_rf() call. let's wait longer */
|
||||
}
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
DEFINE_TEST_MAIN(LOG_DEBUG);
|
||||
|
||||
@@ -891,6 +891,53 @@ TEST(get_process_threads) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(is_reaper_process) {
|
||||
int r;
|
||||
|
||||
r = safe_fork("(regular)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
|
||||
assert_se(r >= 0);
|
||||
if (r == 0) {
|
||||
/* child */
|
||||
|
||||
assert_se(is_reaper_process() == 0);
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
r = safe_fork("(newpid)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
|
||||
assert_se(r >= 0);
|
||||
if (r == 0) {
|
||||
/* child */
|
||||
|
||||
if (unshare(CLONE_NEWPID) < 0) {
|
||||
if (ERRNO_IS_PRIVILEGE(errno) || ERRNO_IS_NOT_SUPPORTED(errno)) {
|
||||
log_notice("Skipping CLONE_NEWPID reaper check, lacking privileges/support");
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
r = safe_fork("(newpid1)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
|
||||
assert_se(r >= 0);
|
||||
if (r == 0) {
|
||||
/* grandchild, which is PID1 in a pidns */
|
||||
assert_se(getpid_cached() == 1);
|
||||
assert_se(is_reaper_process() > 0);
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
|
||||
assert_se(r >= 0);
|
||||
if (r == 0) {
|
||||
/* child */
|
||||
assert_se(make_reaper_process(true) >= 0);
|
||||
|
||||
assert_se(is_reaper_process() > 0);
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
static int intro(void) {
|
||||
log_show_color(true);
|
||||
return EXIT_SUCCESS;
|
||||
|
||||
Reference in New Issue
Block a user