diff --git a/docs/CODING_STYLE.md b/docs/CODING_STYLE.md index f76525205f..f3eefeaae6 100644 --- a/docs/CODING_STYLE.md +++ b/docs/CODING_STYLE.md @@ -750,3 +750,32 @@ SPDX-License-Identifier: LGPL-2.1-or-later - Reasonable use of non-ASCII Unicode UTF-8 characters in code comments is welcome. If your code comment contains an emoji or two this will certainly brighten the day of the occasional reviewer of your code. Really! 😊 + +## Threading + +- We generally avoid using threads, to the level this is possible. In + particular in the service manager/PID 1 threads are not OK to use. This is + because you cannot mix memory allocation in threads with use of glibc's + `clone()` call, or manual `clone()`/`clone3()` system call wrappers. Only + glibc's own `fork()` call will properly synchronize the memory allocation + locks around the process clone operation. This means that if a process is + cloned via `clone()`/`clone3()` and another thread currently has the + `malloc()` lock taken, it will be cloned in locked state to the child, and + thus can never be acquired in the child, leading to deadlocks. Hence, when + using `clone()`/`clone3()` there are only two ways out: never use threads in the + parent, or never do memory allocation in the child. For our uses we need + `clone()`/`clone3()` and hence decided to avoid threads. Of course, sometimes the + concurrency threads allow is beneficial, however we suggest forking off + worker *processes* rather than worker *threads* for this purpose, ideally + even with an `execve()` to remove the CoW trap situation `fork()` easily + triggers. + +- A corollary of the above is: never use `clone()` where a `fork()` would do + too. Also consider using `posix_spawn()` which combines `clone()` + + `execve()` into one and has nice properties since it avoids becoming a CoW + trap by using `CLONE_VORK` and `CLONE_VM` together. + +- While we avoid forking off threads on our own, writing thread-safe code is a + good idea where it might end up running inside of libsystemd.so or + similar. Hence, use TLS (i.e. `thread_local`) where appropriate, and maybe + the occasional `pthread_once()`. diff --git a/src/basic/async.c b/src/basic/async.c deleted file mode 100644 index 5e347dde8b..0000000000 --- a/src/basic/async.c +++ /dev/null @@ -1,107 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1-or-later */ - -#include -#include -#include -#include - -#include "async.h" -#include "errno-util.h" -#include "fd-util.h" -#include "log.h" -#include "macro.h" -#include "process-util.h" -#include "signal-util.h" - -int asynchronous_job(void* (*func)(void *p), void *arg) { - sigset_t ss, saved_ss; - pthread_attr_t a; - pthread_t t; - int r, k; - - /* It kinda sucks that we have to resort to threads to implement an asynchronous close(), but well, such is - * life. */ - - r = pthread_attr_init(&a); - if (r > 0) - return -r; - - r = pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED); - if (r > 0) { - r = -r; - goto finish; - } - - assert_se(sigfillset(&ss) >= 0); - - /* Block all signals before forking off the thread, so that the new thread is started with all signals - * blocked. This way the existence of the new thread won't affect signal handling in other threads. */ - - r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss); - if (r > 0) { - r = -r; - goto finish; - } - - r = pthread_create(&t, &a, func, arg); - - k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL); - - if (r > 0) - r = -r; - else if (k > 0) - r = -k; - else - r = 0; - -finish: - pthread_attr_destroy(&a); - return r; -} - -int asynchronous_sync(pid_t *ret_pid) { - int r; - - /* This forks off an invocation of fork() as a child process, in order to initiate synchronization to - * disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our - * original process ever, and a thread would do that as the process can't exit with threads hanging in blocking - * syscalls. */ - - r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS, ret_pid); - if (r < 0) - return r; - if (r == 0) { - /* Child process */ - sync(); - _exit(EXIT_SUCCESS); - } - - return 0; -} - -static void *close_thread(void *p) { - (void) pthread_setname_np(pthread_self(), "close"); - - assert_se(close_nointr(PTR_TO_FD(p)) != -EBADF); - return NULL; -} - -int asynchronous_close(int fd) { - int r; - - /* This is supposed to behave similar to safe_close(), but - * actually invoke close() asynchronously, so that it will - * never block. Ideally the kernel would have an API for this, - * but it doesn't, so we work around it, and hide this as a - * far away as we can. */ - - if (fd >= 0) { - PROTECT_ERRNO; - - r = asynchronous_job(close_thread, FD_TO_PTR(fd)); - if (r < 0) - assert_se(close_nointr(fd) != -EBADF); - } - - return -EBADF; -} diff --git a/src/basic/async.h b/src/basic/async.h deleted file mode 100644 index e0bbaa5658..0000000000 --- a/src/basic/async.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1-or-later */ -#pragma once - -#include - -#include "macro.h" - -int asynchronous_job(void* (*func)(void *p), void *arg); - -int asynchronous_sync(pid_t *ret_pid); -int asynchronous_close(int fd); - -DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close); diff --git a/src/basic/meson.build b/src/basic/meson.build index 1decb4827f..fc9a925847 100644 --- a/src/basic/meson.build +++ b/src/basic/meson.build @@ -7,7 +7,6 @@ basic_sources = files( 'architecture.c', 'argv-util.c', 'arphrd-util.c', - 'async.c', 'audit-util.c', 'build.c', 'bus-label.c', diff --git a/src/basic/process-util.c b/src/basic/process-util.c index 0f642fdf1b..001002a55e 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -1149,6 +1149,41 @@ static void restore_sigsetp(sigset_t **ssp) { (void) sigprocmask(SIG_SETMASK, *ssp, NULL); } +pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) { + size_t ps; + pid_t pid; + void *mystack; + + /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports + * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping. + * + * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent + * is threaded these locks will be in an undefined state in the child, and hence memory allocations + * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is + * strictly single-threaded or your child never calls malloc(). */ + + assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID| + CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0); + + /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that + * the net effect is that the child will have the start of its stack inside the stack of the parent, + * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But + * since we don't want to deal with differences between systems where the stack grows backwards or + * forwards we'll allocate one more and place the stack address in the middle. Except that we also + * want it page aligned, hence we'll allocate one page more. Makes 3. */ + + ps = page_size(); + mystack = alloca(ps*3); + mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */ + mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */ + + pid = clone(fn, mystack, flags, userdata); + if (pid < 0) + return -errno; + + return pid; +} + int safe_fork_full( const char *name, const int stdio_fds[3], @@ -1160,9 +1195,12 @@ int safe_fork_full( pid_t original_pid, pid; sigset_t saved_ss, ss; _unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL; - bool block_signals = false, block_all = false; + bool block_signals = false, block_all = false, intermediary = false; int prio, r; + assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid); + assert(!FLAGS_SET(flags, FORK_DETACH|FORK_WAIT)); + /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always * returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */ @@ -1196,6 +1234,31 @@ int safe_fork_full( saved_ssp = &saved_ss; } + if (FLAGS_SET(flags, FORK_DETACH)) { + assert(!FLAGS_SET(flags, FORK_WAIT)); + assert(!ret_pid); + + /* Fork off intermediary child if needed */ + + r = is_reaper_process(); + if (r < 0) + return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m"); + + if (!r) { + /* Not a reaper process, hence do a double fork() so we are reparented to one */ + + pid = fork(); + if (pid < 0) + return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name)); + if (pid > 0) { + log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid); + return 1; /* return in the parent */ + } + + intermediary = true; + } + } + if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS)) != 0) pid = raw_clone(SIGCHLD| (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) | @@ -1205,8 +1268,12 @@ int safe_fork_full( if (pid < 0) return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name)); if (pid > 0) { - /* We are in the parent process */ + /* If we are in the intermediary process, exit now */ + if (intermediary) + _exit(EXIT_SUCCESS); + + /* We are in the parent process */ log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid); if (flags & FORK_WAIT) { @@ -1623,6 +1690,39 @@ int get_process_threads(pid_t pid) { return n; } +int is_reaper_process(void) { + int b = 0; + + /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes + * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */ + + if (getpid_cached() == 1) + return true; + + if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0) + return -errno; + + return b != 0; +} + +int make_reaper_process(bool b) { + + if (getpid_cached() == 1) { + + if (!b) + return -EINVAL; + + return 0; + } + + /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all, + * to avoid any ambiguities */ + if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0) + return -errno; + + return 0; +} + static const char *const sigchld_code_table[] = { [CLD_EXITED] = "exited", [CLD_KILLED] = "killed", diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 230a0edb09..8f87fdc2ae 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -139,6 +139,13 @@ void reset_cached_pid(void); int must_be_root(void); +pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata); + +/* 💣 Note that FORK_NEW_USERNS + FORK_NEW_MOUNTNS should not be called in threaded programs, because they + * cause us to use raw_clone() which does not synchronize the glibc malloc() locks, and thus will cause + * deadlocks if the parent uses threads and the child does memory allocations. Hence: if the parent is + * threaded these flags may not be used. These flags cannot be used if the parent uses threads or the child + * uses malloc(). 💣 */ typedef enum ForkFlags { FORK_RESET_SIGNALS = 1 << 0, /* Reset all signal handlers and signal mask */ FORK_CLOSE_ALL_FDS = 1 << 1, /* Close all open file descriptors in the child, except for 0,1,2 */ @@ -148,15 +155,16 @@ typedef enum ForkFlags { FORK_REOPEN_LOG = 1 << 5, /* Reopen log connection */ FORK_LOG = 1 << 6, /* Log above LOG_DEBUG log level about failures */ FORK_WAIT = 1 << 7, /* Wait until child exited */ - FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace */ + FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */ FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */ FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */ FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */ FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */ - FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace */ + FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ FORK_CLOEXEC_OFF = 1 << 15, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */ FORK_KEEP_NOTIFY_SOCKET = 1 << 16, /* Unless this specified, $NOTIFY_SOCKET will be unset. */ + FORK_DETACH = 1 << 17, /* Double fork if needed to ensure PID1/subreaper is parent */ } ForkFlags; int safe_fork_full( @@ -199,3 +207,6 @@ int setpriority_closest(int priority); _noreturn_ void freeze(void); int get_process_threads(pid_t pid); + +int is_reaper_process(void); +int make_reaper_process(bool b); diff --git a/src/basic/random-util.c b/src/basic/random-util.c index 9a7d3480c0..d4a3c9ca5c 100644 --- a/src/basic/random-util.c +++ b/src/basic/random-util.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/src/basic/raw-clone.h b/src/basic/raw-clone.h index a3b768f826..6de67ab752 100644 --- a/src/basic/raw-clone.h +++ b/src/basic/raw-clone.h @@ -27,6 +27,11 @@ * Additionally, as this function does not pass the ptid, newtls and ctid parameters to the kernel, flags must not * contain CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID or CLONE_SETTLS. * + * WARNING: 💣 this call (just like glibc's own clone() wrapper) will not synchronize on glibc's malloc + * locks, which means they will be in an undefined state in the child if the parent is + * threaded. This means: the parent must either never use threads, or the child cannot use memory + * allocation itself. This is a major pitfall, hence be careful! 💣 + * * Returns: 0 in the child process and the child process id in the parent. */ static inline pid_t raw_clone(unsigned long flags) { diff --git a/src/core/automount.c b/src/core/automount.c index 3254275d6b..52fb4b6948 100644 --- a/src/core/automount.c +++ b/src/core/automount.c @@ -44,22 +44,6 @@ static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = { [AUTOMOUNT_FAILED] = UNIT_FAILED }; -struct expire_data { - int dev_autofs_fd; - int ioctl_fd; -}; - -static struct expire_data* expire_data_free(struct expire_data *data) { - if (!data) - return NULL; - - safe_close(data->dev_autofs_fd); - safe_close(data->ioctl_fd); - return mfree(data); -} - -DEFINE_TRIVIAL_CLEANUP_FUNC(struct expire_data*, expire_data_free); - static int open_dev_autofs(Manager *m); static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata); static int automount_start_expire(Automount *a); @@ -674,55 +658,57 @@ fail: automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); } -static void *expire_thread(void *p) { - struct autofs_dev_ioctl param; - _cleanup_(expire_data_freep) struct expire_data *data = p; +static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) { int r; - assert(data->dev_autofs_fd >= 0); - assert(data->ioctl_fd >= 0); + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); - init_autofs_dev_ioctl(¶m); - param.ioctlfd = data->ioctl_fd; + /* Issue AUTOFS_DEV_IOCTL_EXPIRE in subprocess, asynchronously. Note that we don't keep track of the + * child's PID, we are PID1/autoreaper after all, hence when it dies we'll automatically clean it up + * anyway. */ - do { - r = ioctl(data->dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, ¶m); - } while (r >= 0); + r = safe_fork_full("(sd-expire)", + /* stdio_fds= */ NULL, + (int[]) { dev_autofs_fd, ioctl_fd }, + /* n_except_fds= */ 2, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG, + /* pid= */ NULL); + if (r != 0) + return r; + + /* Child */ + for (;;) { + struct autofs_dev_ioctl param; + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, ¶m) < 0) + break; + } if (errno != EAGAIN) log_warning_errno(errno, "Failed to expire automount, ignoring: %m"); - return NULL; + _exit(EXIT_SUCCESS); } static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) { + _cleanup_close_ int ioctl_fd = -EBADF; Automount *a = AUTOMOUNT(userdata); - _cleanup_(expire_data_freep) struct expire_data *data = NULL; int r; assert(a); assert(source == a->expire_event_source); - data = new0(struct expire_data, 1); - if (!data) - return log_oom(); + ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); + if (ioctl_fd < 0) + return log_unit_error_errno(UNIT(a), ioctl_fd, "Couldn't open autofs ioctl fd: %m"); - data->ioctl_fd = -EBADF; - - data->dev_autofs_fd = fcntl(UNIT(a)->manager->dev_autofs_fd, F_DUPFD_CLOEXEC, 3); - if (data->dev_autofs_fd < 0) - return log_unit_error_errno(UNIT(a), errno, "Failed to duplicate autofs fd: %m"); - - data->ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); - if (data->ioctl_fd < 0) - return log_unit_error_errno(UNIT(a), data->ioctl_fd, "Couldn't open autofs ioctl fd: %m"); - - r = asynchronous_job(expire_thread, data); + r = asynchronous_expire(UNIT(a)->manager->dev_autofs_fd, ioctl_fd); if (r < 0) return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m"); - data = NULL; - return automount_start_expire(a); } diff --git a/src/core/execute.c b/src/core/execute.c index b7fe922c7a..90af8fa619 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -7318,28 +7318,17 @@ int exec_command_append(ExecCommand *c, const char *path, ...) { return 0; } -static void *rm_rf_thread(void *p) { - _cleanup_free_ char *path = p; +static char *destroy_tree(char *path) { + if (!path) + return NULL; - (void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); - return NULL; -} + if (!path_equal(path, RUN_SYSTEMD_EMPTY)) { + log_debug("Spawning process to nuke '%s'", path); -static void asynchronous_rm_rf(char **path) { - int r; + (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); + } - assert(path); - - if (!*path || streq(*path, RUN_SYSTEMD_EMPTY)) - return; - - log_debug("Spawning thread to nuke %s", *path); - - r = asynchronous_job(rm_rf_thread, *path); - if (r < 0) - log_warning_errno(r, "Failed to nuke %s: %m", *path); - else - *path = NULL; + return mfree(path); } static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { @@ -7370,8 +7359,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) { if (rt->n_ref > 0) return NULL; - asynchronous_rm_rf(&rt->tmp_dir); - asynchronous_rm_rf(&rt->var_tmp_dir); + rt->tmp_dir = destroy_tree(rt->tmp_dir); + rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir); return exec_shared_runtime_free(rt); } @@ -7862,9 +7851,8 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) { exec_shared_runtime_unref(rt->shared); dynamic_creds_unref(rt->dynamic_creds); - asynchronous_rm_rf(&rt->ephemeral_copy); + rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy); - free(rt->ephemeral_copy); safe_close_pair(rt->ephemeral_storage_socket); return mfree(rt); } diff --git a/src/core/main.c b/src/core/main.c index f067b13fff..208d22f4f3 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -2294,10 +2294,9 @@ static int initialize_runtime( } } - if (arg_runtime_scope == RUNTIME_SCOPE_USER) - /* Become reaper of our children */ - if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) - log_warning_errno(errno, "Failed to make us a subreaper, ignoring: %m"); + r = make_reaper_process(true); + if (r < 0) + log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m"); /* Bump up RLIMIT_NOFILE for systemd itself */ (void) bump_rlimit_nofile(saved_rlimit_nofile); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 97a2c386e6..4fdb5e9fd7 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -5826,8 +5826,9 @@ static int run(int argc, char *argv[]) { assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); - if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) { - r = log_error_errno(errno, "Failed to become subreaper: %m"); + r = make_reaper_process(true); + if (r < 0) { + log_error_errno(r, "Failed to become subreaper: %m"); goto finish; } diff --git a/src/shared/async.c b/src/shared/async.c new file mode 100644 index 0000000000..b7ecb9c4b7 --- /dev/null +++ b/src/shared/async.c @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "async.h" +#include "errno-util.h" +#include "fd-util.h" +#include "log.h" +#include "macro.h" +#include "process-util.h" +#include "signal-util.h" + +int asynchronous_sync(pid_t *ret_pid) { + int r; + + /* This forks off an invocation of fork() as a child process, in order to initiate synchronization to + * disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our + * original process ever, and a thread would do that as the process can't exit with threads hanging in blocking + * syscalls. */ + + r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|(ret_pid ? 0 : FORK_DETACH), ret_pid); + if (r < 0) + return r; + if (r == 0) { + /* Child process */ + sync(); + _exit(EXIT_SUCCESS); + } + + return 0; +} + +/* We encode the fd to close in the userdata pointer as an unsigned value. The highest bit indicates whether + * we need to fork again */ +#define NEED_DOUBLE_FORK (1U << (sizeof(unsigned) * 8 - 1)) + +static int close_func(void *p) { + unsigned v = PTR_TO_UINT(p); + + (void) prctl(PR_SET_NAME, (unsigned long*) "(close)"); + + /* Note: 💣 This function is invoked in a child process created via glibc's clone() wrapper. In such + * children memory allocation is not allowed, since glibc does not release malloc mutexes in + * clone() 💣 */ + + if (v & NEED_DOUBLE_FORK) { + pid_t pid; + + v &= ~NEED_DOUBLE_FORK; + + /* This inner child will be reparented to the subreaper/PID 1. Here we turn on SIGCHLD, so + * that the reaper knows when it's time to reap. */ + pid = clone_with_nested_stack(close_func, SIGCHLD|CLONE_FILES, UINT_TO_PTR(v)); + if (pid >= 0) + return 0; + } + + close((int) v); /* no assert() here, we are in the child and the result would be eaten up anyway */ + return 0; +} + +int asynchronous_close(int fd) { + unsigned v; + pid_t pid; + int r; + + /* This is supposed to behave similar to safe_close(), but actually invoke close() asynchronously, so + * that it will never block. Ideally the kernel would have an API for this, but it doesn't, so we + * work around it, and hide this as a far away as we can. + * + * It is important to us that we don't use threads (via glibc pthread) in PID 1, hence we'll do a + * minimal subprocess instead which shares our fd table via CLONE_FILES. */ + + if (fd < 0) + return -EBADF; /* already invalid */ + + PROTECT_ERRNO; + + v = (unsigned) fd; + + /* We want to fork off a process that is automatically reaped. For that we'd usually double-fork. But + * we can optimize this a bit: if we are PID 1 or a subreaper anyway (the systemd service manager + * process qualifies as this), we can avoid the double forking, since the double forked process would + * be reparented back to us anyway. */ + r = is_reaper_process(); + if (r < 0) + log_debug_errno(r, "Cannot determine if we are a reaper process, assuming we are not: %m"); + if (r <= 0) + v |= NEED_DOUBLE_FORK; + + pid = clone_with_nested_stack(close_func, CLONE_FILES | ((v & NEED_DOUBLE_FORK) ? 0 : SIGCHLD), UINT_TO_PTR(v)); + if (pid < 0) + assert_se(close_nointr(fd) != -EBADF); /* local fallback */ + else if (v & NEED_DOUBLE_FORK) { + + /* Reap the intermediate child. Key here is that we specify __WCLONE, since we didn't ask for + * any signal to be sent to us on process exit, and otherwise waitid() would refuse waiting + * then. + * + * We usually prefer calling waitid(), but before kernel 4.7 it didn't support __WCLONE while + * waitpid() did. Hence let's use waitpid() here, it's good enough for our purposes here. */ + for (;;) { + if (waitpid(pid, NULL, WEXITED|__WCLONE) >= 0 || errno != EINTR) + break; + } + } + + return -EBADF; /* return an invalidated fd */ +} + +int asynchronous_rm_rf(const char *p, RemoveFlags flags) { + int r; + + assert(p); + + /* Forks off a child that destroys the specified path. This will be best effort only, i.e. the child + * will attempt to do its thing, but we won't wait for it or check its success. */ + + r = safe_fork("(sd-rmrf)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DETACH, NULL); + if (r != 0) + return r; + + /* Child */ + + r = rm_rf(p, flags); + if (r < 0) { + log_debug_errno(r, "Failed to rm -rf '%s', ignoring: %m", p); + _exit(EXIT_FAILURE); /* This is a detached process, hence noone really cares, but who knows + * maybe it's good for debugging/tracing to return an exit code + * indicative of our failure here. */ + } + + _exit(EXIT_SUCCESS); +} diff --git a/src/shared/async.h b/src/shared/async.h new file mode 100644 index 0000000000..96148f9006 --- /dev/null +++ b/src/shared/async.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" +#include "rm-rf.h" + +/* These functions implement various potentially slow operations that are executed asynchronously. They are + * carefully written to not use pthreads, but use fork() or clone() (without CLONE_VM) so that the child does + * not share any memory with the parent process, and thus cannot possibly interfere with the malloc() + * synchronization locks. + * + * Background: glibc only synchronizes malloc() locks when doing fork(), but not when doing clone() + * (regardless if through glibc's own wrapper or ours). This means if another thread in the parent has the + * malloc() lock taken while a thread is cloning, the mutex will remain locked in the child (but the other + * thread won't exist there), with no chance to ever be unlocked again. This will result in deadlocks. Hence + * one has to make the choice: either never use threads in the parent, or never do memory allocation in the + * child, or never use clone()/clone3() and stick to fork() only. Because we need clone()/clone3() we opted + * for avoiding threads. */ + +int asynchronous_sync(pid_t *ret_pid); +int asynchronous_close(int fd); +int asynchronous_rm_rf(const char *p, RemoveFlags flags); + +DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close); diff --git a/src/shared/meson.build b/src/shared/meson.build index 5a982f3184..1e015bd38e 100644 --- a/src/shared/meson.build +++ b/src/shared/meson.build @@ -4,6 +4,7 @@ shared_sources = files( 'acpi-fpdt.c', 'apparmor-util.c', 'ask-password-api.c', + 'async.c', 'barrier.c', 'base-filesystem.c', 'battery-util.c', diff --git a/src/test/test-async.c b/src/test/test-async.c index 69785e47fa..dc0e34b48f 100644 --- a/src/test/test-async.c +++ b/src/test/test-async.c @@ -1,37 +1,100 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #include +#include +#include #include #include "async.h" #include "fs-util.h" -#include "tmpfile-util.h" +#include "path-util.h" +#include "process-util.h" +#include "signal-util.h" #include "tests.h" +#include "tmpfile-util.h" -static bool test_async = false; - -static void *async_func(void *arg) { - test_async = true; - - return NULL; +TEST(test_asynchronous_sync) { + assert_se(asynchronous_sync(NULL) >= 0); } -TEST(test_async) { +TEST(asynchronous_close) { _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-asynchronous_close.XXXXXX"; - int fd; + int fd, r; fd = mkostemp_safe(name); assert_se(fd >= 0); asynchronous_close(fd); - assert_se(asynchronous_job(async_func, NULL) >= 0); - assert_se(asynchronous_sync(NULL) >= 0); - sleep(1); assert_se(fcntl(fd, F_GETFD) == -1); assert_se(errno == EBADF); - assert_se(test_async); + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL); + assert(r >= 0); + + if (r == 0) { + /* child */ + + assert(make_reaper_process(true) >= 0); + + fd = open("/dev/null", O_RDONLY|O_CLOEXEC); + assert_se(fd >= 0); + asynchronous_close(fd); + + sleep(1); + + assert_se(fcntl(fd, F_GETFD) == -1); + assert_se(errno == EBADF); + + _exit(EXIT_SUCCESS); + } } +TEST(asynchronous_rm_rf) { + _cleanup_free_ char *t = NULL, *k = NULL; + int r; + + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + assert_se(k = path_join(t, "somefile")); + assert_se(touch(k) >= 0); + assert_se(asynchronous_rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + /* Do this once more, form a subreaper. Which is nice, because we can watch the async child even + * though detached */ + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL); + assert_se(r >= 0); + + if (r == 0) { + _cleanup_free_ char *tt = NULL, *kk = NULL; + + /* child */ + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + assert_se(make_reaper_process(true) >= 0); + + assert_se(mkdtemp_malloc(NULL, &tt) >= 0); + assert_se(kk = path_join(tt, "somefile")); + assert_se(touch(kk) >= 0); + assert_se(asynchronous_rm_rf(tt, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + for (;;) { + siginfo_t si = {}; + + assert_se(waitid(P_ALL, 0, &si, WEXITED) >= 0); + + if (access(tt, F_OK) < 0) { + assert_se(errno == ENOENT); + break; + } + + /* wasn't the rm_rf() call. let's wait longer */ + } + + _exit(EXIT_SUCCESS); + } +} + + DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-process-util.c b/src/test/test-process-util.c index e915207e8c..ebf73c54ec 100644 --- a/src/test/test-process-util.c +++ b/src/test/test-process-util.c @@ -891,6 +891,53 @@ TEST(get_process_threads) { } } +TEST(is_reaper_process) { + int r; + + r = safe_fork("(regular)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + + assert_se(is_reaper_process() == 0); + _exit(EXIT_SUCCESS); + } + + r = safe_fork("(newpid)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + + if (unshare(CLONE_NEWPID) < 0) { + if (ERRNO_IS_PRIVILEGE(errno) || ERRNO_IS_NOT_SUPPORTED(errno)) { + log_notice("Skipping CLONE_NEWPID reaper check, lacking privileges/support"); + _exit(EXIT_SUCCESS); + } + } + + r = safe_fork("(newpid1)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* grandchild, which is PID1 in a pidns */ + assert_se(getpid_cached() == 1); + assert_se(is_reaper_process() > 0); + _exit(EXIT_SUCCESS); + } + + _exit(EXIT_SUCCESS); + } + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + assert_se(make_reaper_process(true) >= 0); + + assert_se(is_reaper_process() > 0); + _exit(EXIT_SUCCESS); + } +} + static int intro(void) { log_show_color(true); return EXIT_SUCCESS;