Merge pull request #28123 from poettering/threads-no-more

get rid of any threads in PID 1
2026-04-14 00:14:32 +09:00 · 2023-06-23 13:24:41 +01:00
parent 3965f173ea 1ee20371c7
commit fc84ddda2f
17 changed files with 484 additions and 212 deletions
--- a/docs/CODING_STYLE.md
+++ b/docs/CODING_STYLE.md
@@ -750,3 +750,32 @@ SPDX-License-Identifier: LGPL-2.1-or-later
 - Reasonable use of non-ASCII Unicode UTF-8 characters in code comments is
  welcome. If your code comment contains an emoji or two this will certainly
  brighten the day of the occasional reviewer of your code. Really! 😊
+
+## Threading
+
+- We generally avoid using threads, to the level this is possible. In
+  particular in the service manager/PID 1 threads are not OK to use. This is
+  because you cannot mix memory allocation in threads with use of glibc's
+  `clone()` call, or manual `clone()`/`clone3()` system call wrappers. Only
+  glibc's own `fork()` call will properly synchronize the memory allocation
+  locks around the process clone operation. This means that if a process is
+  cloned via `clone()`/`clone3()` and another thread currently has the
+  `malloc()` lock taken, it will be cloned in locked state to the child, and
+  thus can never be acquired in the child, leading to deadlocks. Hence, when
+  using `clone()`/`clone3()` there are only two ways out: never use threads in the
+  parent, or never do memory allocation in the child. For our uses we need
+  `clone()`/`clone3()` and hence decided to avoid threads. Of course, sometimes the
+  concurrency threads allow is beneficial, however we suggest forking off
+  worker *processes* rather than worker *threads* for this purpose, ideally
+  even with an `execve()` to remove the CoW trap situation `fork()` easily
+  triggers.
+
+- A corollary of the above is: never use `clone()` where a `fork()` would do
+  too. Also consider using `posix_spawn()` which combines `clone()` +
+  `execve()` into one and has nice properties since it avoids becoming a CoW
+  trap by using `CLONE_VORK` and `CLONE_VM` together.
+
+- While we avoid forking off threads on our own, writing thread-safe code is a
+  good idea where it might end up running inside of libsystemd.so or
+  similar. Hence, use TLS (i.e. `thread_local`) where appropriate, and maybe
+  the occasional `pthread_once()`.
--- a/src/basic/async.c
+++ b/src/basic/async.c
@@ -1,107 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1-or-later */
-
-#include <errno.h>
-#include <pthread.h>
-#include <stddef.h>
-#include <unistd.h>
-
-#include "async.h"
-#include "errno-util.h"
-#include "fd-util.h"
-#include "log.h"
-#include "macro.h"
-#include "process-util.h"
-#include "signal-util.h"
-
-int asynchronous_job(void* (*func)(void *p), void *arg) {
-        sigset_t ss, saved_ss;
-        pthread_attr_t a;
-        pthread_t t;
-        int r, k;
-
-        /* It kinda sucks that we have to resort to threads to implement an asynchronous close(), but well, such is
-         * life. */
-
-        r = pthread_attr_init(&a);
-        if (r > 0)
-                return -r;
-
-        r = pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
-        if (r > 0) {
-                r = -r;
-                goto finish;
-        }
-
-        assert_se(sigfillset(&ss) >= 0);
-
-        /* Block all signals before forking off the thread, so that the new thread is started with all signals
-         * blocked. This way the existence of the new thread won't affect signal handling in other threads. */
-
-        r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
-        if (r > 0) {
-                r = -r;
-                goto finish;
-        }
-
-        r = pthread_create(&t, &a, func, arg);
-
-        k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
-
-        if (r > 0)
-                r = -r;
-        else if (k > 0)
-                r = -k;
-        else
-                r = 0;
-
-finish:
-        pthread_attr_destroy(&a);
-        return r;
-}
-
-int asynchronous_sync(pid_t *ret_pid) {
-        int r;
-
-        /* This forks off an invocation of fork() as a child process, in order to initiate synchronization to
-         * disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our
-         * original process ever, and a thread would do that as the process can't exit with threads hanging in blocking
-         * syscalls. */
-
-        r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS, ret_pid);
-        if (r < 0)
-                return r;
-        if (r == 0) {
-                /* Child process */
-                sync();
-                _exit(EXIT_SUCCESS);
-        }
-
-        return 0;
-}
-
-static void *close_thread(void *p) {
-        (void) pthread_setname_np(pthread_self(), "close");
-
-        assert_se(close_nointr(PTR_TO_FD(p)) != -EBADF);
-        return NULL;
-}
-
-int asynchronous_close(int fd) {
-        int r;
-
-        /* This is supposed to behave similar to safe_close(), but
-         * actually invoke close() asynchronously, so that it will
-         * never block. Ideally the kernel would have an API for this,
-         * but it doesn't, so we work around it, and hide this as a
-         * far away as we can. */
-
-        if (fd >= 0) {
-                PROTECT_ERRNO;
-
-                r = asynchronous_job(close_thread, FD_TO_PTR(fd));
-                if (r < 0)
-                         assert_se(close_nointr(fd) != -EBADF);
-        }
-
-        return -EBADF;
-}
--- a/src/basic/async.h
+++ b/src/basic/async.h
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1-or-later */
-#pragma once
-
-#include <sys/types.h>
-
-#include "macro.h"
-
-int asynchronous_job(void* (*func)(void *p), void *arg);
-
-int asynchronous_sync(pid_t *ret_pid);
-int asynchronous_close(int fd);
-
-DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close);
--- a/src/basic/meson.build
+++ b/src/basic/meson.build
@@ -7,7 +7,6 @@ basic_sources = files(
        'architecture.c',
        'argv-util.c',
        'arphrd-util.c',
-        'async.c',
        'audit-util.c',
        'build.c',
        'bus-label.c',
--- a/src/basic/process-util.c
+++ b/src/basic/process-util.c
@@ -1149,6 +1149,41 @@ static void restore_sigsetp(sigset_t **ssp) {
                (void) sigprocmask(SIG_SETMASK, *ssp, NULL);
 }

+pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) {
+        size_t ps;
+        pid_t pid;
+        void *mystack;
+
+        /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
+         * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
+         *
+         * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
+         * is threaded these locks will be in an undefined state in the child, and hence memory allocations
+         * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
+         * strictly single-threaded or your child never calls malloc(). */
+
+        assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
+                         CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0);
+
+        /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
+         * the net effect is that the child will have the start of its stack inside the stack of the parent,
+         * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
+         * since we don't want to deal with differences between systems where the stack grows backwards or
+         * forwards we'll allocate one more and place the stack address in the middle. Except that we also
+         * want it page aligned, hence we'll allocate one page more. Makes 3. */
+
+        ps = page_size();
+        mystack = alloca(ps*3);
+        mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */
+        mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */
+
+        pid = clone(fn, mystack, flags, userdata);
+        if (pid < 0)
+                return -errno;
+
+        return pid;
+}
+
 int safe_fork_full(
                const char *name,
                const int stdio_fds[3],
@@ -1160,9 +1195,12 @@ int safe_fork_full(
        pid_t original_pid, pid;
        sigset_t saved_ss, ss;
        _unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL;
-        bool block_signals = false, block_all = false;
+        bool block_signals = false, block_all = false, intermediary = false;
        int prio, r;

+        assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid);
+        assert(!FLAGS_SET(flags, FORK_DETACH|FORK_WAIT));
+
        /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always
         * returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */

@@ -1196,6 +1234,31 @@ int safe_fork_full(
                saved_ssp = &saved_ss;
        }

+        if (FLAGS_SET(flags, FORK_DETACH)) {
+                assert(!FLAGS_SET(flags, FORK_WAIT));
+                assert(!ret_pid);
+
+                /* Fork off intermediary child if needed */
+
+                r = is_reaper_process();
+                if (r < 0)
+                        return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m");
+
+                if (!r) {
+                        /* Not a reaper process, hence do a double fork() so we are reparented to one */
+
+                        pid = fork();
+                        if (pid < 0)
+                                return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
+                        if (pid > 0) {
+                                log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid);
+                                return 1; /* return in the parent */
+                        }
+
+                        intermediary = true;
+                }
+        }
+
        if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS)) != 0)
                pid = raw_clone(SIGCHLD|
                                (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
@@ -1205,8 +1268,12 @@ int safe_fork_full(
        if (pid < 0)
                return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
        if (pid > 0) {
-                /* We are in the parent process */

+                /* If we are in the intermediary process, exit now */
+                if (intermediary)
+                        _exit(EXIT_SUCCESS);
+
+                /* We are in the parent process */
                log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);

                if (flags & FORK_WAIT) {
@@ -1623,6 +1690,39 @@ int get_process_threads(pid_t pid) {
        return n;
 }

+int is_reaper_process(void) {
+        int b = 0;
+
+        /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
+         * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */
+
+        if (getpid_cached() == 1)
+                return true;
+
+        if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0)
+                return -errno;
+
+        return b != 0;
+}
+
+int make_reaper_process(bool b) {
+
+        if (getpid_cached() == 1) {
+
+                if (!b)
+                        return -EINVAL;
+
+                return 0;
+        }
+
+        /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
+         * to avoid any ambiguities */
+        if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
+                return -errno;
+
+        return 0;
+}
+
 static const char *const sigchld_code_table[] = {
        [CLD_EXITED] = "exited",
        [CLD_KILLED] = "killed",
--- a/src/basic/process-util.h
+++ b/src/basic/process-util.h
@@ -139,6 +139,13 @@ void reset_cached_pid(void);

 int must_be_root(void);

+pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata);
+
+/* 💣 Note that FORK_NEW_USERNS + FORK_NEW_MOUNTNS should not be called in threaded programs, because they
+ * cause us to use raw_clone() which does not synchronize the glibc malloc() locks, and thus will cause
+ * deadlocks if the parent uses threads and the child does memory allocations. Hence: if the parent is
+ * threaded these flags may not be used. These flags cannot be used if the parent uses threads or the child
+ * uses malloc(). 💣 */
 typedef enum ForkFlags {
        FORK_RESET_SIGNALS      = 1 <<  0, /* Reset all signal handlers and signal mask */
        FORK_CLOSE_ALL_FDS      = 1 <<  1, /* Close all open file descriptors in the child, except for 0,1,2 */
@@ -148,15 +155,16 @@ typedef enum ForkFlags {
        FORK_REOPEN_LOG         = 1 <<  5, /* Reopen log connection */
        FORK_LOG                = 1 <<  6, /* Log above LOG_DEBUG log level about failures */
        FORK_WAIT               = 1 <<  7, /* Wait until child exited */
-        FORK_NEW_MOUNTNS        = 1 <<  8, /* Run child in its own mount namespace */
+        FORK_NEW_MOUNTNS        = 1 <<  8, /* Run child in its own mount namespace                               💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
        FORK_MOUNTNS_SLAVE      = 1 <<  9, /* Make child's mount namespace MS_SLAVE */
        FORK_PRIVATE_TMP        = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
        FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
        FORK_STDOUT_TO_STDERR   = 1 << 12, /* Make stdout a copy of stderr */
        FORK_FLUSH_STDIO        = 1 << 13, /* fflush() stdout (and stderr) before forking */
-        FORK_NEW_USERNS         = 1 << 14, /* Run child in its own user namespace */
+        FORK_NEW_USERNS         = 1 << 14, /* Run child in its own user namespace                                💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
        FORK_CLOEXEC_OFF        = 1 << 15, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
        FORK_KEEP_NOTIFY_SOCKET = 1 << 16, /* Unless this specified, $NOTIFY_SOCKET will be unset. */
+        FORK_DETACH             = 1 << 17, /* Double fork if needed to ensure PID1/subreaper is parent */
 } ForkFlags;

 int safe_fork_full(
@@ -199,3 +207,6 @@ int setpriority_closest(int priority);
 _noreturn_ void freeze(void);

 int get_process_threads(pid_t pid);
+
+int is_reaper_process(void);
+int make_reaper_process(bool b);
--- a/src/basic/random-util.c
+++ b/src/basic/random-util.c
@@ -4,7 +4,6 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/random.h>
-#include <pthread.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
--- a/src/basic/raw-clone.h
+++ b/src/basic/raw-clone.h
@@ -27,6 +27,11 @@
 * Additionally, as this function does not pass the ptid, newtls and ctid parameters to the kernel, flags must not
 * contain CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID or CLONE_SETTLS.
 *
+ * WARNING: 💣 this call (just like glibc's own clone() wrapper) will not synchronize on glibc's malloc
+ *          locks, which means they will be in an undefined state in the child if the parent is
+ *          threaded. This means: the parent must either never use threads, or the child cannot use memory
+ *          allocation itself. This is a major pitfall, hence be careful! 💣
+ *
 * Returns: 0 in the child process and the child process id in the parent.
 */
 static inline pid_t raw_clone(unsigned long flags) {
--- a/src/core/automount.c
+++ b/src/core/automount.c
@@ -44,22 +44,6 @@ static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = {
        [AUTOMOUNT_FAILED] = UNIT_FAILED
 };

-struct expire_data {
-        int dev_autofs_fd;
-        int ioctl_fd;
-};
-
-static struct expire_data* expire_data_free(struct expire_data *data) {
-        if (!data)
-                return NULL;
-
-        safe_close(data->dev_autofs_fd);
-        safe_close(data->ioctl_fd);
-        return mfree(data);
-}
-
-DEFINE_TRIVIAL_CLEANUP_FUNC(struct expire_data*, expire_data_free);
-
 static int open_dev_autofs(Manager *m);
 static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata);
 static int automount_start_expire(Automount *a);
@@ -674,55 +658,57 @@ fail:
        automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
 }

-static void *expire_thread(void *p) {
-        struct autofs_dev_ioctl param;
-        _cleanup_(expire_data_freep) struct expire_data *data = p;
+static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) {
        int r;

-        assert(data->dev_autofs_fd >= 0);
-        assert(data->ioctl_fd >= 0);
+        assert(dev_autofs_fd >= 0);
+        assert(ioctl_fd >= 0);

-        init_autofs_dev_ioctl(&param);
-        param.ioctlfd = data->ioctl_fd;
+        /* Issue AUTOFS_DEV_IOCTL_EXPIRE in subprocess, asynchronously. Note that we don't keep track of the
+         * child's PID, we are PID1/autoreaper after all, hence when it dies we'll automatically clean it up
+         * anyway. */

-        do {
-                r = ioctl(data->dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, &param);
-        } while (r >= 0);
+        r = safe_fork_full("(sd-expire)",
+                           /* stdio_fds= */ NULL,
+                           (int[]) { dev_autofs_fd, ioctl_fd },
+                           /* n_except_fds= */ 2,
+                           FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG,
+                           /* pid= */ NULL);
+        if (r != 0)
+                return r;
+
+        /* Child */
+        for (;;) {
+                struct autofs_dev_ioctl param;
+                init_autofs_dev_ioctl(&param);
+                param.ioctlfd = ioctl_fd;
+
+                if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, &param) < 0)
+                        break;
+        }

        if (errno != EAGAIN)
                log_warning_errno(errno, "Failed to expire automount, ignoring: %m");

-        return NULL;
+        _exit(EXIT_SUCCESS);
 }

 static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) {
+        _cleanup_close_ int ioctl_fd = -EBADF;
        Automount *a = AUTOMOUNT(userdata);
-        _cleanup_(expire_data_freep) struct expire_data *data = NULL;
        int r;

        assert(a);
        assert(source == a->expire_event_source);

-        data = new0(struct expire_data, 1);
-        if (!data)
-                return log_oom();
+        ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
+        if (ioctl_fd < 0)
+                return log_unit_error_errno(UNIT(a), ioctl_fd, "Couldn't open autofs ioctl fd: %m");

-        data->ioctl_fd = -EBADF;
-
-        data->dev_autofs_fd = fcntl(UNIT(a)->manager->dev_autofs_fd, F_DUPFD_CLOEXEC, 3);
-        if (data->dev_autofs_fd < 0)
-                return log_unit_error_errno(UNIT(a), errno, "Failed to duplicate autofs fd: %m");
-
-        data->ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
-        if (data->ioctl_fd < 0)
-                return log_unit_error_errno(UNIT(a), data->ioctl_fd, "Couldn't open autofs ioctl fd: %m");
-
-        r = asynchronous_job(expire_thread, data);
+        r = asynchronous_expire(UNIT(a)->manager->dev_autofs_fd, ioctl_fd);
        if (r < 0)
                return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m");

-        data = NULL;
-
        return automount_start_expire(a);
 }

--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -7318,28 +7318,17 @@ int exec_command_append(ExecCommand *c, const char *path, ...) {
        return 0;
 }

-static void *rm_rf_thread(void *p) {
-        _cleanup_free_ char *path = p;
+static char *destroy_tree(char *path) {
+        if (!path)
+                return NULL;

-        (void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
-        return NULL;
-}
+        if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
+                log_debug("Spawning process to nuke '%s'", path);

-static void asynchronous_rm_rf(char **path) {
-        int r;
+                (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
+        }

-        assert(path);
-
-        if (!*path || streq(*path, RUN_SYSTEMD_EMPTY))
-                return;
-
-        log_debug("Spawning thread to nuke %s", *path);
-
-        r = asynchronous_job(rm_rf_thread, *path);
-        if (r < 0)
-                log_warning_errno(r, "Failed to nuke %s: %m", *path);
-        else
-                *path = NULL;
+        return mfree(path);
 }

 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
@@ -7370,8 +7359,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
        if (rt->n_ref > 0)
                return NULL;

-        asynchronous_rm_rf(&rt->tmp_dir);
-        asynchronous_rm_rf(&rt->var_tmp_dir);
+        rt->tmp_dir = destroy_tree(rt->tmp_dir);
+        rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);

        return exec_shared_runtime_free(rt);
 }
@@ -7862,9 +7851,8 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
        exec_shared_runtime_unref(rt->shared);
        dynamic_creds_unref(rt->dynamic_creds);

-        asynchronous_rm_rf(&rt->ephemeral_copy);
+        rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);

-        free(rt->ephemeral_copy);
        safe_close_pair(rt->ephemeral_storage_socket);
        return mfree(rt);
 }
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -2294,10 +2294,9 @@ static int initialize_runtime(
                }
        }

-        if (arg_runtime_scope == RUNTIME_SCOPE_USER)
-                /* Become reaper of our children */
-                if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0)
-                        log_warning_errno(errno, "Failed to make us a subreaper, ignoring: %m");
+        r = make_reaper_process(true);
+        if (r < 0)
+                log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m");

        /* Bump up RLIMIT_NOFILE for systemd itself */
        (void) bump_rlimit_nofile(saved_rlimit_nofile);
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -5826,8 +5826,9 @@ static int run(int argc, char *argv[]) {

        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);

-        if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
-                r = log_error_errno(errno, "Failed to become subreaper: %m");
+        r = make_reaper_process(true);
+        if (r < 0) {
+                log_error_errno(r, "Failed to become subreaper: %m");
                goto finish;
        }

--- a/src/shared/async.c
+++ b/src/shared/async.c
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "async.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "log.h"
+#include "macro.h"
+#include "process-util.h"
+#include "signal-util.h"
+
+int asynchronous_sync(pid_t *ret_pid) {
+        int r;
+
+        /* This forks off an invocation of fork() as a child process, in order to initiate synchronization to
+         * disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our
+         * original process ever, and a thread would do that as the process can't exit with threads hanging in blocking
+         * syscalls. */
+
+        r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|(ret_pid ? 0 : FORK_DETACH), ret_pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Child process */
+                sync();
+                _exit(EXIT_SUCCESS);
+        }
+
+        return 0;
+}
+
+/* We encode the fd to close in the userdata pointer as an unsigned value. The highest bit indicates whether
+ * we need to fork again */
+#define NEED_DOUBLE_FORK (1U << (sizeof(unsigned) * 8 - 1))
+
+static int close_func(void *p) {
+        unsigned v = PTR_TO_UINT(p);
+
+        (void) prctl(PR_SET_NAME, (unsigned long*) "(close)");
+
+        /* Note: 💣 This function is invoked in a child process created via glibc's clone() wrapper. In such
+         *       children memory allocation is not allowed, since glibc does not release malloc mutexes in
+         *       clone() 💣 */
+
+        if (v & NEED_DOUBLE_FORK) {
+                pid_t pid;
+
+                v &= ~NEED_DOUBLE_FORK;
+
+                /* This inner child will be reparented to the subreaper/PID 1. Here we turn on SIGCHLD, so
+                 * that the reaper knows when it's time to reap. */
+                pid = clone_with_nested_stack(close_func, SIGCHLD|CLONE_FILES, UINT_TO_PTR(v));
+                if (pid >= 0)
+                        return 0;
+        }
+
+        close((int) v); /* no assert() here, we are in the child and the result would be eaten up anyway */
+        return 0;
+}
+
+int asynchronous_close(int fd) {
+        unsigned v;
+        pid_t pid;
+        int r;
+
+        /* This is supposed to behave similar to safe_close(), but actually invoke close() asynchronously, so
+         * that it will never block. Ideally the kernel would have an API for this, but it doesn't, so we
+         * work around it, and hide this as a far away as we can.
+         *
+         * It is important to us that we don't use threads (via glibc pthread) in PID 1, hence we'll do a
+         * minimal subprocess instead which shares our fd table via CLONE_FILES. */
+
+        if (fd < 0)
+                return -EBADF; /* already invalid */
+
+        PROTECT_ERRNO;
+
+        v = (unsigned) fd;
+
+        /* We want to fork off a process that is automatically reaped. For that we'd usually double-fork. But
+         * we can optimize this a bit: if we are PID 1 or a subreaper anyway (the systemd service manager
+         * process qualifies as this), we can avoid the double forking, since the double forked process would
+         * be reparented back to us anyway. */
+        r = is_reaper_process();
+        if (r < 0)
+                log_debug_errno(r, "Cannot determine if we are a reaper process, assuming we are not: %m");
+        if (r <= 0)
+                v |= NEED_DOUBLE_FORK;
+
+        pid = clone_with_nested_stack(close_func, CLONE_FILES | ((v & NEED_DOUBLE_FORK) ? 0 : SIGCHLD), UINT_TO_PTR(v));
+        if (pid < 0)
+                assert_se(close_nointr(fd) != -EBADF); /* local fallback */
+        else if (v & NEED_DOUBLE_FORK) {
+
+                /* Reap the intermediate child. Key here is that we specify __WCLONE, since we didn't ask for
+                 * any signal to be sent to us on process exit, and otherwise waitid() would refuse waiting
+                 * then.
+                 *
+                 * We usually prefer calling waitid(), but before kernel 4.7 it didn't support __WCLONE while
+                 * waitpid() did. Hence let's use waitpid() here, it's good enough for our purposes here. */
+                for (;;) {
+                        if (waitpid(pid, NULL, WEXITED|__WCLONE) >= 0 || errno != EINTR)
+                                break;
+                }
+        }
+
+        return -EBADF; /* return an invalidated fd */
+}
+
+int asynchronous_rm_rf(const char *p, RemoveFlags flags) {
+        int r;
+
+        assert(p);
+
+        /* Forks off a child that destroys the specified path. This will be best effort only, i.e. the child
+         * will attempt to do its thing, but we won't wait for it or check its success. */
+
+        r = safe_fork("(sd-rmrf)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DETACH, NULL);
+        if (r != 0)
+                return r;
+
+        /* Child */
+
+        r = rm_rf(p, flags);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to rm -rf '%s', ignoring: %m", p);
+                _exit(EXIT_FAILURE); /* This is a detached process, hence noone really cares, but who knows
+                                      * maybe it's good for debugging/tracing to return an exit code
+                                      * indicative of our failure here. */
+        }
+
+        _exit(EXIT_SUCCESS);
+}
--- a/src/shared/async.h
+++ b/src/shared/async.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "macro.h"
+#include "rm-rf.h"
+
+/* These functions implement various potentially slow operations that are executed asynchronously. They are
+ * carefully written to not use pthreads, but use fork() or clone() (without CLONE_VM) so that the child does
+ * not share any memory with the parent process, and thus cannot possibly interfere with the malloc()
+ * synchronization locks.
+ *
+ * Background: glibc only synchronizes malloc() locks when doing fork(), but not when doing clone()
+ * (regardless if through glibc's own wrapper or ours). This means if another thread in the parent has the
+ * malloc() lock taken while a thread is cloning, the mutex will remain locked in the child (but the other
+ * thread won't exist there), with no chance to ever be unlocked again. This will result in deadlocks. Hence
+ * one has to make the choice: either never use threads in the parent, or never do memory allocation in the
+ * child, or never use clone()/clone3() and stick to fork() only. Because we need clone()/clone3() we opted
+ * for avoiding threads. */
+
+int asynchronous_sync(pid_t *ret_pid);
+int asynchronous_close(int fd);
+int asynchronous_rm_rf(const char *p, RemoveFlags flags);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close);
--- a/src/shared/meson.build
+++ b/src/shared/meson.build
@@ -4,6 +4,7 @@ shared_sources = files(
        'acpi-fpdt.c',
        'apparmor-util.c',
        'ask-password-api.c',
+        'async.c',
        'barrier.c',
        'base-filesystem.c',
        'battery-util.c',
--- a/src/test/test-async.c
+++ b/src/test/test-async.c
@@ -1,37 +1,100 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */

 #include <fcntl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
 #include <unistd.h>

 #include "async.h"
 #include "fs-util.h"
-#include "tmpfile-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "signal-util.h"
 #include "tests.h"
+#include "tmpfile-util.h"

-static bool test_async = false;
-
-static void *async_func(void *arg) {
-        test_async = true;
-
-        return NULL;
+TEST(test_asynchronous_sync) {
+        assert_se(asynchronous_sync(NULL) >= 0);
 }

-TEST(test_async) {
+TEST(asynchronous_close) {
        _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-asynchronous_close.XXXXXX";
-        int fd;
+        int fd, r;

        fd = mkostemp_safe(name);
        assert_se(fd >= 0);
        asynchronous_close(fd);

-        assert_se(asynchronous_job(async_func, NULL) >= 0);
-        assert_se(asynchronous_sync(NULL) >= 0);
-
        sleep(1);

        assert_se(fcntl(fd, F_GETFD) == -1);
        assert_se(errno == EBADF);
-        assert_se(test_async);
+
+        r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
+        assert(r >= 0);
+
+        if (r == 0) {
+                /* child */
+
+                assert(make_reaper_process(true) >= 0);
+
+                fd = open("/dev/null", O_RDONLY|O_CLOEXEC);
+                assert_se(fd >= 0);
+                asynchronous_close(fd);
+
+                sleep(1);
+
+                assert_se(fcntl(fd, F_GETFD) == -1);
+                assert_se(errno == EBADF);
+
+                _exit(EXIT_SUCCESS);
+        }
 }

+TEST(asynchronous_rm_rf) {
+        _cleanup_free_ char *t = NULL, *k = NULL;
+        int r;
+
+        assert_se(mkdtemp_malloc(NULL, &t) >= 0);
+        assert_se(k = path_join(t, "somefile"));
+        assert_se(touch(k) >= 0);
+        assert_se(asynchronous_rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
+
+        /* Do this once more, form a subreaper. Which is nice, because we can watch the async child even
+         * though detached */
+
+        r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+
+        if (r == 0) {
+                _cleanup_free_ char *tt = NULL, *kk = NULL;
+
+                /* child */
+
+                assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0);
+                assert_se(make_reaper_process(true) >= 0);
+
+                assert_se(mkdtemp_malloc(NULL, &tt) >= 0);
+                assert_se(kk = path_join(tt, "somefile"));
+                assert_se(touch(kk) >= 0);
+                assert_se(asynchronous_rm_rf(tt, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
+
+                for (;;) {
+                        siginfo_t si = {};
+
+                        assert_se(waitid(P_ALL, 0, &si, WEXITED) >= 0);
+
+                        if (access(tt, F_OK) < 0) {
+                                assert_se(errno == ENOENT);
+                                break;
+                        }
+
+                        /* wasn't the rm_rf() call. let's wait longer */
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+}
+
+
 DEFINE_TEST_MAIN(LOG_DEBUG);
--- a/src/test/test-process-util.c
+++ b/src/test/test-process-util.c
@@ -891,6 +891,53 @@ TEST(get_process_threads) {
        }
 }

+TEST(is_reaper_process) {
+        int r;
+
+        r = safe_fork("(regular)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+        if (r == 0) {
+                /* child */
+
+                assert_se(is_reaper_process() == 0);
+                _exit(EXIT_SUCCESS);
+        }
+
+        r = safe_fork("(newpid)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+        if (r == 0) {
+                /* child */
+
+                if (unshare(CLONE_NEWPID) < 0) {
+                        if (ERRNO_IS_PRIVILEGE(errno) || ERRNO_IS_NOT_SUPPORTED(errno)) {
+                                log_notice("Skipping CLONE_NEWPID reaper check, lacking privileges/support");
+                                _exit(EXIT_SUCCESS);
+                        }
+                }
+
+                r = safe_fork("(newpid1)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
+                assert_se(r >= 0);
+                if (r == 0) {
+                        /* grandchild, which is PID1 in a pidns */
+                        assert_se(getpid_cached() == 1);
+                        assert_se(is_reaper_process() > 0);
+                        _exit(EXIT_SUCCESS);
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+        if (r == 0) {
+                /* child */
+                assert_se(make_reaper_process(true) >= 0);
+
+                assert_se(is_reaper_process() > 0);
+                _exit(EXIT_SUCCESS);
+        }
+}
+
 static int intro(void) {
        log_show_color(true);
        return EXIT_SUCCESS;