From 09f9530bafeebd1648bace80384b035322265c44 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 11:52:06 +0200 Subject: [PATCH 01/14] process-util: add helper that detects if we are a reaper process --- src/basic/process-util.c | 15 ++++++++++++ src/basic/process-util.h | 2 ++ src/test/test-process-util.c | 47 ++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) diff --git a/src/basic/process-util.c b/src/basic/process-util.c index 0f642fdf1b..6373909bc7 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -1623,6 +1623,21 @@ int get_process_threads(pid_t pid) { return n; } +int is_reaper_process(void) { + int b = 0; + + /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes + * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */ + + if (getpid_cached() == 1) + return true; + + if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0) + return -errno; + + return b != 0; +} + static const char *const sigchld_code_table[] = { [CLD_EXITED] = "exited", [CLD_KILLED] = "killed", diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 230a0edb09..6ad4316614 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -199,3 +199,5 @@ int setpriority_closest(int priority); _noreturn_ void freeze(void); int get_process_threads(pid_t pid); + +int is_reaper_process(void); diff --git a/src/test/test-process-util.c b/src/test/test-process-util.c index e915207e8c..6de09c3c11 100644 --- a/src/test/test-process-util.c +++ b/src/test/test-process-util.c @@ -891,6 +891,53 @@ TEST(get_process_threads) { } } +TEST(is_reaper_process) { + int r; + + r = safe_fork("(regular)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + + assert_se(is_reaper_process() == 0); + _exit(EXIT_SUCCESS); + } + + r = safe_fork("(newpid)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + + if (unshare(CLONE_NEWPID) < 0) { + if (ERRNO_IS_PRIVILEGE(errno) || ERRNO_IS_NOT_SUPPORTED(errno)) { + log_notice("Skipping CLONE_NEWPID reaper check, lacking privileges/support"); + _exit(EXIT_SUCCESS); + } + } + + r = safe_fork("(newpid1)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* grandchild, which is PID1 in a pidns */ + assert_se(getpid_cached() == 1); + assert_se(is_reaper_process() > 0); + _exit(EXIT_SUCCESS); + } + + _exit(EXIT_SUCCESS); + } + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + assert_se(prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) >= 0); + + assert_se(is_reaper_process() > 0); + _exit(EXIT_SUCCESS); + } +} + static int intro(void) { log_show_color(true); return EXIT_SUCCESS; From 29c3520f28773eb978c47f1c0e76ae6f7da8a04b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 10:27:17 +0200 Subject: [PATCH 02/14] process-util: add clone_with_nested_stack() helper This wraps glibc's clone() but deals with the 'stack' parameter in a sensible way. Only supports invocations without CLONE_VM, i.e. when child is a CoW copy of parent. --- src/basic/process-util.c | 35 +++++++++++++++++++++++++++++++++++ src/basic/process-util.h | 2 ++ 2 files changed, 37 insertions(+) diff --git a/src/basic/process-util.c b/src/basic/process-util.c index 6373909bc7..483fc7b192 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -1149,6 +1149,41 @@ static void restore_sigsetp(sigset_t **ssp) { (void) sigprocmask(SIG_SETMASK, *ssp, NULL); } +pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) { + size_t ps; + pid_t pid; + void *mystack; + + /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports + * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping. + * + * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent + * is threaded these locks will be in an undefined state in the child, and hence memory allocations + * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is + * strictly single-threaded or your child never calls malloc(). */ + + assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID| + CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0); + + /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that + * the net effect is that the child will have the start of its stack inside the stack of the parent, + * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But + * since we don't want to deal with differences between systems where the stack grows backwards or + * forwards we'll allocate one more and place the stack address in the middle. Except that we also + * want it page aligned, hence we'll allocate one page more. Makes 3. */ + + ps = page_size(); + mystack = alloca(ps*3); + mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */ + mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */ + + pid = clone(fn, mystack, flags, userdata); + if (pid < 0) + return -errno; + + return pid; +} + int safe_fork_full( const char *name, const int stdio_fds[3], diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 6ad4316614..920074815e 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -139,6 +139,8 @@ void reset_cached_pid(void); int must_be_root(void); +pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata); + typedef enum ForkFlags { FORK_RESET_SIGNALS = 1 << 0, /* Reset all signal handlers and signal mask */ FORK_CLOSE_ALL_FDS = 1 << 1, /* Close all open file descriptors in the child, except for 0,1,2 */ From c26d7837bb08508c8d906d849dff8f1bc465063e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 10:28:13 +0200 Subject: [PATCH 03/14] async: stop using threads for asynchronous_close() Let's work towards PID1 being purely single threaded again. Let's rework asynchronous_close() on top of clone() with CLONE_FILES (so that we can manipulate PID1's fd table correctly). One less use of pthread_create() in PID 1. --- src/basic/async.c | 83 +++++++++++++++++++++++++++++++++++-------- src/test/test-async.c | 37 +++++++++++++++---- 2 files changed, 99 insertions(+), 21 deletions(-) diff --git a/src/basic/async.c b/src/basic/async.c index 5e347dde8b..c0e1641cb2 100644 --- a/src/basic/async.c +++ b/src/basic/async.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include "async.h" @@ -79,29 +81,80 @@ int asynchronous_sync(pid_t *ret_pid) { return 0; } -static void *close_thread(void *p) { - (void) pthread_setname_np(pthread_self(), "close"); +/* We encode the fd to close in the userdata pointer as an unsigned value. The highest bit indicates whether + * we need to fork again */ +#define NEED_DOUBLE_FORK (1U << (sizeof(unsigned) * 8 - 1)) - assert_se(close_nointr(PTR_TO_FD(p)) != -EBADF); - return NULL; +static int close_func(void *p) { + unsigned v = PTR_TO_UINT(p); + + (void) prctl(PR_SET_NAME, (unsigned long*) "(close)"); + + /* Note: 💣 This function is invoked in a child process created via glibc's clone() wrapper. In such + * children memory allocation is not allowed, since glibc does not release malloc mutexes in + * clone() 💣 */ + + if (v & NEED_DOUBLE_FORK) { + pid_t pid; + + v &= ~NEED_DOUBLE_FORK; + + /* This inner child will be reparented to the subreaper/PID 1. Here we turn on SIGCHLD, so + * that the reaper knows when it's time to reap. */ + pid = clone_with_nested_stack(close_func, SIGCHLD|CLONE_FILES, UINT_TO_PTR(v)); + if (pid >= 0) + return 0; + } + + close((int) v); /* no assert() here, we are in the child and the result would be eaten up anyway */ + return 0; } int asynchronous_close(int fd) { + unsigned v; + pid_t pid; int r; - /* This is supposed to behave similar to safe_close(), but - * actually invoke close() asynchronously, so that it will - * never block. Ideally the kernel would have an API for this, - * but it doesn't, so we work around it, and hide this as a - * far away as we can. */ + /* This is supposed to behave similar to safe_close(), but actually invoke close() asynchronously, so + * that it will never block. Ideally the kernel would have an API for this, but it doesn't, so we + * work around it, and hide this as a far away as we can. + * + * It is important to us that we don't use threads (via glibc pthread) in PID 1, hence we'll do a + * minimal subprocess instead which shares our fd table via CLONE_FILES. */ - if (fd >= 0) { - PROTECT_ERRNO; + if (fd < 0) + return -EBADF; /* already invalid */ - r = asynchronous_job(close_thread, FD_TO_PTR(fd)); - if (r < 0) - assert_se(close_nointr(fd) != -EBADF); + PROTECT_ERRNO; + + v = (unsigned) fd; + + /* We want to fork off a process that is automatically reaped. For that we'd usually double-fork. But + * we can optimize this a bit: if we are PID 1 or a subreaper anyway (the systemd service manager + * process qualifies as this), we can avoid the double forking, since the double forked process would + * be reparented back to us anyway. */ + r = is_reaper_process(); + if (r < 0) + log_debug_errno(r, "Cannot determine if we are a reaper process, assuming we are not: %m"); + if (r <= 0) + v |= NEED_DOUBLE_FORK; + + pid = clone_with_nested_stack(close_func, CLONE_FILES | ((v & NEED_DOUBLE_FORK) ? 0 : SIGCHLD), UINT_TO_PTR(v)); + if (pid < 0) + assert_se(close_nointr(fd) != -EBADF); /* local fallback */ + else if (v & NEED_DOUBLE_FORK) { + + /* Reap the intermediate child. Key here is that we specify __WCLONE, since we didn't ask for + * any signal to be sent to us on process exit, and otherwise waitid() would refuse waiting + * then. + * + * We usually prefer calling waitid(), but before kernel 4.7 it didn't support __WCLONE while + * waitpid() did. Hence let's use waitpid() here, it's good enough for our purposes here. */ + for (;;) { + if (waitpid(pid, NULL, WEXITED|__WCLONE) >= 0 || errno != EINTR) + break; + } } - return -EBADF; + return -EBADF; /* return an invalidated fd */ } diff --git a/src/test/test-async.c b/src/test/test-async.c index 69785e47fa..c2bf801196 100644 --- a/src/test/test-async.c +++ b/src/test/test-async.c @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #include +#include #include #include "async.h" #include "fs-util.h" -#include "tmpfile-util.h" +#include "process-util.h" #include "tests.h" +#include "tmpfile-util.h" static bool test_async = false; @@ -17,21 +19,44 @@ static void *async_func(void *arg) { } TEST(test_async) { + assert_se(asynchronous_job(async_func, NULL) >= 0); + assert_se(asynchronous_sync(NULL) >= 0); + + assert_se(test_async); +} + +TEST(asynchronous_close) { _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-asynchronous_close.XXXXXX"; - int fd; + int fd, r; fd = mkostemp_safe(name); assert_se(fd >= 0); asynchronous_close(fd); - assert_se(asynchronous_job(async_func, NULL) >= 0); - assert_se(asynchronous_sync(NULL) >= 0); - sleep(1); assert_se(fcntl(fd, F_GETFD) == -1); assert_se(errno == EBADF); - assert_se(test_async); + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL); + assert(r >= 0); + + if (r == 0) { + /* child */ + + assert(prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) >= 0); + + fd = open("/dev/null", O_RDONLY|O_CLOEXEC); + assert_se(fd >= 0); + asynchronous_close(fd); + + sleep(1); + + assert_se(fcntl(fd, F_GETFD) == -1); + assert_se(errno == EBADF); + + _exit(EXIT_SUCCESS); + } } DEFINE_TEST_MAIN(LOG_DEBUG); From 01ab446c35816ac17d63cf3b99367b8016856d5b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 10:21:32 +0200 Subject: [PATCH 04/14] basic: add comments about raw_clone() calls not supporting threads/malloc in child --- src/basic/process-util.h | 9 +++++++-- src/basic/raw-clone.h | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 920074815e..41432a8565 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -141,6 +141,11 @@ int must_be_root(void); pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata); +/* 💣 Note that FORK_NEW_USERNS + FORK_NEW_MOUNTNS should not be called in threaded programs, because they + * cause us to use raw_clone() which does not synchronize the glibc malloc() locks, and thus will cause + * deadlocks if the parent uses threads and the child does memory allocations. Hence: if the parent is + * threaded these flags may not be used. These flags cannot be used if the parent uses threads or the child + * uses malloc(). 💣 */ typedef enum ForkFlags { FORK_RESET_SIGNALS = 1 << 0, /* Reset all signal handlers and signal mask */ FORK_CLOSE_ALL_FDS = 1 << 1, /* Close all open file descriptors in the child, except for 0,1,2 */ @@ -150,13 +155,13 @@ typedef enum ForkFlags { FORK_REOPEN_LOG = 1 << 5, /* Reopen log connection */ FORK_LOG = 1 << 6, /* Log above LOG_DEBUG log level about failures */ FORK_WAIT = 1 << 7, /* Wait until child exited */ - FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace */ + FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */ FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */ FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */ FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */ FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */ - FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace */ + FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ FORK_CLOEXEC_OFF = 1 << 15, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */ FORK_KEEP_NOTIFY_SOCKET = 1 << 16, /* Unless this specified, $NOTIFY_SOCKET will be unset. */ } ForkFlags; diff --git a/src/basic/raw-clone.h b/src/basic/raw-clone.h index a3b768f826..6de67ab752 100644 --- a/src/basic/raw-clone.h +++ b/src/basic/raw-clone.h @@ -27,6 +27,11 @@ * Additionally, as this function does not pass the ptid, newtls and ctid parameters to the kernel, flags must not * contain CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID or CLONE_SETTLS. * + * WARNING: 💣 this call (just like glibc's own clone() wrapper) will not synchronize on glibc's malloc + * locks, which means they will be in an undefined state in the child if the parent is + * threaded. This means: the parent must either never use threads, or the child cannot use memory + * allocation itself. This is a major pitfall, hence be careful! 💣 + * * Returns: 0 in the child process and the child process id in the parent. */ static inline pid_t raw_clone(unsigned long flags) { From f7bccef178943bb7507193db246887024a62a07f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 10:57:31 +0200 Subject: [PATCH 05/14] automont: rework expiry to use subprocess rather than thread One more step towards a thread-free PID1: let's do automount expiry in a subprocess rather than a thread. --- src/core/automount.c | 74 ++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 44 deletions(-) diff --git a/src/core/automount.c b/src/core/automount.c index 3254275d6b..52fb4b6948 100644 --- a/src/core/automount.c +++ b/src/core/automount.c @@ -44,22 +44,6 @@ static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = { [AUTOMOUNT_FAILED] = UNIT_FAILED }; -struct expire_data { - int dev_autofs_fd; - int ioctl_fd; -}; - -static struct expire_data* expire_data_free(struct expire_data *data) { - if (!data) - return NULL; - - safe_close(data->dev_autofs_fd); - safe_close(data->ioctl_fd); - return mfree(data); -} - -DEFINE_TRIVIAL_CLEANUP_FUNC(struct expire_data*, expire_data_free); - static int open_dev_autofs(Manager *m); static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata); static int automount_start_expire(Automount *a); @@ -674,55 +658,57 @@ fail: automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); } -static void *expire_thread(void *p) { - struct autofs_dev_ioctl param; - _cleanup_(expire_data_freep) struct expire_data *data = p; +static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) { int r; - assert(data->dev_autofs_fd >= 0); - assert(data->ioctl_fd >= 0); + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); - init_autofs_dev_ioctl(¶m); - param.ioctlfd = data->ioctl_fd; + /* Issue AUTOFS_DEV_IOCTL_EXPIRE in subprocess, asynchronously. Note that we don't keep track of the + * child's PID, we are PID1/autoreaper after all, hence when it dies we'll automatically clean it up + * anyway. */ - do { - r = ioctl(data->dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, ¶m); - } while (r >= 0); + r = safe_fork_full("(sd-expire)", + /* stdio_fds= */ NULL, + (int[]) { dev_autofs_fd, ioctl_fd }, + /* n_except_fds= */ 2, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG, + /* pid= */ NULL); + if (r != 0) + return r; + + /* Child */ + for (;;) { + struct autofs_dev_ioctl param; + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, ¶m) < 0) + break; + } if (errno != EAGAIN) log_warning_errno(errno, "Failed to expire automount, ignoring: %m"); - return NULL; + _exit(EXIT_SUCCESS); } static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) { + _cleanup_close_ int ioctl_fd = -EBADF; Automount *a = AUTOMOUNT(userdata); - _cleanup_(expire_data_freep) struct expire_data *data = NULL; int r; assert(a); assert(source == a->expire_event_source); - data = new0(struct expire_data, 1); - if (!data) - return log_oom(); + ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); + if (ioctl_fd < 0) + return log_unit_error_errno(UNIT(a), ioctl_fd, "Couldn't open autofs ioctl fd: %m"); - data->ioctl_fd = -EBADF; - - data->dev_autofs_fd = fcntl(UNIT(a)->manager->dev_autofs_fd, F_DUPFD_CLOEXEC, 3); - if (data->dev_autofs_fd < 0) - return log_unit_error_errno(UNIT(a), errno, "Failed to duplicate autofs fd: %m"); - - data->ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); - if (data->ioctl_fd < 0) - return log_unit_error_errno(UNIT(a), data->ioctl_fd, "Couldn't open autofs ioctl fd: %m"); - - r = asynchronous_job(expire_thread, data); + r = asynchronous_expire(UNIT(a)->manager->dev_autofs_fd, ioctl_fd); if (r < 0) return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m"); - data = NULL; - return automount_start_expire(a); } From 2e7b105eb9386971f60b9876eede32dff5ed67aa Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 11:51:25 +0200 Subject: [PATCH 06/14] process-util: add FORK_DETACH flag for forking of detached child A test for this is later added indirectly, via aynchronous_rm_rf() that uses this and comes with a suitable test. --- src/basic/process-util.c | 36 ++++++++++++++++++++++++++++++++++-- src/basic/process-util.h | 1 + 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/basic/process-util.c b/src/basic/process-util.c index 483fc7b192..437e83bc6d 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -1195,9 +1195,12 @@ int safe_fork_full( pid_t original_pid, pid; sigset_t saved_ss, ss; _unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL; - bool block_signals = false, block_all = false; + bool block_signals = false, block_all = false, intermediary = false; int prio, r; + assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid); + assert(!FLAGS_SET(flags, FORK_DETACH|FORK_WAIT)); + /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always * returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */ @@ -1231,6 +1234,31 @@ int safe_fork_full( saved_ssp = &saved_ss; } + if (FLAGS_SET(flags, FORK_DETACH)) { + assert(!FLAGS_SET(flags, FORK_WAIT)); + assert(!ret_pid); + + /* Fork off intermediary child if needed */ + + r = is_reaper_process(); + if (r < 0) + return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m"); + + if (!r) { + /* Not a reaper process, hence do a double fork() so we are reparented to one */ + + pid = fork(); + if (pid < 0) + return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name)); + if (pid > 0) { + log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid); + return 1; /* return in the parent */ + } + + intermediary = true; + } + } + if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS)) != 0) pid = raw_clone(SIGCHLD| (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) | @@ -1240,8 +1268,12 @@ int safe_fork_full( if (pid < 0) return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name)); if (pid > 0) { - /* We are in the parent process */ + /* If we are in the intermediary process, exit now */ + if (intermediary) + _exit(EXIT_SUCCESS); + + /* We are in the parent process */ log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid); if (flags & FORK_WAIT) { diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 41432a8565..1b77478cf5 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -164,6 +164,7 @@ typedef enum ForkFlags { FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ FORK_CLOEXEC_OFF = 1 << 15, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */ FORK_KEEP_NOTIFY_SOCKET = 1 << 16, /* Unless this specified, $NOTIFY_SOCKET will be unset. */ + FORK_DETACH = 1 << 17, /* Double fork if needed to ensure PID1/subreaper is parent */ } ForkFlags; int safe_fork_full( From 7e14a308cfc1390479af786af96e63395bc3a487 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 11:54:51 +0200 Subject: [PATCH 07/14] =?UTF-8?q?shared:=20move=20async.[ch]=20from=20src/?= =?UTF-8?q?basic/=20=E2=86=92=20src/shared/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That way we can later add an async wrapper for rm_rf() which is in src/shared/, too. --- src/basic/meson.build | 1 - src/{basic => shared}/async.c | 0 src/{basic => shared}/async.h | 0 src/shared/meson.build | 1 + 4 files changed, 1 insertion(+), 1 deletion(-) rename src/{basic => shared}/async.c (100%) rename src/{basic => shared}/async.h (100%) diff --git a/src/basic/meson.build b/src/basic/meson.build index 1decb4827f..fc9a925847 100644 --- a/src/basic/meson.build +++ b/src/basic/meson.build @@ -7,7 +7,6 @@ basic_sources = files( 'architecture.c', 'argv-util.c', 'arphrd-util.c', - 'async.c', 'audit-util.c', 'build.c', 'bus-label.c', diff --git a/src/basic/async.c b/src/shared/async.c similarity index 100% rename from src/basic/async.c rename to src/shared/async.c diff --git a/src/basic/async.h b/src/shared/async.h similarity index 100% rename from src/basic/async.h rename to src/shared/async.h diff --git a/src/shared/meson.build b/src/shared/meson.build index 5a982f3184..1e015bd38e 100644 --- a/src/shared/meson.build +++ b/src/shared/meson.build @@ -4,6 +4,7 @@ shared_sources = files( 'acpi-fpdt.c', 'apparmor-util.c', 'ask-password-api.c', + 'async.c', 'barrier.c', 'base-filesystem.c', 'battery-util.c', From 437f3e35b4d580ac99a52e307542aa4370854768 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 11:55:59 +0200 Subject: [PATCH 08/14] async: add generic implementation of asynchronous_rm_rf() This one doesn't use threads anymore. This is the last use of threads in PID 1. Yay! Fixes: #27287 --- src/core/execute.c | 34 ++++++++++-------------------- src/shared/async.c | 25 ++++++++++++++++++++++ src/shared/async.h | 2 ++ src/test/test-async.c | 49 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 23 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index b7fe922c7a..90af8fa619 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -7318,28 +7318,17 @@ int exec_command_append(ExecCommand *c, const char *path, ...) { return 0; } -static void *rm_rf_thread(void *p) { - _cleanup_free_ char *path = p; +static char *destroy_tree(char *path) { + if (!path) + return NULL; - (void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); - return NULL; -} + if (!path_equal(path, RUN_SYSTEMD_EMPTY)) { + log_debug("Spawning process to nuke '%s'", path); -static void asynchronous_rm_rf(char **path) { - int r; + (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); + } - assert(path); - - if (!*path || streq(*path, RUN_SYSTEMD_EMPTY)) - return; - - log_debug("Spawning thread to nuke %s", *path); - - r = asynchronous_job(rm_rf_thread, *path); - if (r < 0) - log_warning_errno(r, "Failed to nuke %s: %m", *path); - else - *path = NULL; + return mfree(path); } static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { @@ -7370,8 +7359,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) { if (rt->n_ref > 0) return NULL; - asynchronous_rm_rf(&rt->tmp_dir); - asynchronous_rm_rf(&rt->var_tmp_dir); + rt->tmp_dir = destroy_tree(rt->tmp_dir); + rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir); return exec_shared_runtime_free(rt); } @@ -7862,9 +7851,8 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) { exec_shared_runtime_unref(rt->shared); dynamic_creds_unref(rt->dynamic_creds); - asynchronous_rm_rf(&rt->ephemeral_copy); + rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy); - free(rt->ephemeral_copy); safe_close_pair(rt->ephemeral_storage_socket); return mfree(rt); } diff --git a/src/shared/async.c b/src/shared/async.c index c0e1641cb2..a98f31d3b8 100644 --- a/src/shared/async.c +++ b/src/shared/async.c @@ -158,3 +158,28 @@ int asynchronous_close(int fd) { return -EBADF; /* return an invalidated fd */ } + +int asynchronous_rm_rf(const char *p, RemoveFlags flags) { + int r; + + assert(p); + + /* Forks off a child that destroys the specified path. This will be best effort only, i.e. the child + * will attempt to do its thing, but we won't wait for it or check its success. */ + + r = safe_fork("(sd-rmrf)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DETACH, NULL); + if (r != 0) + return r; + + /* Child */ + + r = rm_rf(p, flags); + if (r < 0) { + log_debug_errno(r, "Failed to rm -rf '%s', ignoring: %m", p); + _exit(EXIT_FAILURE); /* This is a detached process, hence noone really cares, but who knows + * maybe it's good for debugging/tracing to return an exit code + * indicative of our failure here. */ + } + + _exit(EXIT_SUCCESS); +} diff --git a/src/shared/async.h b/src/shared/async.h index e0bbaa5658..2c7def993d 100644 --- a/src/shared/async.h +++ b/src/shared/async.h @@ -4,10 +4,12 @@ #include #include "macro.h" +#include "rm-rf.h" int asynchronous_job(void* (*func)(void *p), void *arg); int asynchronous_sync(pid_t *ret_pid); int asynchronous_close(int fd); +int asynchronous_rm_rf(const char *p, RemoveFlags flags); DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close); diff --git a/src/test/test-async.c b/src/test/test-async.c index c2bf801196..02028ce323 100644 --- a/src/test/test-async.c +++ b/src/test/test-async.c @@ -2,11 +2,14 @@ #include #include +#include #include #include "async.h" #include "fs-util.h" +#include "path-util.h" #include "process-util.h" +#include "signal-util.h" #include "tests.h" #include "tmpfile-util.h" @@ -59,4 +62,50 @@ TEST(asynchronous_close) { } } +TEST(asynchronous_rm_rf) { + _cleanup_free_ char *t = NULL, *k = NULL; + int r; + + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + assert_se(k = path_join(t, "somefile")); + assert_se(touch(k) >= 0); + assert_se(asynchronous_rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + /* Do this once more, form a subreaper. Which is nice, because we can watch the async child even + * though detached */ + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL); + assert_se(r >= 0); + + if (r == 0) { + _cleanup_free_ char *tt = NULL, *kk = NULL; + + /* child */ + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + assert_se(prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) >= 0); + + assert_se(mkdtemp_malloc(NULL, &tt) >= 0); + assert_se(kk = path_join(tt, "somefile")); + assert_se(touch(kk) >= 0); + assert_se(asynchronous_rm_rf(tt, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + for (;;) { + siginfo_t si = {}; + + assert_se(waitid(P_ALL, 0, &si, WEXITED) >= 0); + + if (access(tt, F_OK) < 0) { + assert_se(errno == ENOENT); + break; + } + + /* wasn't the rm_rf() call. let's wait longer */ + } + + _exit(EXIT_SUCCESS); + } +} + + DEFINE_TEST_MAIN(LOG_DEBUG); From 78b680f99bc23f8e55d3a0be7a58c0d8cda3959d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 12:04:46 +0200 Subject: [PATCH 09/14] async: drop the now unused asynchronous_job() --- src/shared/async.c | 47 ------------------------------------------- src/shared/async.h | 2 -- src/test/test-async.c | 13 +----------- 3 files changed, 1 insertion(+), 61 deletions(-) diff --git a/src/shared/async.c b/src/shared/async.c index a98f31d3b8..cb5e179cdf 100644 --- a/src/shared/async.c +++ b/src/shared/async.c @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #include -#include #include #include #include @@ -15,52 +14,6 @@ #include "process-util.h" #include "signal-util.h" -int asynchronous_job(void* (*func)(void *p), void *arg) { - sigset_t ss, saved_ss; - pthread_attr_t a; - pthread_t t; - int r, k; - - /* It kinda sucks that we have to resort to threads to implement an asynchronous close(), but well, such is - * life. */ - - r = pthread_attr_init(&a); - if (r > 0) - return -r; - - r = pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED); - if (r > 0) { - r = -r; - goto finish; - } - - assert_se(sigfillset(&ss) >= 0); - - /* Block all signals before forking off the thread, so that the new thread is started with all signals - * blocked. This way the existence of the new thread won't affect signal handling in other threads. */ - - r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss); - if (r > 0) { - r = -r; - goto finish; - } - - r = pthread_create(&t, &a, func, arg); - - k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL); - - if (r > 0) - r = -r; - else if (k > 0) - r = -k; - else - r = 0; - -finish: - pthread_attr_destroy(&a); - return r; -} - int asynchronous_sync(pid_t *ret_pid) { int r; diff --git a/src/shared/async.h b/src/shared/async.h index 2c7def993d..71d044dd61 100644 --- a/src/shared/async.h +++ b/src/shared/async.h @@ -6,8 +6,6 @@ #include "macro.h" #include "rm-rf.h" -int asynchronous_job(void* (*func)(void *p), void *arg); - int asynchronous_sync(pid_t *ret_pid); int asynchronous_close(int fd); int asynchronous_rm_rf(const char *p, RemoveFlags flags); diff --git a/src/test/test-async.c b/src/test/test-async.c index 02028ce323..64b94886ff 100644 --- a/src/test/test-async.c +++ b/src/test/test-async.c @@ -13,19 +13,8 @@ #include "tests.h" #include "tmpfile-util.h" -static bool test_async = false; - -static void *async_func(void *arg) { - test_async = true; - - return NULL; -} - -TEST(test_async) { - assert_se(asynchronous_job(async_func, NULL) >= 0); +TEST(test_asynchronous_sync) { assert_se(asynchronous_sync(NULL) >= 0); - - assert_se(test_async); } TEST(asynchronous_close) { From e4687bb8a6a2986642e38b5272694ce21e927679 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 15:09:50 +0200 Subject: [PATCH 10/14] async: add explanatory comment --- src/shared/async.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/shared/async.h b/src/shared/async.h index 71d044dd61..96148f9006 100644 --- a/src/shared/async.h +++ b/src/shared/async.h @@ -6,6 +6,19 @@ #include "macro.h" #include "rm-rf.h" +/* These functions implement various potentially slow operations that are executed asynchronously. They are + * carefully written to not use pthreads, but use fork() or clone() (without CLONE_VM) so that the child does + * not share any memory with the parent process, and thus cannot possibly interfere with the malloc() + * synchronization locks. + * + * Background: glibc only synchronizes malloc() locks when doing fork(), but not when doing clone() + * (regardless if through glibc's own wrapper or ours). This means if another thread in the parent has the + * malloc() lock taken while a thread is cloning, the mutex will remain locked in the child (but the other + * thread won't exist there), with no chance to ever be unlocked again. This will result in deadlocks. Hence + * one has to make the choice: either never use threads in the parent, or never do memory allocation in the + * child, or never use clone()/clone3() and stick to fork() only. Because we need clone()/clone3() we opted + * for avoiding threads. */ + int asynchronous_sync(pid_t *ret_pid); int asynchronous_close(int fd); int asynchronous_rm_rf(const char *p, RemoveFlags flags); From 530f6ada2ef253d726b91e5f8e2f6b204f69aea5 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 15:11:52 +0200 Subject: [PATCH 11/14] async: use FORK_DETACH for asynchronous syncs To get proper "fire-and-forget" feeling we really want to make sure noone has to reap the forked off process. --- src/shared/async.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shared/async.c b/src/shared/async.c index cb5e179cdf..b7ecb9c4b7 100644 --- a/src/shared/async.c +++ b/src/shared/async.c @@ -22,7 +22,7 @@ int asynchronous_sync(pid_t *ret_pid) { * original process ever, and a thread would do that as the process can't exit with threads hanging in blocking * syscalls. */ - r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS, ret_pid); + r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|(ret_pid ? 0 : FORK_DETACH), ret_pid); if (r < 0) return r; if (r == 0) { From 2499d320224d3f9f2a1d8c9a99b09753e62928e0 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 11:53:16 +0200 Subject: [PATCH 12/14] docs: document threading situation in coding style --- docs/CODING_STYLE.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/CODING_STYLE.md b/docs/CODING_STYLE.md index f76525205f..f3eefeaae6 100644 --- a/docs/CODING_STYLE.md +++ b/docs/CODING_STYLE.md @@ -750,3 +750,32 @@ SPDX-License-Identifier: LGPL-2.1-or-later - Reasonable use of non-ASCII Unicode UTF-8 characters in code comments is welcome. If your code comment contains an emoji or two this will certainly brighten the day of the occasional reviewer of your code. Really! 😊 + +## Threading + +- We generally avoid using threads, to the level this is possible. In + particular in the service manager/PID 1 threads are not OK to use. This is + because you cannot mix memory allocation in threads with use of glibc's + `clone()` call, or manual `clone()`/`clone3()` system call wrappers. Only + glibc's own `fork()` call will properly synchronize the memory allocation + locks around the process clone operation. This means that if a process is + cloned via `clone()`/`clone3()` and another thread currently has the + `malloc()` lock taken, it will be cloned in locked state to the child, and + thus can never be acquired in the child, leading to deadlocks. Hence, when + using `clone()`/`clone3()` there are only two ways out: never use threads in the + parent, or never do memory allocation in the child. For our uses we need + `clone()`/`clone3()` and hence decided to avoid threads. Of course, sometimes the + concurrency threads allow is beneficial, however we suggest forking off + worker *processes* rather than worker *threads* for this purpose, ideally + even with an `execve()` to remove the CoW trap situation `fork()` easily + triggers. + +- A corollary of the above is: never use `clone()` where a `fork()` would do + too. Also consider using `posix_spawn()` which combines `clone()` + + `execve()` into one and has nice properties since it avoids becoming a CoW + trap by using `CLONE_VORK` and `CLONE_VM` together. + +- While we avoid forking off threads on our own, writing thread-safe code is a + good idea where it might end up running inside of libsystemd.so or + similar. Hence, use TLS (i.e. `thread_local`) where appropriate, and maybe + the occasional `pthread_once()`. From 8c3fe1b5b59ebfd6e462245f2ab82097b4f7494d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 22:24:04 +0200 Subject: [PATCH 13/14] process-util: add simple wrapper around PR_SET_CHILD_SUBREAPER Let's a simple helper that knows how to deal with PID == 1. --- src/basic/process-util.c | 18 ++++++++++++++++++ src/basic/process-util.h | 1 + src/core/main.c | 7 +++---- src/nspawn/nspawn.c | 5 +++-- src/test/test-async.c | 4 ++-- src/test/test-process-util.c | 2 +- 6 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/basic/process-util.c b/src/basic/process-util.c index 437e83bc6d..001002a55e 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -1705,6 +1705,24 @@ int is_reaper_process(void) { return b != 0; } +int make_reaper_process(bool b) { + + if (getpid_cached() == 1) { + + if (!b) + return -EINVAL; + + return 0; + } + + /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all, + * to avoid any ambiguities */ + if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0) + return -errno; + + return 0; +} + static const char *const sigchld_code_table[] = { [CLD_EXITED] = "exited", [CLD_KILLED] = "killed", diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 1b77478cf5..8f87fdc2ae 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -209,3 +209,4 @@ _noreturn_ void freeze(void); int get_process_threads(pid_t pid); int is_reaper_process(void); +int make_reaper_process(bool b); diff --git a/src/core/main.c b/src/core/main.c index f067b13fff..208d22f4f3 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -2294,10 +2294,9 @@ static int initialize_runtime( } } - if (arg_runtime_scope == RUNTIME_SCOPE_USER) - /* Become reaper of our children */ - if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) - log_warning_errno(errno, "Failed to make us a subreaper, ignoring: %m"); + r = make_reaper_process(true); + if (r < 0) + log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m"); /* Bump up RLIMIT_NOFILE for systemd itself */ (void) bump_rlimit_nofile(saved_rlimit_nofile); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 97a2c386e6..4fdb5e9fd7 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -5826,8 +5826,9 @@ static int run(int argc, char *argv[]) { assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); - if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) { - r = log_error_errno(errno, "Failed to become subreaper: %m"); + r = make_reaper_process(true); + if (r < 0) { + log_error_errno(r, "Failed to become subreaper: %m"); goto finish; } diff --git a/src/test/test-async.c b/src/test/test-async.c index 64b94886ff..dc0e34b48f 100644 --- a/src/test/test-async.c +++ b/src/test/test-async.c @@ -36,7 +36,7 @@ TEST(asynchronous_close) { if (r == 0) { /* child */ - assert(prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) >= 0); + assert(make_reaper_process(true) >= 0); fd = open("/dev/null", O_RDONLY|O_CLOEXEC); assert_se(fd >= 0); @@ -72,7 +72,7 @@ TEST(asynchronous_rm_rf) { /* child */ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); - assert_se(prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) >= 0); + assert_se(make_reaper_process(true) >= 0); assert_se(mkdtemp_malloc(NULL, &tt) >= 0); assert_se(kk = path_join(tt, "somefile")); diff --git a/src/test/test-process-util.c b/src/test/test-process-util.c index 6de09c3c11..ebf73c54ec 100644 --- a/src/test/test-process-util.c +++ b/src/test/test-process-util.c @@ -931,7 +931,7 @@ TEST(is_reaper_process) { assert_se(r >= 0); if (r == 0) { /* child */ - assert_se(prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) >= 0); + assert_se(make_reaper_process(true) >= 0); assert_se(is_reaper_process() > 0); _exit(EXIT_SUCCESS); From 1ee20371c743402b7c0a15431baec4b671763a21 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Jun 2023 22:24:30 +0200 Subject: [PATCH 14/14] basic: drop unused include --- src/basic/random-util.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/basic/random-util.c b/src/basic/random-util.c index 9a7d3480c0..d4a3c9ca5c 100644 --- a/src/basic/random-util.c +++ b/src/basic/random-util.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include