From a87a9625f8bde776ece11b8ddb77588cfff73038 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Dec 2024 18:51:34 +0100 Subject: [PATCH 01/14] tree-wide: drop acquire_data_fd_full() helper Let's drop support systems lacking memfds, i.e. pre kernel 3.17 systems. This allows us to drastically simplify the "data fd" concept, so far that we can remove it entirely. This replaces acquire_data_fd() with a specialized call to memfd_new_and_seal(), not that memfds can be the only implementation of the concept. --- src/basic/memfd-util.c | 3 + src/basic/memfd-util.h | 3 + src/core/dbus-manager.c | 4 +- src/core/exec-invoke.c | 4 +- src/home/homed-home.c | 3 +- src/home/homework-cifs.c | 4 +- src/network/networkd-serialize.c | 4 +- src/oom/oomd-manager-bus.c | 4 +- src/shared/bus-util.c | 4 +- src/shared/data-fd-util.c | 238 +++++-------------------------- src/shared/data-fd-util.h | 13 -- src/test/test-data-fd-util.c | 54 +------ src/test/test-fd-util.c | 4 +- src/test/test-varlink.c | 12 +- 14 files changed, 65 insertions(+), 289 deletions(-) diff --git a/src/basic/memfd-util.c b/src/basic/memfd-util.c index 92b84f95a6..0ae9a4f777 100644 --- a/src/basic/memfd-util.c +++ b/src/basic/memfd-util.c @@ -177,6 +177,9 @@ int memfd_new_and_seal(const char *name, const void *data, size_t sz) { assert(data || sz == 0); + if (sz == SIZE_MAX) + sz = strlen(data); + fd = memfd_new(name); if (fd < 0) return fd; diff --git a/src/basic/memfd-util.h b/src/basic/memfd-util.h index 9b2103e0ce..020dccb4f6 100644 --- a/src/basic/memfd-util.h +++ b/src/basic/memfd-util.h @@ -11,6 +11,9 @@ int memfd_create_wrapper(const char *name, unsigned mode); int memfd_new(const char *name); int memfd_new_and_map(const char *name, size_t sz, void **p); int memfd_new_and_seal(const char *name, const void *data, size_t sz); +static inline int memfd_new_and_seal_string(const char *name, const char *s) { + return memfd_new_and_seal(name, s, SIZE_MAX); +} int memfd_add_seals(int fd, unsigned int seals); int memfd_get_seals(int fd, unsigned int *ret_seals); diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 20b05446b6..4e9ea8ac27 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -14,7 +14,6 @@ #include "bus-util.h" #include "chase.h" #include "confidential-virt.h" -#include "data-fd-util.h" #include "dbus-cgroup.h" #include "dbus-execute.h" #include "dbus-job.h" @@ -33,6 +32,7 @@ #include "locale-util.h" #include "log.h" #include "manager-dump.h" +#include "memfd-util.h" #include "os-util.h" #include "parse-util.h" #include "path-util.h" @@ -1447,7 +1447,7 @@ static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *er static int reply_dump_by_fd(sd_bus_message *message, char *dump) { _cleanup_close_ int fd = -EBADF; - fd = acquire_data_fd(dump); + fd = memfd_new_and_seal_string("dump", dump); if (fd < 0) return fd; diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 142c539d45..ff616813f6 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -31,7 +31,6 @@ #include "chattr-util.h" #include "chown-recursive.h" #include "copy.h" -#include "data-fd-util.h" #include "env-util.h" #include "escape.h" #include "exec-credential.h" @@ -44,6 +43,7 @@ #include "io-util.h" #include "iovec-util.h" #include "journal-send.h" +#include "memfd-util.h" #include "missing_ioprio.h" #include "missing_prctl.h" #include "missing_sched.h" @@ -406,7 +406,7 @@ static int setup_input( case EXEC_INPUT_DATA: { int fd; - fd = acquire_data_fd_full(context->stdin_data, context->stdin_data_size, /* flags = */ 0); + fd = memfd_new_and_seal("exec-input", context->stdin_data, context->stdin_data_size); if (fd < 0) return fd; diff --git a/src/home/homed-home.c b/src/home/homed-home.c index 32691e4f81..751b197be8 100644 --- a/src/home/homed-home.c +++ b/src/home/homed-home.c @@ -13,7 +13,6 @@ #include "build-path.h" #include "bus-common-errors.h" #include "bus-locator.h" -#include "data-fd-util.h" #include "env-util.h" #include "errno-list.h" #include "errno-util.h" @@ -1266,7 +1265,7 @@ static int home_start_work( if (r < 0) return r; - stdin_fd = acquire_data_fd(formatted); + stdin_fd = memfd_new_and_seal_string("request", formatted); if (stdin_fd < 0) return stdin_fd; diff --git a/src/home/homework-cifs.c b/src/home/homework-cifs.c index edc9a4b274..f665c75d7f 100644 --- a/src/home/homework-cifs.c +++ b/src/home/homework-cifs.c @@ -5,7 +5,6 @@ #include #endif -#include "data-fd-util.h" #include "dirent-util.h" #include "fd-util.h" #include "fileio.h" @@ -13,6 +12,7 @@ #include "fs-util.h" #include "homework-cifs.h" #include "homework-mount.h" +#include "memfd-util.h" #include "mkdir.h" #include "mount-util.h" #include "process-util.h" @@ -76,7 +76,7 @@ int home_setup_cifs( pid_t mount_pid; int exit_status; - passwd_fd = acquire_data_fd(*pw); + passwd_fd = memfd_new_and_seal_string("cifspw", *pw); if (passwd_fd < 0) return log_error_errno(passwd_fd, "Failed to create data FD for password: %m"); diff --git a/src/network/networkd-serialize.c b/src/network/networkd-serialize.c index 26a38bd4dd..6e9536e182 100644 --- a/src/network/networkd-serialize.c +++ b/src/network/networkd-serialize.c @@ -2,11 +2,11 @@ #include "af-list.h" #include "daemon-util.h" -#include "data-fd-util.h" #include "fd-util.h" #include "fileio.h" #include "iovec-util.h" #include "json-util.h" +#include "memfd-util.h" #include "networkd-address.h" #include "networkd-json.h" #include "networkd-link.h" @@ -69,7 +69,7 @@ int manager_serialize(Manager *manager) { return r; _cleanup_close_ int fd = -EBADF; - fd = acquire_data_fd(dump); + fd = memfd_new_and_seal_string("serialization", dump); if (fd < 0) return fd; diff --git a/src/oom/oomd-manager-bus.c b/src/oom/oomd-manager-bus.c index 7d2edb5561..77bd31e4e7 100644 --- a/src/oom/oomd-manager-bus.c +++ b/src/oom/oomd-manager-bus.c @@ -4,8 +4,8 @@ #include "bus-common-errors.h" #include "bus-polkit.h" -#include "data-fd-util.h" #include "fd-util.h" +#include "memfd-util.h" #include "oomd-manager-bus.h" #include "oomd-manager.h" #include "user-util.h" @@ -22,7 +22,7 @@ static int bus_method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus if (r < 0) return r; - fd = acquire_data_fd(dump); + fd = memfd_new_and_seal_string("oomd-dump", dump); if (fd < 0) return fd; diff --git a/src/shared/bus-util.c b/src/shared/bus-util.c index 8313dfdd14..bfb1f52c7f 100644 --- a/src/shared/bus-util.c +++ b/src/shared/bus-util.c @@ -21,10 +21,10 @@ #include "capsule-util.h" #include "chase.h" #include "daemon-util.h" -#include "data-fd-util.h" #include "env-util.h" #include "fd-util.h" #include "format-util.h" +#include "memfd-util.h" #include "memstream-util.h" #include "path-util.h" #include "socket-util.h" @@ -803,7 +803,7 @@ static int method_dump_memory_state_by_fd(sd_bus_message *message, void *userdat if (r < 0) return r; - fd = acquire_data_fd_full(dump, dump_size, /* flags = */ 0); + fd = memfd_new_and_seal("malloc-info", dump, dump_size); if (fd < 0) return fd; diff --git a/src/shared/data-fd-util.c b/src/shared/data-fd-util.c index adc7d7417e..b948ab6cc4 100644 --- a/src/shared/data-fd-util.c +++ b/src/shared/data-fd-util.c @@ -19,137 +19,12 @@ #include "missing_syscall.h" #include "tmpfile-util.h" -/* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */ +/* When the data is smaller or equal to 64K, try to place the copy in a memfd */ #define DATA_FD_MEMORY_LIMIT (64U * U64_KB) -/* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's large than that use /var/tmp instead. */ +/* If memfd didn't work out, then let's use a file in /tmp up to a size of 1M. If it's large than that use /var/tmp/ instead. */ #define DATA_FD_TMP_LIMIT (1U * U64_MB) -int acquire_data_fd_full(const void *data, size_t size, DataFDFlags flags) { - _cleanup_close_ int fd = -EBADF; - ssize_t n; - int r; - - assert(data || size == 0); - - /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more - * complex than I wish it was. But here's why: - * - * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them - * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14. - * - * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining - * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged - * clients can only bump their size to a system-wide limit, which might be quite low. - * - * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from - * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via - * /proc/self/. Unfortunately O_TMPFILE is not available on older kernels on tmpfs. - * - * d) Finally, we try creating a regular file in /dev/shm, which we then delete. - * - * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I - * figure. */ - - if (size == SIZE_MAX) - size = strlen(data); - - if (size == 0 && !FLAGS_SET(flags, ACQUIRE_NO_DEV_NULL)) - /* As a special case, return /dev/null if we have been called for an empty data block */ - return RET_NERRNO(open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY)); - - if (!FLAGS_SET(flags, ACQUIRE_NO_MEMFD)) { - fd = memfd_new_and_seal("data-fd", data, size); - if (fd < 0 && !ERRNO_IS_NOT_SUPPORTED(fd)) - return fd; - if (fd >= 0) - return TAKE_FD(fd); - } - - if (!FLAGS_SET(flags, ACQUIRE_NO_PIPE)) { - _cleanup_close_pair_ int pipefds[2] = EBADF_PAIR; - int isz; - - if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) - return -errno; - - isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); - if (isz < 0) - return -errno; - - if ((size_t) isz < size) { - isz = (int) size; - if (isz < 0 || (size_t) isz != size) - return -E2BIG; - - /* Try to bump the pipe size */ - (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz); - - /* See if that worked */ - isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); - if (isz < 0) - return -errno; - - if ((size_t) isz < size) - goto try_dev_shm; - } - - n = write(pipefds[1], data, size); - if (n < 0) - return -errno; - if ((size_t) n != size) - return -EIO; - - (void) fd_nonblock(pipefds[0], false); - - return TAKE_FD(pipefds[0]); - } - -try_dev_shm: - if (!FLAGS_SET(flags, ACQUIRE_NO_TMPFILE)) { - fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500); - if (fd < 0) - goto try_dev_shm_without_o_tmpfile; - - n = write(fd, data, size); - if (n < 0) - return -errno; - if ((size_t) n != size) - return -EIO; - - /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ - return fd_reopen(fd, O_RDONLY|O_CLOEXEC); - } - -try_dev_shm_without_o_tmpfile: - if (!FLAGS_SET(flags, ACQUIRE_NO_REGULAR)) { - char pattern[] = "/dev/shm/data-fd-XXXXXX"; - - fd = mkostemp_safe(pattern); - if (fd < 0) - return fd; - - n = write(fd, data, size); - if (n < 0) { - r = -errno; - goto unlink_and_return; - } - if ((size_t) n != size) { - r = -EIO; - goto unlink_and_return; - } - - /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ - r = fd_reopen(fd, O_RDONLY|O_CLOEXEC); - - unlink_and_return: - (void) unlink(pattern); - return r; - } - - return -EOPNOTSUPP; -} - int copy_data_fd(int fd) { _cleanup_close_ int copy_fd = -EBADF, tmp_fd = -EBADF; _cleanup_free_ void *remains = NULL; @@ -158,11 +33,11 @@ int copy_data_fd(int fd) { struct stat st; int r; - /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but - * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be - * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported - * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in - * /var/tmp. */ + /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only + * fashion, but independent of it (i.e. the source fd can be closed and unmounted after this call + * succeeded). Tries to be somewhat smart about where to place the data. In the best case uses a + * memfd(). For larger data will use an unlinked file in /tmp/, and for even larger data one in + * /var/tmp/. */ if (fstat(fd, &st) < 0) return -errno; @@ -175,96 +50,49 @@ int copy_data_fd(int fd) { if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode)) return -EBADFD; - /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note - * that we use the reported regular file size only as a hint, given that there are plenty special files in - * /proc and /sys which report a zero file size but can be read from. */ + /* If we have reason to believe the data is bounded in size, then let's use memfds as backing + * fd. Note that we use the reported regular file size only as a hint, given that there are plenty + * special files in /proc/ and /sys/ which report a zero file size but can be read from. */ if (!S_ISREG(st.st_mode) || (uint64_t) st.st_size < DATA_FD_MEMORY_LIMIT) { /* Try a memfd first */ copy_fd = memfd_new("data-fd"); - if (copy_fd >= 0) { - off_t f; + if (copy_fd < 0) + return copy_fd; - r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0); + r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, COPY_REFLINK); + if (r < 0) + return r; + + off_t f = lseek(copy_fd, 0, SEEK_SET); + if (f < 0) + return -errno; + if (f != 0) + return -EIO; + + if (r == 0) { + /* Did it fit into the limit? If so, we are done. */ + r = memfd_set_sealed(copy_fd); if (r < 0) return r; - f = lseek(copy_fd, 0, SEEK_SET); - if (f != 0) - return -errno; - - if (r == 0) { - /* Did it fit into the limit? If so, we are done. */ - r = memfd_set_sealed(copy_fd); - if (r < 0) - return r; - - return TAKE_FD(copy_fd); - } - - /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */ - - } else { - _cleanup_close_pair_ int pipefds[2] = EBADF_PAIR; - int isz; - - /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather - * then block indefinitely when we hit the pipe size limit */ - - if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) - return -errno; - - isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); - if (isz < 0) - return -errno; - - /* Try to enlarge the pipe size if necessary */ - if ((size_t) isz < DATA_FD_MEMORY_LIMIT) { - - (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT); - - isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); - if (isz < 0) - return -errno; - } - - if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) { - - r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL); - if (r < 0 && r != -EAGAIN) - return r; /* If we get EAGAIN it could be because of the source or because of - * the destination fd, we can't know, as sendfile() and friends won't - * tell us. Hence, treat this as reason to fall back, just to be - * sure. */ - if (r == 0) { - /* Everything fit in, yay! */ - (void) fd_nonblock(pipefds[0], false); - - return TAKE_FD(pipefds[0]); - } - - /* Things didn't fit in. But we read data into the pipe, let's remember that, so that - * when writing the new file we incorporate this first. */ - copy_fd = TAKE_FD(pipefds[0]); - } + return TAKE_FD(copy_fd); } } /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */ if ((!S_ISREG(st.st_mode) || (uint64_t) st.st_size < DATA_FD_TMP_LIMIT) && (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) { - off_t f; - tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC); if (tmp_fd < 0) return tmp_fd; if (copy_fd >= 0) { - /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the + /* If we tried a memfd first and it ended up being too large, then copy this into the * temporary file first. */ - r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0); + r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK); if (r < 0) return r; @@ -287,9 +115,11 @@ int copy_data_fd(int fd) { goto finish; /* Yay, it fit in */ /* It didn't fit in. Let's not forget to use what we already used */ - f = lseek(tmp_fd, 0, SEEK_SET); - if (f != 0) + off_t f = lseek(tmp_fd, 0, SEEK_SET); + if (f < 0) return -errno; + if (f != 0) + return -EIO; close_and_replace(copy_fd, tmp_fd); @@ -297,7 +127,7 @@ int copy_data_fd(int fd) { remains_size = 0; } - /* As last fallback use /var/tmp */ + /* As last fallback use /var/tmp/ */ r = var_tmp_dir(&td); if (r < 0) return r; @@ -307,7 +137,7 @@ int copy_data_fd(int fd) { return tmp_fd; if (copy_fd >= 0) { - /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, than copy this + /* If we tried a memfd first, or a file in /tmp/, and it ended up being too large, than copy this * into the temporary file first. */ r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK); if (r < 0) diff --git a/src/shared/data-fd-util.h b/src/shared/data-fd-util.h index d77e09fb06..db03fba251 100644 --- a/src/shared/data-fd-util.h +++ b/src/shared/data-fd-util.h @@ -4,18 +4,5 @@ #include #include -typedef enum DataFDFlags { - ACQUIRE_NO_DEV_NULL = 1 << 0, - ACQUIRE_NO_MEMFD = 1 << 1, - ACQUIRE_NO_PIPE = 1 << 2, - ACQUIRE_NO_TMPFILE = 1 << 3, - ACQUIRE_NO_REGULAR = 1 << 4, -} DataFDFlags; - -int acquire_data_fd_full(const void *data, size_t size, DataFDFlags flags); -static inline int acquire_data_fd(const void *data) { - return acquire_data_fd_full(data, SIZE_MAX, 0); -} - int copy_data_fd(int fd); int memfd_clone_fd(int fd, const char *name, int mode); diff --git a/src/test/test-data-fd-util.c b/src/test/test-data-fd-util.c index 4abd7a6519..bdbceee171 100644 --- a/src/test/test-data-fd-util.c +++ b/src/test/test-data-fd-util.c @@ -7,57 +7,11 @@ #include "data-fd-util.h" #include "fd-util.h" +#include "memfd-util.h" #include "memory-util.h" #include "process-util.h" -#include "tests.h" #include "random-util.h" - -static void test_acquire_data_fd_one(unsigned flags) { - char wbuffer[196*1024 - 7]; - char rbuffer[sizeof(wbuffer)]; - int fd; - - fd = acquire_data_fd_full("foo", 3, flags); - assert_se(fd >= 0); - - zero(rbuffer); - assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 3); - ASSERT_STREQ(rbuffer, "foo"); - - fd = safe_close(fd); - - fd = acquire_data_fd_full("", SIZE_MAX, flags); - assert_se(fd >= 0); - - zero(rbuffer); - assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 0); - ASSERT_STREQ(rbuffer, ""); - - fd = safe_close(fd); - - random_bytes(wbuffer, sizeof(wbuffer)); - - fd = acquire_data_fd_full(wbuffer, sizeof(wbuffer), flags); - assert_se(fd >= 0); - - zero(rbuffer); - assert_se(read(fd, rbuffer, sizeof(rbuffer)) == sizeof(rbuffer)); - assert_se(memcmp(rbuffer, wbuffer, sizeof(rbuffer)) == 0); - - fd = safe_close(fd); -} - -TEST(acquire_data_fd) { - test_acquire_data_fd_one(0); - test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL); - test_acquire_data_fd_one(ACQUIRE_NO_MEMFD); - test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD); - test_acquire_data_fd_one(ACQUIRE_NO_PIPE); - test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_PIPE); - test_acquire_data_fd_one(ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE); - test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE); - test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE|ACQUIRE_NO_TMPFILE); -} +#include "tests.h" static void assert_equal_fd(int fd1, int fd2) { for (;;) { @@ -98,14 +52,14 @@ TEST(copy_data_fd) { fd1 = safe_close(fd1); fd2 = safe_close(fd2); - fd1 = acquire_data_fd("hallo"); + fd1 = memfd_new_and_seal_string("data", "hallo"); assert_se(fd1 >= 0); fd2 = copy_data_fd(fd1); assert_se(fd2 >= 0); safe_close(fd1); - fd1 = acquire_data_fd("hallo"); + fd1 = memfd_new_and_seal_string("data", "hallo"); assert_se(fd1 >= 0); assert_equal_fd(fd1, fd2); diff --git a/src/test/test-fd-util.c b/src/test/test-fd-util.c index a359efa052..a0d0bc731a 100644 --- a/src/test/test-fd-util.c +++ b/src/test/test-fd-util.c @@ -6,11 +6,11 @@ #include #include "alloc-util.h" -#include "data-fd-util.h" #include "fd-util.h" #include "fileio.h" #include "fs-util.h" #include "macro.h" +#include "memfd-util.h" #include "memory-util.h" #include "missing_syscall.h" #include "mkdir.h" @@ -203,7 +203,7 @@ TEST(rearrange_stdio) { assert_se(pipe_read_fd >= 3); assert_se(open("/dev/full", O_WRONLY|O_CLOEXEC) == 0); - assert_se(acquire_data_fd("foobar") == 2); + assert_se(memfd_new_and_seal_string("data", "foobar") == 2); assert_se(rearrange_stdio(2, 0, 1) >= 0); diff --git a/src/test/test-varlink.c b/src/test/test-varlink.c index bd1d940585..fe54ec08a5 100644 --- a/src/test/test-varlink.c +++ b/src/test/test-varlink.c @@ -8,9 +8,9 @@ #include "sd-json.h" #include "sd-varlink.h" -#include "data-fd-util.h" #include "fd-util.h" #include "json-util.h" +#include "memfd-util.h" #include "rm-rf.h" #include "strv.h" #include "tests.h" @@ -134,8 +134,8 @@ static int method_passfd(sd_varlink *link, sd_json_variant *parameters, sd_varli test_fd(yy, "bar", 3); test_fd(zz, "quux", 4); - _cleanup_close_ int vv = acquire_data_fd("miau"); - _cleanup_close_ int ww = acquire_data_fd("wuff"); + _cleanup_close_ int vv = memfd_new_and_seal_string("data", "miau"); + _cleanup_close_ int ww = memfd_new_and_seal_string("data", "wuff"); assert_se(vv >= 0); assert_se(ww >= 0); @@ -284,9 +284,9 @@ static void *thread(void *arg) { assert_se(sd_json_variant_integer(sd_json_variant_by_key(o, "sum")) == 88 + 99); assert_se(!e); - int fd1 = acquire_data_fd("foo"); - int fd2 = acquire_data_fd("bar"); - int fd3 = acquire_data_fd("quux"); + int fd1 = memfd_new_and_seal_string("data", "foo"); + int fd2 = memfd_new_and_seal_string("data", "bar"); + int fd3 = memfd_new_and_seal_string("data", "quux"); assert_se(fd1 >= 0); assert_se(fd2 >= 0); From db5381c49c671df28136a7025454bf85e555c0d9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Dec 2024 18:55:00 +0100 Subject: [PATCH 02/14] memfd-util: simplify memfd_new_and_seal() Let's use pwrite() to write the contents of the memfd. This has the benefit of not moving the file offset, which means we don't have to reset it after at all. --- src/basic/memfd-util.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/basic/memfd-util.c b/src/basic/memfd-util.c index 0ae9a4f777..9ceb18cc2f 100644 --- a/src/basic/memfd-util.c +++ b/src/basic/memfd-util.c @@ -171,8 +171,6 @@ int memfd_new_and_map(const char *name, size_t sz, void **p) { int memfd_new_and_seal(const char *name, const void *data, size_t sz) { _cleanup_close_ int fd = -EBADF; - ssize_t n; - off_t f; int r; assert(data || sz == 0); @@ -185,15 +183,11 @@ int memfd_new_and_seal(const char *name, const void *data, size_t sz) { return fd; if (sz > 0) { - n = write(fd, data, sz); + ssize_t n = pwrite(fd, data, sz, 0); if (n < 0) return -errno; if ((size_t) n != sz) return -EIO; - - f = lseek(fd, 0, SEEK_SET); - if (f != 0) - return -errno; } r = memfd_set_sealed(fd); From e1c52c9238718347a2b74a0f9abaf76cdd36902f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Dec 2024 18:55:59 +0100 Subject: [PATCH 03/14] memfd-util: short memfd_clone_fd() --- src/shared/data-fd-util.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/shared/data-fd-util.c b/src/shared/data-fd-util.c index b948ab6cc4..4ef4100564 100644 --- a/src/shared/data-fd-util.c +++ b/src/shared/data-fd-util.c @@ -200,17 +200,11 @@ int memfd_clone_fd(int fd, const char *name, int mode) { return r; if (ro) { - _cleanup_close_ int rfd = -EBADF; - r = memfd_set_sealed(mfd); if (r < 0) return r; - rfd = fd_reopen(mfd, mode); - if (rfd < 0) - return rfd; - - return TAKE_FD(rfd); + return fd_reopen(mfd, mode); } off_t f = lseek(mfd, 0, SEEK_SET); From 52cd287933b0f116913ddcc9065fdbf4a81f20c8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Dec 2024 18:59:15 +0100 Subject: [PATCH 04/14] serialize: drop memfd fallback when serializing --- src/shared/serialize.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/shared/serialize.c b/src/shared/serialize.c index 735caf4978..eb490aa2c4 100644 --- a/src/shared/serialize.c +++ b/src/shared/serialize.c @@ -547,21 +547,12 @@ void deserialize_ratelimit(RateLimit *rl, const char *name, const char *value) { } int open_serialization_fd(const char *ident) { - int fd; - fd = memfd_create_wrapper(ident, MFD_CLOEXEC | MFD_NOEXEC_SEAL); - if (fd < 0) { - const char *path; - - path = getpid_cached() == 1 ? "/run/systemd" : "/tmp"; - fd = open_tmpfile_unlinkable(path, O_RDWR|O_CLOEXEC); - if (fd < 0) - return fd; - - log_debug("Serializing %s to %s.", ident, path); - } else - log_debug("Serializing %s to memfd.", ident); + int fd = memfd_create_wrapper(ident, MFD_CLOEXEC | MFD_NOEXEC_SEAL); + if (fd < 0) + return fd; + log_debug("Serializing %s to memfd.", ident); return fd; } From ce66a2f2bbfe216207407f27552d6c3f8a6fe4bf Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Dec 2024 19:05:41 +0100 Subject: [PATCH 05/14] sd-journal: drop memfd fallback --- src/libsystemd/sd-journal/journal-send.c | 33 ++++++------------------ 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/src/libsystemd/sd-journal/journal-send.c b/src/libsystemd/sd-journal/journal-send.c index 7d02b57d7b..bbfe9cf0fa 100644 --- a/src/libsystemd/sd-journal/journal-send.c +++ b/src/libsystemd/sd-journal/journal-send.c @@ -233,7 +233,6 @@ _public_ int sd_journal_sendv(const struct iovec *iov, int n) { }; ssize_t k; bool have_syslog_identifier = false; - bool seal = true; assert_return(iov, -EINVAL); assert_return(n > 0, -EINVAL); @@ -312,35 +311,19 @@ _public_ int sd_journal_sendv(const struct iovec *iov, int n) { if (!IN_SET(errno, EMSGSIZE, ENOBUFS, EAGAIN)) return -errno; - /* Message doesn't fit... Let's dump the data in a memfd or - * temporary file and just pass a file descriptor of it to the - * other side. - * - * For the temporary files we use /dev/shm instead of /tmp - * here, since we want this to be a tmpfs, and one that is - * available from early boot on and where unprivileged users - * can create files. */ - buffer_fd = memfd_new(NULL); - if (buffer_fd < 0) { - if (buffer_fd == -ENOSYS) { - buffer_fd = open_tmpfile_unlinkable("/dev/shm", O_RDWR | O_CLOEXEC); - if (buffer_fd < 0) - return buffer_fd; - - seal = false; - } else - return buffer_fd; - } + /* Message doesn't fit... Let's dump the data in a memfd or temporary file and just pass a file + * descriptor of it to the other side. */ + buffer_fd = memfd_new("journal-data"); + if (buffer_fd < 0) + return buffer_fd; n = writev(buffer_fd, w, j); if (n < 0) return -errno; - if (seal) { - r = memfd_set_sealed(buffer_fd); - if (r < 0) - return r; - } + r = memfd_set_sealed(buffer_fd); + if (r < 0) + return r; r = send_one_fd_sa(fd, buffer_fd, mh.msg_name, mh.msg_namelen, 0); if (r == -ENOENT) From caf1436ee8adab3cf3d50dbefc76f1dd5e64e281 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Dec 2024 11:04:03 +0100 Subject: [PATCH 06/14] memfd-util: use TASK_COMM_LEN at one more place Note this corrects the size of the array from 17 to 16, as the 16 already includes space for a trailing NUL. --- src/basic/memfd-util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/memfd-util.c b/src/basic/memfd-util.c index 9ceb18cc2f..96bbba35e6 100644 --- a/src/basic/memfd-util.c +++ b/src/basic/memfd-util.c @@ -40,7 +40,7 @@ int memfd_new(const char *name) { _cleanup_free_ char *g = NULL; if (!name) { - char pr[17] = {}; + char pr[TASK_COMM_LEN] = {}; /* If no name is specified we generate one. We include * a hint indicating our library implementation, and From 9b1d97cccd0d8b2b52ccc03c96b53691059625b1 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Dec 2024 11:27:58 +0100 Subject: [PATCH 07/14] memfd-util: explain what memfd_create_wrapper() is for in a comment --- src/basic/memfd-util.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/basic/memfd-util.c b/src/basic/memfd-util.c index 96bbba35e6..42ceb93545 100644 --- a/src/basic/memfd-util.c +++ b/src/basic/memfd-util.c @@ -24,6 +24,11 @@ int memfd_create_wrapper(const char *name, unsigned mode) { unsigned mode_compat; int mfd; + assert(name); + + /* Wrapper around memfd_create() which adds compat with older kernels where memfd_create() didn't + * support MFD_EXEC/MFD_NOEXEC_SEAL. (kernel 6.3+) */ + mfd = RET_NERRNO(memfd_create(name, mode)); if (mfd != -EINVAL) return mfd; From 4d98709cb2d04525e3e96917128dfa7fd58a5125 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Dec 2024 11:28:46 +0100 Subject: [PATCH 08/14] memfd-util: introduce memfd_new_full() helper This is just like memfd_new(), but allows fine grained control of the sealing flags. This switches over all uses of memfd_new() where we actually want sealing to use memfd_new_full(). This then allows use to use memfd_new() for two further calls, where we previously used the more lowlevel memfd_create_wrapper(). --- src/basic/memfd-util.c | 8 +++++--- src/basic/memfd-util.h | 7 ++++++- src/home/homed-home.c | 2 +- src/libsystemd/sd-journal/journal-send.c | 3 ++- src/shared/data-fd-util.c | 2 +- src/shared/serialize.c | 3 ++- src/test/test-memfd-util.c | 3 ++- 7 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/basic/memfd-util.c b/src/basic/memfd-util.c index 42ceb93545..3bcb5c9582 100644 --- a/src/basic/memfd-util.c +++ b/src/basic/memfd-util.c @@ -41,7 +41,7 @@ int memfd_create_wrapper(const char *name, unsigned mode) { return RET_NERRNO(memfd_create(name, mode_compat)); } -int memfd_new(const char *name) { +int memfd_new_full(const char *name, unsigned extra_flags) { _cleanup_free_ char *g = NULL; if (!name) { @@ -70,7 +70,9 @@ int memfd_new(const char *name) { } } - return memfd_create_wrapper(name, MFD_ALLOW_SEALING | MFD_CLOEXEC | MFD_NOEXEC_SEAL); + return memfd_create_wrapper( + name, + MFD_CLOEXEC | MFD_NOEXEC_SEAL | extra_flags); } int memfd_add_seals(int fd, unsigned int seals) { @@ -183,7 +185,7 @@ int memfd_new_and_seal(const char *name, const void *data, size_t sz) { if (sz == SIZE_MAX) sz = strlen(data); - fd = memfd_new(name); + fd = memfd_new_full(name, MFD_ALLOW_SEALING); if (fd < 0) return fd; diff --git a/src/basic/memfd-util.h b/src/basic/memfd-util.h index 020dccb4f6..06af55c1b0 100644 --- a/src/basic/memfd-util.h +++ b/src/basic/memfd-util.h @@ -8,8 +8,13 @@ int memfd_create_wrapper(const char *name, unsigned mode); -int memfd_new(const char *name); +int memfd_new_full(const char *name, unsigned extra_flags); +static inline int memfd_new(const char *name) { + return memfd_new_full(name, 0); +} + int memfd_new_and_map(const char *name, size_t sz, void **p); + int memfd_new_and_seal(const char *name, const void *data, size_t sz); static inline int memfd_new_and_seal_string(const char *name, const char *s) { return memfd_new_and_seal(name, s, SIZE_MAX); diff --git a/src/home/homed-home.c b/src/home/homed-home.c index 751b197be8..fd9d90be90 100644 --- a/src/home/homed-home.c +++ b/src/home/homed-home.c @@ -1271,7 +1271,7 @@ static int home_start_work( log_debug("Sending to worker: %s", formatted); - stdout_fd = memfd_create_wrapper("homework-stdout", MFD_CLOEXEC | MFD_NOEXEC_SEAL); + stdout_fd = memfd_new("homework-stdout"); if (stdout_fd < 0) return stdout_fd; diff --git a/src/libsystemd/sd-journal/journal-send.c b/src/libsystemd/sd-journal/journal-send.c index bbfe9cf0fa..93f9fc9df0 100644 --- a/src/libsystemd/sd-journal/journal-send.c +++ b/src/libsystemd/sd-journal/journal-send.c @@ -22,6 +22,7 @@ #include "iovec-util.h" #include "journal-send.h" #include "memfd-util.h" +#include "missing_mman.h" #include "missing_syscall.h" #include "process-util.h" #include "socket-util.h" @@ -313,7 +314,7 @@ _public_ int sd_journal_sendv(const struct iovec *iov, int n) { /* Message doesn't fit... Let's dump the data in a memfd or temporary file and just pass a file * descriptor of it to the other side. */ - buffer_fd = memfd_new("journal-data"); + buffer_fd = memfd_new_full("journal-data", MFD_ALLOW_SEALING); if (buffer_fd < 0) return buffer_fd; diff --git a/src/shared/data-fd-util.c b/src/shared/data-fd-util.c index 4ef4100564..2a9549f0c2 100644 --- a/src/shared/data-fd-util.c +++ b/src/shared/data-fd-util.c @@ -57,7 +57,7 @@ int copy_data_fd(int fd) { if (!S_ISREG(st.st_mode) || (uint64_t) st.st_size < DATA_FD_MEMORY_LIMIT) { /* Try a memfd first */ - copy_fd = memfd_new("data-fd"); + copy_fd = memfd_new_full("data-fd", MFD_ALLOW_SEALING); if (copy_fd < 0) return copy_fd; diff --git a/src/shared/serialize.c b/src/shared/serialize.c index eb490aa2c4..8fce19ce90 100644 --- a/src/shared/serialize.c +++ b/src/shared/serialize.c @@ -547,8 +547,9 @@ void deserialize_ratelimit(RateLimit *rl, const char *name, const char *value) { } int open_serialization_fd(const char *ident) { + assert(ident); - int fd = memfd_create_wrapper(ident, MFD_CLOEXEC | MFD_NOEXEC_SEAL); + int fd = memfd_new(ident); if (fd < 0) return fd; diff --git a/src/test/test-memfd-util.c b/src/test/test-memfd-util.c index f8e1b46075..d38397a3f3 100644 --- a/src/test/test-memfd-util.c +++ b/src/test/test-memfd-util.c @@ -5,6 +5,7 @@ #include "errno-util.h" #include "fd-util.h" #include "memfd-util.h" +#include "missing_mman.h" #include "string-util.h" #include "tests.h" @@ -12,7 +13,7 @@ TEST(memfd_get_sealed) { #define TEST_TEXT "this is some random test text we are going to write to a memfd" _cleanup_close_ int fd = -EBADF; - fd = memfd_new("test-memfd-get-sealed"); + fd = memfd_new_full("test-memfd-get-sealed", MFD_ALLOW_SEALING); if (fd < 0) { assert_se(ERRNO_IS_NOT_SUPPORTED(fd)); return; From 5d1e57b820381700f50d5d1a0c1d6df0dc8b244a Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Dec 2024 11:29:52 +0100 Subject: [PATCH 09/14] serialize: add explicit calls for finishing serialization These new calls will do three things: 1. in case of FILE* stuff: flush any pending bytes onto the fd, just in case 2. seal the backing memfd 3. seek back to the beginning. Note that this adds sealing to serialization: once we serialized fully, we'll seal the thing off for further modifications, before we pass the fd over to the target process. This should add a bit of robustness, and maybe finds a bug or two one day, if we accidentally write to a serialization that is complete. --- src/core/execute.c | 5 +++-- src/core/main.c | 16 +++++++++------- src/core/manager-serialize.c | 4 ---- src/core/manager.c | 6 ++++-- src/shared/exec-util.c | 11 +++++++---- src/shared/serialize.c | 31 +++++++++++++++++++++++++++++-- src/shared/serialize.h | 3 +++ src/test/test-fd-util.c | 4 ++++ 8 files changed, 59 insertions(+), 21 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index 3c8d4c2be1..f12667f175 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -516,8 +516,9 @@ int exec_spawn( if (r < 0) return log_unit_error_errno(unit, r, "Failed to serialize parameters: %m"); - if (fseeko(f, 0, SEEK_SET) < 0) - return log_unit_error_errno(unit, errno, "Failed to reseek on serialization stream: %m"); + r = finish_serialization_file(f); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to finish serialization stream: %m"); r = fd_cloexec(fileno(f), false); if (r < 0) diff --git a/src/core/main.c b/src/core/main.c index 172742c769..693293ac9c 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -34,12 +34,12 @@ #include "clock-warp.h" #include "conf-parser.h" #include "confidential-virt.h" +#include "constants.h" #include "copy.h" #include "cpu-set-util.h" #include "crash-handler.h" #include "dbus-manager.h" #include "dbus.h" -#include "constants.h" #include "dev-setup.h" #include "efi-random.h" #include "efivars.h" @@ -87,6 +87,7 @@ #include "seccomp-util.h" #include "selinux-setup.h" #include "selinux-util.h" +#include "serialize.h" #include "signal-util.h" #include "smack-setup.h" #include "special.h" @@ -1233,14 +1234,14 @@ static int prepare_reexecute( assert(ret_f); assert(ret_fds); - r = manager_open_serialization(m, &f); - if (r < 0) - return log_error_errno(r, "Failed to create serialization file: %m"); - /* Make sure nothing is really destructed when we shut down */ m->n_reloading++; bus_manager_send_reloading(m, true); + r = manager_open_serialization(m, &f); + if (r < 0) + return log_error_errno(r, "Failed to create serialization file: %m"); + fds = fdset_new(); if (!fds) return log_oom(); @@ -1249,8 +1250,9 @@ static int prepare_reexecute( if (r < 0) return r; - if (fseeko(f, 0, SEEK_SET) < 0) - return log_error_errno(errno, "Failed to rewind serialization fd: %m"); + r = finish_serialization_file(f); + if (r < 0) + return log_error_errno(r, "Failed to finish serialization file: %m"); r = fd_cloexec(fileno(f), false); if (r < 0) diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c index 3f624619df..e8bc3d58b7 100644 --- a/src/core/manager-serialize.c +++ b/src/core/manager-serialize.c @@ -186,10 +186,6 @@ int manager_serialize( return r; } - r = fflush_and_check(f); - if (r < 0) - return log_error_errno(r, "Failed to flush serialization: %m"); - r = bus_fdset_add_all(m, fds); if (r < 0) return log_error_errno(r, "Failed to add bus sockets to serialization: %m"); diff --git a/src/core/manager.c b/src/core/manager.c index f21a4f7ceb..343bc83a77 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -80,6 +80,7 @@ #include "rlimit-util.h" #include "rm-rf.h" #include "selinux-util.h" +#include "serialize.h" #include "signal-util.h" #include "socket-util.h" #include "special.h" @@ -3762,8 +3763,9 @@ int manager_reload(Manager *m) { if (r < 0) return r; - if (fseeko(f, 0, SEEK_SET) < 0) - return log_error_errno(errno, "Failed to seek to beginning of serialization: %m"); + r = finish_serialization_file(f); + if (r < 0) + return log_error_errno(r, "Failed to finish serialization: %m"); /* 💀 This is the point of no return, from here on there is no way back. 💀 */ reloading = NULL; diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c index fbd66acd9a..c673e344ee 100644 --- a/src/shared/exec-util.c +++ b/src/shared/exec-util.c @@ -199,8 +199,9 @@ static int do_execute( } if (callbacks) { - if (lseek(fd, 0, SEEK_SET) < 0) - return log_error_errno(errno, "Failed to seek on serialization fd: %m"); + r = finish_serialization_fd(fd); + if (r < 0) + return log_error_errno(r, "Failed to finish serialization fd: %m"); r = callbacks[STDOUT_GENERATE](TAKE_FD(fd), callback_args[STDOUT_GENERATE]); if (r < 0) @@ -290,12 +291,14 @@ int execute_strv( if (!callbacks) return 0; - if (lseek(fd, 0, SEEK_SET) < 0) - return log_error_errno(errno, "Failed to rewind serialization fd: %m"); + r = finish_serialization_fd(fd); + if (r < 0) + return log_error_errno(r, "Failed to finish serialization fd: %m"); r = callbacks[STDOUT_CONSUME](TAKE_FD(fd), callback_args[STDOUT_CONSUME]); if (r < 0) return log_error_errno(r, "Failed to parse returned data: %m"); + return 0; } diff --git a/src/shared/serialize.c b/src/shared/serialize.c index 8fce19ce90..4d9ee6cc32 100644 --- a/src/shared/serialize.c +++ b/src/shared/serialize.c @@ -549,7 +549,7 @@ void deserialize_ratelimit(RateLimit *rl, const char *name, const char *value) { int open_serialization_fd(const char *ident) { assert(ident); - int fd = memfd_new(ident); + int fd = memfd_new_full(ident, MFD_ALLOW_SEALING); if (fd < 0) return fd; @@ -572,6 +572,33 @@ int open_serialization_file(const char *ident, FILE **ret) { return -errno; *ret = TAKE_PTR(f); - return 0; } + +int finish_serialization_fd(int fd) { + assert(fd >= 0); + + if (lseek(fd, 0, SEEK_SET) < 0) + return -errno; + + return memfd_set_sealed(fd); +} + +int finish_serialization_file(FILE *f) { + int r; + + assert(f); + + r = fflush_and_check(f); + if (r < 0) + return r; + + if (fseeko(f, 0, SEEK_SET) < 0) + return -errno; + + int fd = fileno(f); + if (fd < 0) + return -EBADF; + + return memfd_set_sealed(fd); +} diff --git a/src/shared/serialize.h b/src/shared/serialize.h index 355eff9b8f..dc753465c4 100644 --- a/src/shared/serialize.h +++ b/src/shared/serialize.h @@ -51,3 +51,6 @@ void deserialize_ratelimit(RateLimit *rl, const char *name, const char *value); int open_serialization_fd(const char *ident); int open_serialization_file(const char *ident, FILE **ret); + +int finish_serialization_fd(int fd); +int finish_serialization_file(FILE *f); diff --git a/src/test/test-fd-util.c b/src/test/test-fd-util.c index a0d0bc731a..2ebc776f47 100644 --- a/src/test/test-fd-util.c +++ b/src/test/test-fd-util.c @@ -127,6 +127,8 @@ TEST(open_serialization_fd) { assert_se(fd >= 0); assert_se(write(fd, "test\n", 5) == 5); + + assert_se(finish_serialization_fd(fd) >= 0); } TEST(open_serialization_file) { @@ -138,6 +140,8 @@ TEST(open_serialization_file) { assert_se(f); assert_se(fwrite("test\n", 1, 5, f) == 5); + + assert_se(finish_serialization_file(f) >= 0); } TEST(fd_move_above_stdio) { From d54bbc4cdccf0af5efe539844268483143c24ec3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Dec 2024 11:32:07 +0100 Subject: [PATCH 10/14] memfd-util: trivial modernizations --- src/basic/memfd-util.c | 22 +++++++++++----------- src/basic/memfd-util.h | 10 +++++----- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/basic/memfd-util.c b/src/basic/memfd-util.c index 3bcb5c9582..2d710267d0 100644 --- a/src/basic/memfd-util.c +++ b/src/basic/memfd-util.c @@ -75,13 +75,13 @@ int memfd_new_full(const char *name, unsigned extra_flags) { MFD_CLOEXEC | MFD_NOEXEC_SEAL | extra_flags); } -int memfd_add_seals(int fd, unsigned int seals) { +int memfd_add_seals(int fd, unsigned seals) { assert(fd >= 0); return RET_NERRNO(fcntl(fd, F_ADD_SEALS, seals)); } -int memfd_get_seals(int fd, unsigned int *ret_seals) { +int memfd_get_seals(int fd, unsigned *ret_seals) { int r; assert(fd >= 0); @@ -95,14 +95,14 @@ int memfd_get_seals(int fd, unsigned int *ret_seals) { return 0; } -int memfd_map(int fd, uint64_t offset, size_t size, void **p) { +int memfd_map(int fd, uint64_t offset, size_t size, void **ret) { unsigned int seals; void *q; int r; assert(fd >= 0); assert(size > 0); - assert(p); + assert(ret); r = memfd_get_seals(fd, &seals); if (r < 0) @@ -115,7 +115,7 @@ int memfd_map(int fd, uint64_t offset, size_t size, void **p) { if (q == MAP_FAILED) return -errno; - *p = q; + *ret = q; return 0; } @@ -135,16 +135,16 @@ int memfd_get_sealed(int fd) { return FLAGS_SET(seals, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE); } -int memfd_get_size(int fd, uint64_t *sz) { +int memfd_get_size(int fd, uint64_t *ret) { struct stat stat; assert(fd >= 0); - assert(sz); + assert(ret); if (fstat(fd, &stat) < 0) return -errno; - *sz = stat.st_size; + *ret = stat.st_size; return 0; } @@ -154,12 +154,12 @@ int memfd_set_size(int fd, uint64_t sz) { return RET_NERRNO(ftruncate(fd, sz)); } -int memfd_new_and_map(const char *name, size_t sz, void **p) { +int memfd_new_and_map(const char *name, size_t sz, void **ret) { _cleanup_close_ int fd = -EBADF; int r; assert(sz > 0); - assert(p); + assert(ret); fd = memfd_new(name); if (fd < 0) @@ -169,7 +169,7 @@ int memfd_new_and_map(const char *name, size_t sz, void **p) { if (r < 0) return r; - r = memfd_map(fd, 0, sz, p); + r = memfd_map(fd, 0, sz, ret); if (r < 0) return r; diff --git a/src/basic/memfd-util.h b/src/basic/memfd-util.h index 06af55c1b0..92d8b57914 100644 --- a/src/basic/memfd-util.h +++ b/src/basic/memfd-util.h @@ -13,19 +13,19 @@ static inline int memfd_new(const char *name) { return memfd_new_full(name, 0); } -int memfd_new_and_map(const char *name, size_t sz, void **p); +int memfd_new_and_map(const char *name, size_t sz, void **ret); int memfd_new_and_seal(const char *name, const void *data, size_t sz); static inline int memfd_new_and_seal_string(const char *name, const char *s) { return memfd_new_and_seal(name, s, SIZE_MAX); } -int memfd_add_seals(int fd, unsigned int seals); -int memfd_get_seals(int fd, unsigned int *ret_seals); -int memfd_map(int fd, uint64_t offset, size_t size, void **p); +int memfd_add_seals(int fd, unsigned seals); +int memfd_get_seals(int fd, unsigned *ret_seals); +int memfd_map(int fd, uint64_t offset, size_t size, void **ret); int memfd_set_sealed(int fd); int memfd_get_sealed(int fd); -int memfd_get_size(int fd, uint64_t *sz); +int memfd_get_size(int fd, uint64_t *ret); int memfd_set_size(int fd, uint64_t sz); From 65d9ef40f222588fcaf55e2932f45b0d4bdaf194 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Dec 2024 11:48:19 +0100 Subject: [PATCH 11/14] pid1: drop check that ensures /run/ has plenty space before reexec/reload Now that we only support serialization into a memfd (rather than a file in /run/) there's no point to check the free space in /run/. Let's drop it. One error scenario gone. Yay. --- src/core/dbus-manager.c | 87 ----------------------- src/core/manager.c | 6 -- src/libsystemd/sd-bus/bus-common-errors.c | 1 - src/libsystemd/sd-bus/bus-common-errors.h | 1 - 4 files changed, 95 deletions(-) diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 4e9ea8ac27..3c66d69893 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -48,10 +48,6 @@ #include "virt.h" #include "watchdog.h" -/* Require 16MiB free in /run/systemd for reloading/reexecing. After all we need to serialize our state - * there, and if we can't we'll fail badly. */ -#define RELOAD_DISK_SPACE_MIN (UINT64_C(16) * UINT64_C(1024) * UINT64_C(1024)) - static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) { return (runtime ? UNIT_FILE_RUNTIME : 0) | (force ? UNIT_FILE_FORCE : 0); @@ -1485,73 +1481,6 @@ static int method_refuse_snapshot(sd_bus_message *message, void *userdata, sd_bu return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Support for snapshots has been removed."); } -static int get_run_space(uint64_t *ret, sd_bus_error *error) { - struct statvfs svfs; - - assert(ret); - - if (statvfs("/run/systemd", &svfs) < 0) - return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m"); - - *ret = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize; - return 0; -} - -static int verify_run_space(const char *message, sd_bus_error *error) { - uint64_t available = 0; /* unnecessary, but used to trick out gcc's incorrect maybe-uninitialized warning */ - int r; - - assert(message); - - r = get_run_space(&available, error); - if (r < 0) - return r; - - if (available < RELOAD_DISK_SPACE_MIN) - return sd_bus_error_setf(error, - BUS_ERROR_DISK_FULL, - "%s, not enough space available on /run/systemd/. " - "Currently, %s are free, but a safety buffer of %s is enforced.", - message, - FORMAT_BYTES(available), - FORMAT_BYTES(RELOAD_DISK_SPACE_MIN)); - - return 0; -} - -int verify_run_space_and_log(const char *message) { - _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; - int r; - - assert(message); - - r = verify_run_space(message, &error); - if (r < 0) - return log_error_errno(r, "%s", bus_error_message(&error, r)); - - return 0; -} - -static int verify_run_space_permissive(const char *message, sd_bus_error *error) { - uint64_t available = 0; /* unnecessary, but used to trick out gcc's incorrect maybe-uninitialized warning */ - int r; - - assert(message); - - r = get_run_space(&available, error); - if (r < 0) - return r; - - if (available < RELOAD_DISK_SPACE_MIN) - log_warning("Dangerously low amount of free space on /run/systemd/, %s.\n" - "Currently, %s are free, but %s are suggested. Proceeding anyway.", - message, - FORMAT_BYTES(available), - FORMAT_BYTES(RELOAD_DISK_SPACE_MIN)); - - return 0; -} - static void log_caller(sd_bus_message *message, Manager *manager, const char *method) { _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; @@ -1585,10 +1514,6 @@ static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error * assert(message); - r = verify_run_space("Refusing to reload", error); - if (r < 0) - return r; - r = mac_selinux_access_check(message, "reload", error); if (r < 0) return r; @@ -1631,10 +1556,6 @@ static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_erro assert(message); - r = verify_run_space("Refusing to reexecute", error); - if (r < 0) - return r; - r = mac_selinux_access_check(message, "reload", error); if (r < 0) return r; @@ -1718,10 +1639,6 @@ static int method_soft_reboot(sd_bus_message *message, void *userdata, sd_bus_er return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Soft reboot is only supported by system manager."); - r = verify_run_space_permissive("soft reboot may fail", error); - if (r < 0) - return r; - r = mac_selinux_access_check(message, "reboot", error); if (r < 0) return r; @@ -1826,10 +1743,6 @@ static int method_switch_root(sd_bus_message *message, void *userdata, sd_bus_er return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Root switching is only supported by system manager."); - r = verify_run_space_permissive("root switching may fail", error); - if (r < 0) - return r; - r = mac_selinux_access_check(message, "reboot", error); if (r < 0) return r; diff --git a/src/core/manager.c b/src/core/manager.c index 343bc83a77..e75c760b6f 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -3124,9 +3124,6 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t case SIGTERM: if (MANAGER_IS_SYSTEM(m)) { /* This is for compatibility with the original sysvinit */ - if (verify_run_space_and_log("Refusing to reexecute") < 0) - break; - m->objective = MANAGER_REEXECUTE; break; } @@ -3180,9 +3177,6 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t } case SIGHUP: - if (verify_run_space_and_log("Refusing to reload") < 0) - break; - m->objective = MANAGER_RELOAD; break; diff --git a/src/libsystemd/sd-bus/bus-common-errors.c b/src/libsystemd/sd-bus/bus-common-errors.c index 895626c872..cb5c1b74d5 100644 --- a/src/libsystemd/sd-bus/bus-common-errors.c +++ b/src/libsystemd/sd-bus/bus-common-errors.c @@ -32,7 +32,6 @@ BUS_ERROR_MAP_ELF_REGISTER const sd_bus_error_map bus_common_errors[] = { SD_BUS_ERROR_MAP(BUS_ERROR_SCOPE_NOT_RUNNING, EHOSTDOWN), SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_DYNAMIC_USER, ESRCH), SD_BUS_ERROR_MAP(BUS_ERROR_NOT_REFERENCED, EUNATCH), - SD_BUS_ERROR_MAP(BUS_ERROR_DISK_FULL, ENOSPC), SD_BUS_ERROR_MAP(BUS_ERROR_FILE_DESCRIPTOR_STORE_DISABLED, EHOSTDOWN), SD_BUS_ERROR_MAP(BUS_ERROR_FROZEN_BY_PARENT, EDEADLK), diff --git a/src/libsystemd/sd-bus/bus-common-errors.h b/src/libsystemd/sd-bus/bus-common-errors.h index 138d8a171e..edc49027b6 100644 --- a/src/libsystemd/sd-bus/bus-common-errors.h +++ b/src/libsystemd/sd-bus/bus-common-errors.h @@ -28,7 +28,6 @@ #define BUS_ERROR_SCOPE_NOT_RUNNING "org.freedesktop.systemd1.ScopeNotRunning" #define BUS_ERROR_NO_SUCH_DYNAMIC_USER "org.freedesktop.systemd1.NoSuchDynamicUser" #define BUS_ERROR_NOT_REFERENCED "org.freedesktop.systemd1.NotReferenced" -#define BUS_ERROR_DISK_FULL "org.freedesktop.systemd1.DiskFull" #define BUS_ERROR_NOTHING_TO_CLEAN "org.freedesktop.systemd1.NothingToClean" #define BUS_ERROR_UNIT_BUSY "org.freedesktop.systemd1.UnitBusy" #define BUS_ERROR_UNIT_INACTIVE "org.freedesktop.systemd1.UnitInactive" From 4dac692094029b81156bf3e0bf80ddb7cde4e98c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Dec 2024 12:18:17 +0100 Subject: [PATCH 12/14] fuzz-journal-remote: use memfd_new_and_seal() where appropriate This means we can drop memfd_new_and_map() and results in generally shorter code. --- src/basic/memfd-util.c | 46 ------------------------ src/basic/memfd-util.h | 3 -- src/journal-remote/fuzz-journal-remote.c | 6 +--- 3 files changed, 1 insertion(+), 54 deletions(-) diff --git a/src/basic/memfd-util.c b/src/basic/memfd-util.c index 2d710267d0..9d3983b181 100644 --- a/src/basic/memfd-util.c +++ b/src/basic/memfd-util.c @@ -95,30 +95,6 @@ int memfd_get_seals(int fd, unsigned *ret_seals) { return 0; } -int memfd_map(int fd, uint64_t offset, size_t size, void **ret) { - unsigned int seals; - void *q; - int r; - - assert(fd >= 0); - assert(size > 0); - assert(ret); - - r = memfd_get_seals(fd, &seals); - if (r < 0) - return r; - - if (seals & F_SEAL_WRITE) - q = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, offset); - else - q = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset); - if (q == MAP_FAILED) - return -errno; - - *ret = q; - return 0; -} - int memfd_set_sealed(int fd) { return memfd_add_seals(fd, F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE); } @@ -154,28 +130,6 @@ int memfd_set_size(int fd, uint64_t sz) { return RET_NERRNO(ftruncate(fd, sz)); } -int memfd_new_and_map(const char *name, size_t sz, void **ret) { - _cleanup_close_ int fd = -EBADF; - int r; - - assert(sz > 0); - assert(ret); - - fd = memfd_new(name); - if (fd < 0) - return fd; - - r = memfd_set_size(fd, sz); - if (r < 0) - return r; - - r = memfd_map(fd, 0, sz, ret); - if (r < 0) - return r; - - return TAKE_FD(fd); -} - int memfd_new_and_seal(const char *name, const void *data, size_t sz) { _cleanup_close_ int fd = -EBADF; int r; diff --git a/src/basic/memfd-util.h b/src/basic/memfd-util.h index 92d8b57914..077ca6f67b 100644 --- a/src/basic/memfd-util.h +++ b/src/basic/memfd-util.h @@ -13,8 +13,6 @@ static inline int memfd_new(const char *name) { return memfd_new_full(name, 0); } -int memfd_new_and_map(const char *name, size_t sz, void **ret); - int memfd_new_and_seal(const char *name, const void *data, size_t sz); static inline int memfd_new_and_seal_string(const char *name, const char *s) { return memfd_new_and_seal(name, s, SIZE_MAX); @@ -22,7 +20,6 @@ static inline int memfd_new_and_seal_string(const char *name, const char *s) { int memfd_add_seals(int fd, unsigned seals); int memfd_get_seals(int fd, unsigned *ret_seals); -int memfd_map(int fd, uint64_t offset, size_t size, void **ret); int memfd_set_sealed(int fd); int memfd_get_sealed(int fd); diff --git a/src/journal-remote/fuzz-journal-remote.c b/src/journal-remote/fuzz-journal-remote.c index 774389dee3..3e874f19b7 100644 --- a/src/journal-remote/fuzz-journal-remote.c +++ b/src/journal-remote/fuzz-journal-remote.c @@ -24,7 +24,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { _cleanup_(unlink_and_freep) char *name = NULL; _cleanup_(sd_journal_closep) sd_journal *j = NULL; _cleanup_(journal_remote_server_destroy) RemoteServer s = {}; - void *mem; int fdin, r; if (outside_size_range(size, 3, 65536)) @@ -35,13 +34,10 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { assert_se(mkdtemp_malloc("/tmp/fuzz-journal-remote-XXXXXX", &tmp) >= 0); assert_se(name = path_join(tmp, "fuzz-journal-remote.XXXXXX.journal")); - fdin = fdin_close = memfd_new_and_map("fuzz-journal-remote", size, &mem); + fdin = fdin_close = memfd_new_and_seal("fuzz-journal-remote", data, size); if (fdin < 0) return log_error_errno(fdin, "memfd_new_and_map() failed: %m"); - memcpy(mem, data, size); - assert_se(munmap(mem, size) == 0); - fdout = mkostemps(name, STRLEN(".journal"), O_CLOEXEC); if (fdout < 0) return log_error_errno(errno, "mkostemps() failed: %m"); From 6db5a6e799db9d95185cd0c2a327109e80114aea Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Dec 2024 19:08:16 +0100 Subject: [PATCH 13/14] doc: document new baseline requires memfd_create() --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index c6cc8f2b8f..a0818bff2b 100644 --- a/README +++ b/README @@ -43,7 +43,7 @@ REQUIREMENTS: ⛔ Kernel versions below 4.3 ("minimum baseline") are not supported at all, and are missing required functionality (e.g. CLOCK_BOOTTIME - support for timerfd_create() or ambient capabilities). + support for timerfd_create(), ambient capabilities, or memfd_create()). ⚠️ Kernel versions below 5.4 ("recommended baseline") have significant gaps in functionality and are not recommended for use with this version From 3ee8082947b2781eecc08b7b38df927458121fc8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Dec 2024 19:52:33 +0100 Subject: [PATCH 14/14] update TODO --- TODO | 3 --- 1 file changed, 3 deletions(-) diff --git a/TODO b/TODO index b160e2f8b9..91127eff94 100644 --- a/TODO +++ b/TODO @@ -101,9 +101,6 @@ Deprecations and removals: and then rework cgroupsv2 support around fds, i.e. keep one fd per active unit around, and always operate on that, instead of cgroup fs paths. -* drop support for kernels lacking memfd_create() (i.e. make 3.17 new - baseline), then drop all pipe() based fallbacks. - * drop support for getrandom()-less kernels. (GRND_INSECURE means once kernel 5.6 becomes our baseline). See https://github.com/systemd/systemd/pull/24101#issuecomment-1193966468 for