From 66130f0a55c9b8e0cda869a21674749dcb70e83a Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Thu, 23 Mar 2023 13:48:42 +0100 Subject: [PATCH 1/8] namespace: Load sidecar verity settings in apply_mount_namespace() Let's reduce the argument count of setup_namespace() a bit by loading the sidecar verity settings in apply_mount_namespace(). This will also make it possible to pass file descriptors to the root image/directory into setup_namespace() as before this wasn't possible because the verity settings logic looks for sidecar files next to the root image which requires the path to be available. --- src/core/execute.c | 75 ++++++++++++++++++++++++++++++++--- src/core/namespace.c | 82 +++------------------------------------ src/core/namespace.h | 8 +--- src/test/test-namespace.c | 6 --- src/test/test-ns.c | 6 --- 5 files changed, 76 insertions(+), 101 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index e46875f5b0..204c5a1f8c 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3823,6 +3823,62 @@ static bool insist_on_sandboxing( return false; } +static int verity_settings_prepare( + VeritySettings *verity, + const char *root_image, + const void *root_hash, + size_t root_hash_size, + const char *root_hash_path, + const void *root_hash_sig, + size_t root_hash_sig_size, + const char *root_hash_sig_path, + const char *verity_data_path) { + + int r; + + assert(verity); + + if (root_hash) { + void *d; + + d = memdup(root_hash, root_hash_size); + if (!d) + return -ENOMEM; + + free_and_replace(verity->root_hash, d); + verity->root_hash_size = root_hash_size; + verity->designator = PARTITION_ROOT; + } + + if (root_hash_sig) { + void *d; + + d = memdup(root_hash_sig, root_hash_sig_size); + if (!d) + return -ENOMEM; + + free_and_replace(verity->root_hash_sig, d); + verity->root_hash_sig_size = root_hash_sig_size; + verity->designator = PARTITION_ROOT; + } + + if (verity_data_path) { + r = free_and_strdup(&verity->data_path, verity_data_path); + if 
(r < 0) + return r; + } + + r = verity_settings_load( + verity, + root_image, + root_hash_path, + root_hash_sig_path); + if (r < 0) + return log_debug_errno(r, "Failed to load root hash: %m"); + + return 0; +} + static int apply_mount_namespace( const Unit *u, ExecCommandFlags command_flags, @@ -3832,12 +3888,12 @@ static int apply_mount_namespace( const char *memory_pressure_path, char **error_path) { + _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL, **read_write_paths_cleanup = NULL; - const char *tmp_dir = NULL, *var_tmp_dir = NULL; - const char *root_dir = NULL, *root_image = NULL; _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL, *extension_dir = NULL; + const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL; char **read_write_paths; NamespaceInfo ns_info; bool needs_sandboxing; @@ -3956,6 +4012,17 @@ static int apply_mount_namespace( if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) return -ENOMEM; + if (root_image) { + r = verity_settings_prepare( + &verity, + root_image, + context->root_hash, context->root_hash_size, context->root_hash_path, + context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path, + context->root_verity); + if (r < 0) + return r; + } + r = setup_namespace( root_dir, root_image, @@ -3981,9 +4048,7 @@ static int apply_mount_namespace( creds_path, context->log_namespace, context->mount_propagation_flag, - context->root_hash, context->root_hash_size, context->root_hash_path, - context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path, - context->root_verity, + &verity, context->extension_images, context->n_extension_images, context->extension_image_policy ?: &image_policy_sysext, diff --git a/src/core/namespace.c b/src/core/namespace.c index fbcc4505b5..bf01c5e028 100644 --- 
a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1967,62 +1967,6 @@ static bool home_read_only( return false; } -static int verity_settings_prepare( - VeritySettings *verity, - const char *root_image, - const void *root_hash, - size_t root_hash_size, - const char *root_hash_path, - const void *root_hash_sig, - size_t root_hash_sig_size, - const char *root_hash_sig_path, - const char *verity_data_path) { - - int r; - - assert(verity); - - if (root_hash) { - void *d; - - d = memdup(root_hash, root_hash_size); - if (!d) - return -ENOMEM; - - free_and_replace(verity->root_hash, d); - verity->root_hash_size = root_hash_size; - verity->designator = PARTITION_ROOT; - } - - if (root_hash_sig) { - void *d; - - d = memdup(root_hash_sig, root_hash_sig_size); - if (!d) - return -ENOMEM; - - free_and_replace(verity->root_hash_sig, d); - verity->root_hash_sig_size = root_hash_sig_size; - verity->designator = PARTITION_ROOT; - } - - if (verity_data_path) { - r = free_and_strdup(&verity->data_path, verity_data_path); - if (r < 0) - return r; - } - - r = verity_settings_load( - verity, - root_image, - root_hash_path, - root_hash_sig_path); - if (r < 0) - return log_debug_errno(r, "Failed to load root hash: %m"); - - return 0; -} - int setup_namespace( const char* root_directory, const char* root_image, @@ -2048,13 +1992,7 @@ int setup_namespace( const char *creds_path, const char *log_namespace, unsigned long mount_propagation_flag, - const void *root_hash, - size_t root_hash_size, - const char *root_hash_path, - const void *root_hash_sig, - size_t root_hash_sig_size, - const char *root_hash_sig_path, - const char *verity_data_path, + VeritySettings *verity, const MountImage *extension_images, size_t n_extension_images, const ImagePolicy *extension_image_policy, @@ -2067,7 +2005,6 @@ int setup_namespace( _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; - _cleanup_(verity_settings_done) 
VeritySettings verity = VERITY_SETTINGS_DEFAULT; _cleanup_strv_free_ char **hierarchies = NULL; MountEntry *m = NULL, *mounts = NULL; bool require_prefix = false, setup_propagate = false; @@ -2107,16 +2044,7 @@ int setup_namespace( strv_isempty(read_write_paths)) dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; - r = verity_settings_prepare( - &verity, - root_image, - root_hash, root_hash_size, root_hash_path, - root_hash_sig, root_hash_sig_size, root_hash_sig_path, - verity_data_path); - if (r < 0) - return r; - - SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path); + SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity && verity->data_path); r = loop_device_make_by_path( root_image, @@ -2130,7 +2058,7 @@ int setup_namespace( r = dissect_loop_device( loop_device, - &verity, + verity, root_image_mount_options, root_image_policy, dissect_image_flags, @@ -2141,14 +2069,14 @@ int setup_namespace( r = dissected_image_load_verity_sig_partition( dissected_image, loop_device->fd, - &verity); + verity); if (r < 0) return r; r = dissected_image_decrypt( dissected_image, NULL, - &verity, + verity, dissect_image_flags); if (r < 0) return log_debug_errno(r, "Failed to decrypt dissected image: %m"); diff --git a/src/core/namespace.h b/src/core/namespace.h index 39b510f41d..4ddd6a7d58 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -124,13 +124,7 @@ int setup_namespace( const char *creds_path, const char *log_namespace, unsigned long mount_propagation_flag, - const void *root_hash, - size_t root_hash_size, - const char *root_hash_path, - const void *root_hash_sig, - size_t root_hash_sig_size, - const char *root_hash_sig_path, - const char *root_verity, + VeritySettings *verity, const MountImage *extension_images, size_t n_extension_images, const ImagePolicy *extension_image_policy, diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c index 82be09dd6a..b6ee628533 100644 --- a/src/test/test-namespace.c 
+++ b/src/test/test-namespace.c @@ -197,12 +197,6 @@ TEST(protect_kernel_logs) { NULL, 0, NULL, - 0, - NULL, - NULL, - 0, - NULL, - NULL, NULL, 0, NULL, diff --git a/src/test/test-ns.c b/src/test/test-ns.c index 485069670b..3a3af3584d 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -99,12 +99,6 @@ int main(int argc, char *argv[]) { NULL, 0, NULL, - 0, - NULL, - NULL, - 0, - NULL, - NULL, NULL, 0, NULL, From d31f8e0cb81e2fc721d991df911b50d1660b4bf7 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 6 Jun 2023 16:54:24 +0200 Subject: [PATCH 2/8] dissect-image: Log if verity signature partition is too large --- src/shared/dissect-image.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index cf432e2177..a1e4d333c8 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -3196,7 +3196,7 @@ int dissected_image_load_verity_sig_partition( return -EINVAL; if (p->size > 4*1024*1024) /* Signature data cannot possible be larger than 4M, refuse that */ - return -EFBIG; + return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), "Verity signature partition is larger than 4M, refusing."); buf = new(char, p->size+1); if (!buf) From b572e8da4142fe23c67e28f1d6a232e14effba6e Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 6 Jun 2023 16:55:14 +0200 Subject: [PATCH 3/8] tmpfiles: Fix BSD lock logging messages --- src/tmpfiles/tmpfiles.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tmpfiles/tmpfiles.c b/src/tmpfiles/tmpfiles.c index 43e1285649..9469bbb4a3 100644 --- a/src/tmpfiles/tmpfiles.c +++ b/src/tmpfiles/tmpfiles.c @@ -720,7 +720,7 @@ static int dir_cleanup( } if (flock(dirfd(sub_dir), LOCK_EX|LOCK_NB) < 0) { - log_debug_errno(errno, "Couldn't acquire shared BSD lock on directory \"%s\", skipping: %m", p); + log_debug_errno(errno, "Couldn't acquire shared BSD lock on directory \"%s\", skipping: %m", sub_path); continue; } @@ -808,7 +808,7 @@ 
static int dir_cleanup( if (fd < 0 && fd != -ENOENT) log_warning_errno(fd, "Opening file \"%s\" failed, ignoring: %m", sub_path); if (fd >= 0 && flock(fd, LOCK_EX|LOCK_NB) < 0 && errno == EAGAIN) { - log_debug_errno(errno, "Couldn't acquire shared BSD lock on file \"%s\", skipping: %m", p); + log_debug_errno(errno, "Couldn't acquire shared BSD lock on file \"%s\", skipping: %m", sub_path); continue; } From 08c25eee3477aab83ac8f4d3237bc628c3ba2615 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 6 Jun 2023 17:23:10 +0200 Subject: [PATCH 4/8] tmpfiles: Don't log about harmless errors when trying to lock file Let's make sure we don't log if the file is a symlink or does not exist. --- src/tmpfiles/tmpfiles.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tmpfiles/tmpfiles.c b/src/tmpfiles/tmpfiles.c index 9469bbb4a3..eabac56320 100644 --- a/src/tmpfiles/tmpfiles.c +++ b/src/tmpfiles/tmpfiles.c @@ -510,7 +510,9 @@ static DIR* xopendirat_nomod(int dirfd, const char *path) { if (dir) return dir; - log_debug_errno(errno, "Cannot open %sdirectory \"%s\": %m", dirfd == AT_FDCWD ? "" : "sub", path); + if (!IN_SET(errno, ENOENT, ELOOP)) + log_debug_errno(errno, "Cannot open %sdirectory \"%s\": %m", dirfd == AT_FDCWD ? 
"" : "sub", path); + if (errno != EPERM) return NULL; @@ -805,7 +807,7 @@ static int dir_cleanup( O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME, /* xopen_flags = */ 0, /* mode = */ 0); - if (fd < 0 && fd != -ENOENT) + if (fd < 0 && !IN_SET(fd, -ENOENT, -ELOOP)) log_warning_errno(fd, "Opening file \"%s\" failed, ignoring: %m", sub_path); if (fd >= 0 && flock(fd, LOCK_EX|LOCK_NB) < 0 && errno == EAGAIN) { log_debug_errno(errno, "Couldn't acquire shared BSD lock on file \"%s\", skipping: %m", sub_path); From dbc3cc8b832a7648f68b69e1322563d64cffd9c4 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 20 Jun 2023 12:59:24 +0200 Subject: [PATCH 5/8] tmpfiles: Add note to man page about guaranteed cleanup for files/directories --- man/tmpfiles.d.xml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/man/tmpfiles.d.xml b/man/tmpfiles.d.xml index ef0bb1f7f8..495315d55c 100644 --- a/man/tmpfiles.d.xml +++ b/man/tmpfiles.d.xml @@ -661,6 +661,14 @@ d /tmp/foo/bar - - - bmA:1h - Applications may use this to temporarily exclude certain directory subtrees from the aging algorithm: the applications can take a BSD file lock themselves, and as long as they keep it aging of the directory/file and everything below it is disabled. + + This behavior can be used to ensure guaranteed cleanup of files or directories whose lifetime + should be aligned with the process that created them by having that process create them in a location + monitored by systemd-tmpfiles with an age of 0, and having the + process immediately lock the directory or file before using it. Because the BSD lock is process + specific, the file is guaranteed to be unlocked as soon as the process exits, meaning that even if the + process crashes, those files and directories will be unlocked and cleaned up by + systemd-tmpfiles. 
From 9c0c670125ff065c22c12bb82f1f20c7b2c8c46d Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 6 Jun 2023 17:44:09 +0200 Subject: [PATCH 6/8] core: Add RootEphemeral= setting This setting allows services to run in an ephemeral copy of the root directory or root image. To make sure the ephemeral copies are always cleaned up, we add a tmpfiles snippet to unconditionally clean up /var/lib/systemd/ephemeral-trees. To prevent in-use ephemeral copies from being cleaned up by tmpfiles, we use the newly added COPY_LOCK_BSD and BTRFS_SNAPSHOT_LOCK_BSD flags to take a BSD lock on the ephemeral copies which instructs tmpfiles to not touch those ephemeral copies as long as the BSD lock is held. --- man/org.freedesktop.systemd1.xml | 24 +++ man/systemd.exec.xml | 20 ++ src/core/dbus-execute.c | 4 + src/core/execute.c | 174 ++++++++++++++---- src/core/execute.h | 11 +- src/core/load-fragment-gperf.gperf.in | 1 + src/core/service.c | 2 +- src/core/unit.c | 2 +- src/shared/bus-unit-util.c | 3 +- .../fuzz-unit-file/directives-all.service | 1 + tmpfiles.d/systemd.conf.in | 4 + 11 files changed, 208 insertions(+), 38 deletions(-) diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index b50ddb95e9..560ae252e3 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2950,6 +2950,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootVerity = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b RootEphemeral = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -3547,6 +3549,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4189,6 +4193,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4972,6
+4978,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootVerity = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b RootEphemeral = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -5581,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6203,6 +6213,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6861,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootVerity = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b RootEphemeral = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -7398,6 +7412,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -7938,6 +7954,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8723,6 +8741,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootVerity = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b RootEphemeral = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -9246,6 +9266,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -9772,6 +9794,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 938a3c87a9..84eda5c584 
100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -200,6 +200,26 @@ + + RootEphemeral= + + Takes a boolean argument. If enabled, executed processes will run in an ephemeral + copy of the root directory or root image. The ephemeral copy is placed in + /var/lib/systemd/ephemeral-trees/ while the service is active and is cleaned up + when the service is stopped or restarted. If RootDirectory= is used and the root + directory is a subvolume, the ephemeral copy will be created by making a snapshot of the subvolume. + + + To make sure ephemeral copies can be made efficiently, the root directory or root image + should be located on the same filesystem as /var/lib/systemd/ephemeral-trees/. + When using RootEphemeral= with root directories, btrfs should be used as the + filesystem and the root directory should ideally be a subvolume which systemd can + snapshot to make the ephemeral copy. For root images, a filesystem with support for reflinks should + be used to ensure an efficient ephemeral copy.
+ + + + RootHash= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 04070a7f1e..80a035ab90 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -1231,6 +1231,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), @@ -1865,6 +1866,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "RootDirectory")) return bus_set_transient_path(u, name, &c->root_directory, message, flags, error); + if (streq(name, "RootEphemeral")) + return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error); + if (streq(name, "SyslogIdentifier")) return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error); diff --git a/src/core/execute.c b/src/core/execute.c index 204c5a1f8c..b7fe922c7a 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -15,6 +15,8 @@ #include #include +#include /* Must be included after */ + #if HAVE_PAM #include #endif @@ -43,8 +45,10 @@ #include "async.h" #include "barrier.h" #include "bpf-lsm.h" +#include "btrfs-util.h" #include "cap-list.h" #include "capability-util.h" +#include "chattr-util.h" #include "cgroup-setup.h" #include "chase.h" #include "chown-recursive.h" @@ 
-66,6 +70,7 @@ #include "io-util.h" #include "ioprio-util.h" #include "label-util.h" +#include "lock-util.h" #include "log.h" #include "macro.h" #include "manager.h" @@ -2170,6 +2175,10 @@ bool exec_needs_network_namespace(const ExecContext *context) { return context->private_network || context->network_namespace_path; } +static bool exec_needs_ephemeral(const ExecContext *context) { + return (context->root_image || context->root_directory) && context->root_ephemeral; +} + static bool exec_needs_ipc_namespace(const ExecContext *context) { assert(context); @@ -3823,6 +3832,63 @@ static bool insist_on_sandboxing( return false; } +static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) { + _cleanup_close_ int fd = -EBADF; + int r; + + if (!runtime || !runtime->ephemeral_copy) + return 0; + + r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX); + if (r < 0) + return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m"); + + CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]); + + fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT); + if (fd >= 0) + /* We got an fd! That means ephemeral has already been set up, so nothing to do here. 
*/ + return 0; + + if (fd != -EAGAIN) + return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m"); + + log_debug("Making ephemeral snapshot of %s to %s", + context->root_image ?: context->root_directory, runtime->ephemeral_copy); + + if (context->root_image) + fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600, + COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME); + else + fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory, + AT_FDCWD, runtime->ephemeral_copy, + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_LOCK_BSD); + if (fd < 0) + return log_debug_errno(fd, "Failed to snapshot %s to %s: %m", + context->root_image ?: context->root_directory, runtime->ephemeral_copy); + + if (context->root_image) { + /* A root image might be subject to lots of random writes so let's try to disable COW on it + * which tends to not perform well in combination with lots of random writes. + * + * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed + * copy, but we at least want to make the intention clear. 
+ */ + r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + if (r < 0) + log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy); + } + + r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT); + if (r < 0) + return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m"); + + return 1; +} + static int verity_settings_prepare( VeritySettings *verity, const char *root_image, @@ -3884,7 +3950,7 @@ static int apply_mount_namespace( ExecCommandFlags command_flags, const ExecContext *context, const ExecParameters *params, - const ExecRuntime *runtime, + ExecRuntime *runtime, const char *memory_pressure_path, char **error_path) { @@ -3906,10 +3972,14 @@ static int apply_mount_namespace( CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many); if (params->flags & EXEC_APPLY_CHROOT) { - root_image = context->root_image; + r = setup_ephemeral(context, runtime); + if (r < 0) + return r; - if (!root_image) - root_dir = context->root_directory; + if (context->root_image) + root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image; + else + root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory; } r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories); @@ -4090,6 +4160,7 @@ static int apply_mount_namespace( static int apply_working_directory( const ExecContext *context, const ExecParameters *params, + ExecRuntime *runtime, const char *home, int *exit_status) { @@ -4113,7 +4184,7 @@ static int apply_working_directory( if (params->flags & EXEC_APPLY_CHROOT) d = wd; else - d = prefix_roota(context->root_directory, wd); + d = prefix_roota((runtime ?
runtime->ephemeral_copy : NULL) ?: context->root_directory, wd); if (chdir(d) < 0 && !context->working_directory_missing_ok) { *exit_status = EXIT_CHDIR; @@ -4126,6 +4197,7 @@ static int apply_working_directory( static int apply_root_directory( const ExecContext *context, const ExecParameters *params, + ExecRuntime *runtime, const bool needs_mount_ns, int *exit_status) { @@ -4134,7 +4206,7 @@ static int apply_root_directory( if (params->flags & EXEC_APPLY_CHROOT) if (!needs_mount_ns && context->root_directory) - if (chroot(context->root_directory) < 0) { + if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) { *exit_status = EXIT_CHROOT; return -errno; } @@ -4271,7 +4343,7 @@ static int close_remaining_fds( const int *fds, size_t n_fds) { size_t n_dont_close = 0; - int dont_close[n_fds + 12]; + int dont_close[n_fds + 14]; assert(params); @@ -4289,6 +4361,9 @@ static int close_remaining_fds( n_dont_close += n_fds; } + if (runtime) + append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket); + if (runtime && runtime->shared) { append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket); append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket); @@ -5584,7 +5659,7 @@ static int exec_child( } /* chroot to root directory first, before we lose the ability to chroot */ - r = apply_root_directory(context, params, needs_mount_namespace, exit_status); + r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status); if (r < 0) return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m"); @@ -5610,7 +5685,7 @@ static int exec_child( /* Apply working directory here, because the working directory might be on NFS and only the user running * this service might have the correct privilege to change to the working directory */ - r = apply_working_directory(context, params, home, exit_status); + r = 
apply_working_directory(context, params, runtime, home, exit_status); if (r < 0) return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m"); @@ -6422,6 +6497,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { "%sUMask: %04o\n" "%sWorkingDirectory: %s\n" "%sRootDirectory: %s\n" + "%sRootEphemeral: %s\n" "%sNonBlocking: %s\n" "%sPrivateTmp: %s\n" "%sPrivateDevices: %s\n" @@ -6446,6 +6522,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, c->umask, prefix, empty_to_root(c->working_directory), prefix, empty_to_root(c->root_directory), + prefix, yes_no(c->root_ephemeral), prefix, yes_no(c->non_blocking), prefix, yes_no(c->private_tmp), prefix, yes_no(c->private_devices), @@ -7241,13 +7318,30 @@ int exec_command_append(ExecCommand *c, const char *path, ...) { return 0; } -static void *remove_tmpdir_thread(void *p) { +static void *rm_rf_thread(void *p) { _cleanup_free_ char *path = p; - (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); return NULL; } +static void asynchronous_rm_rf(char **path) { + int r; + + assert(path); + + if (!*path || streq(*path, RUN_SYSTEMD_EMPTY)) + return; + + log_debug("Spawning thread to nuke %s", *path); + + r = asynchronous_job(rm_rf_thread, *path); + if (r < 0) + log_warning_errno(r, "Failed to nuke %s: %m", *path); + else + *path = NULL; +} + static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { if (!rt) return NULL; @@ -7267,8 +7361,6 @@ DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_ru DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free); ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) { - int r; - if (!rt) return NULL; @@ -7278,25 +7370,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) { if (rt->n_ref > 0) return NULL; - if (rt->tmp_dir && 
!streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) { - log_debug("Spawning thread to nuke %s", rt->tmp_dir); - - r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir); - if (r < 0) - log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir); - else - rt->tmp_dir = NULL; - } - - if (rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) { - log_debug("Spawning thread to nuke %s", rt->var_tmp_dir); - - r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir); - if (r < 0) - log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir); - else - rt->var_tmp_dir = NULL; - } + asynchronous_rm_rf(&rt->tmp_dir); + asynchronous_rm_rf(&rt->var_tmp_dir); return exec_shared_runtime_free(rt); } @@ -7731,16 +7806,39 @@ void exec_shared_runtime_vacuum(Manager *m) { } } -int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret) { +int exec_runtime_make( + const Unit *unit, + const ExecContext *context, + ExecSharedRuntime *shared, + DynamicCreds *creds, + ExecRuntime **ret) { + _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF; + _cleanup_free_ char *ephemeral = NULL; _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL; + int r; + assert(unit); + assert(context); assert(ret); - if (!shared && !creds) { + if (!shared && !creds && !exec_needs_ephemeral(context)) { *ret = NULL; return 0; } + if (exec_needs_ephemeral(context)) { + r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755); + if (r < 0) + return r; + + r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0) + return -errno; + } + rt = new(ExecRuntime, 1); if (!rt) return -ENOMEM; @@ -7748,6 +7846,9 @@ int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntim *rt = (ExecRuntime) { .shared = shared, .dynamic_creds = creds, + .ephemeral_copy = TAKE_PTR(ephemeral), + .ephemeral_storage_socket[0] = 
TAKE_FD(ephemeral_storage_socket[0]), + .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]), }; *ret = TAKE_PTR(rt); @@ -7760,6 +7861,11 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) { exec_shared_runtime_unref(rt->shared); dynamic_creds_unref(rt->dynamic_creds); + + asynchronous_rm_rf(&rt->ephemeral_copy); + + free(rt->ephemeral_copy); + safe_close_pair(rt->ephemeral_storage_socket); return mfree(rt); } diff --git a/src/core/execute.h b/src/core/execute.h index 953dc9e7f7..ee73fb6367 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -129,6 +129,14 @@ struct ExecSharedRuntime { struct ExecRuntime { ExecSharedRuntime *shared; DynamicCreds *dynamic_creds; + + /* The path to the ephemeral snapshot of the root directory or root image if one was requested. */ + char *ephemeral_copy; + + /* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of + * the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot + * until we're done using it. 
*/ + int ephemeral_storage_socket[2]; }; typedef enum ExecDirectoryType { @@ -195,6 +203,7 @@ struct ExecContext { void *root_hash, *root_hash_sig; size_t root_hash_size, root_hash_sig_size; LIST_HEAD(MountOptions, root_image_options); + bool root_ephemeral; bool working_directory_missing_ok:1; bool working_directory_home:1; @@ -506,7 +515,7 @@ int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds); void exec_shared_runtime_vacuum(Manager *m); -int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret); +int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret); ExecRuntime* exec_runtime_free(ExecRuntime *rt); DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free); ExecRuntime* exec_runtime_destroy(ExecRuntime *rt); diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index ae318dae89..382b60ea90 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -10,6 +10,7 @@ {{type}}.RootHash, config_parse_exec_root_hash, 0, offsetof({{type}}, exec_context) {{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context) {{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity) +{{type}}.RootEphemeral, config_parse_bool, 0, offsetof({{type}}, exec_context.root_ephemeral) {{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories) {{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context) {{type}}.ExtensionImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.extension_image_policy) diff --git a/src/core/service.c b/src/core/service.c index cecdd3bf50..146b892e46 100644 --- 
a/src/core/service.c +++ b/src/core/service.c @@ -2029,7 +2029,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) /* Reset NotifyAccess override */ s->notify_access_override = _NOTIFY_ACCESS_INVALID; - /* We want fresh tmpdirs in case service is started again immediately */ + /* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */ s->exec_runtime = exec_runtime_destroy(s->exec_runtime); /* Also, remove the runtime directory */ diff --git a/src/core/unit.c b/src/core/unit.c index 570234c8f4..f51b5687f8 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -5012,7 +5012,7 @@ int unit_setup_exec_runtime(Unit *u) { return r; } - r = exec_runtime_make(esr, dcreds, rt); + r = exec_runtime_make(u, ec, esr, dcreds, rt); if (r < 0) return r; diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 6e93d0ca43..cc287feb8e 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -992,7 +992,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con "LockPersonality", "ProtectHostname", "MemoryKSM", - "RestrictSUIDSGID")) + "RestrictSUIDSGID", + "RootEphemeral")) return bus_append_parse_boolean(m, field, eq); if (STR_IN_SET(field, "ReadWriteDirectories", diff --git a/test/fuzz/fuzz-unit-file/directives-all.service b/test/fuzz/fuzz-unit-file/directives-all.service index 818fb28dbf..4bdc48a59b 100644 --- a/test/fuzz/fuzz-unit-file/directives-all.service +++ b/test/fuzz/fuzz-unit-file/directives-all.service @@ -222,6 +222,7 @@ RootImage= RootHash= RootHashSignature= RootVerity= +RootEphemeral= ExtensionDirectories= ExtensionImages= RuntimeMaxSec= diff --git a/tmpfiles.d/systemd.conf.in b/tmpfiles.d/systemd.conf.in index fa838d8d06..3781c579e0 100644 --- a/tmpfiles.d/systemd.conf.in +++ b/tmpfiles.d/systemd.conf.in @@ -60,6 +60,10 @@ a+ /var/log/journal/%m/system.journal - - - - group:wheel:r-- d /var/lib/systemd 0755 root root - d 
/var/lib/systemd/coredump 0755 root root 3d +# Files and directories in /var/lib/systemd/ephemeral-trees are locked by pid 1 to prevent tmpfiles from +# removing them, and tmpfiles is told to unconditionally clean up anything in +# /var/lib/systemd/ephemeral-trees that isn't locked. +d /var/lib/systemd/ephemeral-trees 0755 root root 0 d /var/lib/private 0700 root root - d /var/log/private 0700 root root - From d110169b65689156b8a9d408caede2c581f63217 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 20 Jun 2023 13:53:45 +0200 Subject: [PATCH 7/8] test: Add touch into minimal verity test image --- test/test-functions | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test-functions b/test/test-functions index 2b112fbb2a..d57594f17b 100644 --- a/test/test-functions +++ b/test/test-functions @@ -691,6 +691,7 @@ install_verity_minimal() { grep mount sleep + touch ) oldinitdir="$initdir" rm -rfv "$TESTDIR/minimal" From cdad1f14070d83c477cc120ee76b6ca7ebb4bb07 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 20 Jun 2023 13:54:07 +0200 Subject: [PATCH 8/8] test: Add RootEphemeral= integration test --- test/TEST-50-DISSECT/test.sh | 1 + test/units/testsuite-50.sh | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/test/TEST-50-DISSECT/test.sh b/test/TEST-50-DISSECT/test.sh index bcc81749af..6e5179c842 100755 --- a/test/TEST-50-DISSECT/test.sh +++ b/test/TEST-50-DISSECT/test.sh @@ -24,6 +24,7 @@ test_append_files() { if command -v openssl >/dev/null 2>&1; then inst_binary openssl fi + inst_binary unsquashfs install_verity_minimal } diff --git a/test/units/testsuite-50.sh b/test/units/testsuite-50.sh index cdb7c0043b..9222fc7010 100755 --- a/test/units/testsuite-50.sh +++ b/test/units/testsuite-50.sh @@ -528,6 +528,18 @@ systemd-confext status systemd-confext unmerge rm -rf /run/confexts/ +unsquashfs -no-xattrs -d /tmp/img "${image}.raw" +systemd-run --unit=test-root-ephemeral \ + -p RootDirectory=/tmp/img \ + -p RootEphemeral=yes \ + -p 
Type=exec \ + bash -c "touch /abc && sleep infinity" +test -n "$(ls -A /var/lib/systemd/ephemeral-trees)" +systemctl stop test-root-ephemeral +# shellcheck disable=SC2016 +timeout 10 bash -c 'while ! test -z "$(ls -A /var/lib/systemd/ephemeral-trees)"; do sleep .5; done' +test ! -f /tmp/img/abc + echo OK >/testok exit 0