diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index b50ddb95e9..560ae252e3 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -2950,6 +2950,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b RootEphemeral = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -3547,6 +3549,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -4189,6 +4193,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -4972,6 +4978,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b RootEphemeral = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -5581,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
@@ -6203,6 +6213,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
@@ -6861,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b RootEphemeral = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -7398,6 +7412,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
@@ -7938,6 +7954,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
@@ -8723,6 +8741,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b RootEphemeral = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -9246,6 +9266,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
@@ -9772,6 +9794,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 938a3c87a9..84eda5c584 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -200,6 +200,26 @@
+
+ RootEphemeral=
+
+ Takes a boolean argument. If enabled, executed processes will run in an ephemeral
+ copy of the root directory or root image. The ephemeral copy is placed in
+ /var/lib/systemd/ephemeral-trees/ while the service is active and is cleaned up
+ when the service is stopped or restarted. If RootDirectory= is used and the root
+ directory is a subvolume, the ephemeral copy will be created by making a snapshot of the subvolume.
+
+
+ To make sure ephemeral copies can be made efficiently, the root directory or root image
+ should be located on the same filesystem as /var/lib/systemd/ephemeral-trees/.
+ When using RootEphemeral= with root directories, btrfs should be used as the
+ filesystem and the root directory should ideally be a subvolume which systemd can
+ snapshot to make the ephemeral copy. For root images, a filesystem with support for reflinks should
+ be used to ensure an efficient ephemeral copy.
+
+
+
+
RootHash=
diff --git a/man/tmpfiles.d.xml b/man/tmpfiles.d.xml
index ef0bb1f7f8..495315d55c 100644
--- a/man/tmpfiles.d.xml
+++ b/man/tmpfiles.d.xml
@@ -661,6 +661,14 @@ d /tmp/foo/bar - - - bmA:1h -
Applications may use this to temporarily exclude certain directory subtrees from the aging algorithm:
the applications can take a BSD file lock themselves, and as long as they keep it aging of the
directory/file and everything below it is disabled.
+
+ This behavior can be used to guarantee cleanup of files or directories whose lifetime
+ should be aligned with the process that created them by having that process create them in a location
+ monitored by systemd-tmpfiles with an age of 0, and having the
+ process immediately lock the directory or file before using it. Because the BSD lock is process
+ specific, the file is guaranteed to be unlocked as soon as the process exits, meaning that even if the
+ process crashes, those files and directories will be unlocked and cleaned up by
+ systemd-tmpfiles.
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 04070a7f1e..80a035ab90 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -1231,6 +1231,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1865,6 +1866,9 @@ int bus_exec_context_set_transient_property(
if (streq(name, "RootDirectory"))
return bus_set_transient_path(u, name, &c->root_directory, message, flags, error);
+ if (streq(name, "RootEphemeral"))
+ return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error);
+
if (streq(name, "SyslogIdentifier"))
return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error);
diff --git a/src/core/execute.c b/src/core/execute.c
index e46875f5b0..b7fe922c7a 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -15,6 +15,8 @@
#include
#include
+#include /* Must be included after */
+
#if HAVE_PAM
#include
#endif
@@ -43,8 +45,10 @@
#include "async.h"
#include "barrier.h"
#include "bpf-lsm.h"
+#include "btrfs-util.h"
#include "cap-list.h"
#include "capability-util.h"
+#include "chattr-util.h"
#include "cgroup-setup.h"
#include "chase.h"
#include "chown-recursive.h"
@@ -66,6 +70,7 @@
#include "io-util.h"
#include "ioprio-util.h"
#include "label-util.h"
+#include "lock-util.h"
#include "log.h"
#include "macro.h"
#include "manager.h"
@@ -2170,6 +2175,10 @@ bool exec_needs_network_namespace(const ExecContext *context) {
return context->private_network || context->network_namespace_path;
}
+static bool exec_needs_ephemeral(const ExecContext *context) {
+ return (context->root_image || context->root_directory) && context->root_ephemeral; /* the flag only has an effect together with a root image or root directory */
+}
+
static bool exec_needs_ipc_namespace(const ExecContext *context) {
assert(context);
@@ -3823,21 +3832,134 @@ static bool insist_on_sandboxing(
return false;
}
+static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ if (!runtime || !runtime->ephemeral_copy)
+ return 0; /* no ephemeral copy was set up for this runtime, nothing to do */
+
+ r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
+
+ CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
+
+ fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+ if (fd >= 0)
+ /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
+ return 0;
+
+ if (fd != -EAGAIN)
+ return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
+
+ log_debug("Making ephemeral snapshot of %s to %s",
+ context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+ if (context->root_image)
+ fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
+ COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
+ else
+ fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
+ AT_FDCWD, runtime->ephemeral_copy,
+ BTRFS_SNAPSHOT_FALLBACK_COPY |
+ BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+ BTRFS_SNAPSHOT_RECURSIVE |
+ BTRFS_SNAPSHOT_LOCK_BSD);
+ if (fd < 0)
+ return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
+ context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+ if (context->root_image) {
+ /* A root image might be subject to lots of random writes so let's try to disable COW on it
+ * which tends to not perform well in combination with lots of random writes.
+ *
+ * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
+ * copy, but we at least want to make the intention clear.
+ */
+ r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy); /* report chattr_fd()'s error; fd is a valid descriptor here, not an errno */
+ }
+
+ r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
+
+ return 1; /* a new ephemeral copy was created and its fd queued on the storage socket */
+}
+
+static int verity_settings_prepare(
+ VeritySettings *verity,
+ const char *root_image,
+ const void *root_hash,
+ size_t root_hash_size,
+ const char *root_hash_path,
+ const void *root_hash_sig,
+ size_t root_hash_sig_size,
+ const char *root_hash_sig_path,
+ const char *verity_data_path) {
+
+ int r;
+
+ assert(verity);
+
+ if (root_hash) {
+ void *d;
+
+ d = memdup(root_hash, root_hash_size); /* private copy; handed over to verity just below */
+ if (!d)
+ return -ENOMEM;
+
+ free_and_replace(verity->root_hash, d);
+ verity->root_hash_size = root_hash_size;
+ verity->designator = PARTITION_ROOT;
+ }
+
+ if (root_hash_sig) {
+ void *d;
+
+ d = memdup(root_hash_sig, root_hash_sig_size); /* private copy; handed over to verity just below */
+ if (!d)
+ return -ENOMEM;
+
+ free_and_replace(verity->root_hash_sig, d);
+ verity->root_hash_sig_size = root_hash_sig_size;
+ verity->designator = PARTITION_ROOT;
+ }
+
+ if (verity_data_path) {
+ r = free_and_strdup(&verity->data_path, verity_data_path);
+ if (r < 0)
+ return r;
+ }
+
+ r = verity_settings_load(
+ verity,
+ root_image,
+ root_hash_path,
+ root_hash_sig_path);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load root hash: %m");
+
+ return 0;
+}
+
static int apply_mount_namespace(
const Unit *u,
ExecCommandFlags command_flags,
const ExecContext *context,
const ExecParameters *params,
- const ExecRuntime *runtime,
+ ExecRuntime *runtime,
const char *memory_pressure_path,
char **error_path) {
+ _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
**read_write_paths_cleanup = NULL;
- const char *tmp_dir = NULL, *var_tmp_dir = NULL;
- const char *root_dir = NULL, *root_image = NULL;
_cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
*extension_dir = NULL;
+ const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
char **read_write_paths;
NamespaceInfo ns_info;
bool needs_sandboxing;
@@ -3850,10 +3972,14 @@ static int apply_mount_namespace(
CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
if (params->flags & EXEC_APPLY_CHROOT) {
- root_image = context->root_image;
+ r = setup_ephemeral(context, runtime);
+ if (r < 0)
+ return r;
- if (!root_image)
- root_dir = context->root_directory;
+ if (context->root_image)
+ root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
+ else
+ root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
}
r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
@@ -3956,6 +4082,17 @@ static int apply_mount_namespace(
if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
return -ENOMEM;
+ if (root_image) {
+ r = verity_settings_prepare(
+ &verity,
+ root_image,
+ context->root_hash, context->root_hash_size, context->root_hash_path,
+ context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
+ context->root_verity);
+ if (r < 0)
+ return r;
+ }
+
r = setup_namespace(
root_dir,
root_image,
@@ -3981,9 +4118,7 @@ static int apply_mount_namespace(
creds_path,
context->log_namespace,
context->mount_propagation_flag,
- context->root_hash, context->root_hash_size, context->root_hash_path,
- context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
- context->root_verity,
+ &verity,
context->extension_images,
context->n_extension_images,
context->extension_image_policy ?: &image_policy_sysext,
@@ -4025,6 +4160,7 @@ static int apply_mount_namespace(
static int apply_working_directory(
const ExecContext *context,
const ExecParameters *params,
+ ExecRuntime *runtime,
const char *home,
int *exit_status) {
@@ -4048,7 +4184,7 @@ static int apply_working_directory(
if (params->flags & EXEC_APPLY_CHROOT)
d = wd;
else
- d = prefix_roota(context->root_directory, wd);
+ d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
if (chdir(d) < 0 && !context->working_directory_missing_ok) {
*exit_status = EXIT_CHDIR;
@@ -4061,6 +4197,7 @@ static int apply_working_directory(
static int apply_root_directory(
const ExecContext *context,
const ExecParameters *params,
+ ExecRuntime *runtime,
const bool needs_mount_ns,
int *exit_status) {
@@ -4069,7 +4206,7 @@ static int apply_root_directory(
if (params->flags & EXEC_APPLY_CHROOT)
if (!needs_mount_ns && context->root_directory)
- if (chroot(context->root_directory) < 0) {
+ if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
*exit_status = EXIT_CHROOT;
return -errno;
}
@@ -4206,7 +4343,7 @@ static int close_remaining_fds(
const int *fds, size_t n_fds) {
size_t n_dont_close = 0;
- int dont_close[n_fds + 12];
+ int dont_close[n_fds + 14];
assert(params);
@@ -4224,6 +4361,9 @@ static int close_remaining_fds(
n_dont_close += n_fds;
}
+ if (runtime)
+ append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
+
if (runtime && runtime->shared) {
append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
@@ -5519,7 +5659,7 @@ static int exec_child(
}
/* chroot to root directory first, before we lose the ability to chroot */
- r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
+ r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
if (r < 0)
return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
@@ -5545,7 +5685,7 @@ static int exec_child(
/* Apply working directory here, because the working directory might be on NFS and only the user running
* this service might have the correct privilege to change to the working directory */
- r = apply_working_directory(context, params, home, exit_status);
+ r = apply_working_directory(context, params, runtime, home, exit_status);
if (r < 0)
return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
@@ -6357,6 +6497,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sUMask: %04o\n"
"%sWorkingDirectory: %s\n"
"%sRootDirectory: %s\n"
+ "%sRootEphemeral: %s\n"
"%sNonBlocking: %s\n"
"%sPrivateTmp: %s\n"
"%sPrivateDevices: %s\n"
@@ -6381,6 +6522,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, c->umask,
prefix, empty_to_root(c->working_directory),
prefix, empty_to_root(c->root_directory),
+ prefix, yes_no(c->root_ephemeral),
prefix, yes_no(c->non_blocking),
prefix, yes_no(c->private_tmp),
prefix, yes_no(c->private_devices),
@@ -7176,13 +7318,30 @@ int exec_command_append(ExecCommand *c, const char *path, ...) {
return 0;
}
-static void *remove_tmpdir_thread(void *p) {
+static void *rm_rf_thread(void *p) {
_cleanup_free_ char *path = p;
- (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
+ (void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); /* best effort: the result is deliberately ignored */
return NULL;
}
+static void asynchronous_rm_rf(char **path) {
+ int r;
+
+ assert(path);
+
+ if (!*path || streq(*path, RUN_SYSTEMD_EMPTY))
+ return; /* unset, or the RUN_SYSTEMD_EMPTY path, which is never removed here */
+
+ log_debug("Spawning thread to nuke %s", *path);
+
+ r = asynchronous_job(rm_rf_thread, *path);
+ if (r < 0)
+ log_warning_errno(r, "Failed to nuke %s: %m", *path); /* *path is left set, so the caller still owns and must free it */
+ else
+ *path = NULL; /* the thread owns the string now; rm_rf_thread() frees it */
+}
+
static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
if (!rt)
return NULL;
@@ -7202,8 +7361,6 @@ DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_ru
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
- int r;
-
if (!rt)
return NULL;
@@ -7213,25 +7370,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
if (rt->n_ref > 0)
return NULL;
- if (rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
- log_debug("Spawning thread to nuke %s", rt->tmp_dir);
-
- r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
- if (r < 0)
- log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
- else
- rt->tmp_dir = NULL;
- }
-
- if (rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
- log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
-
- r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
- if (r < 0)
- log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
- else
- rt->var_tmp_dir = NULL;
- }
+ asynchronous_rm_rf(&rt->tmp_dir);
+ asynchronous_rm_rf(&rt->var_tmp_dir);
return exec_shared_runtime_free(rt);
}
@@ -7666,16 +7806,39 @@ void exec_shared_runtime_vacuum(Manager *m) {
}
}
-int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret) {
+int exec_runtime_make(
+ const Unit *unit,
+ const ExecContext *context,
+ ExecSharedRuntime *shared,
+ DynamicCreds *creds,
+ ExecRuntime **ret) {
+ _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
+ _cleanup_free_ char *ephemeral = NULL;
_cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
+ int r;
+ assert(unit);
+ assert(context);
assert(ret);
- if (!shared && !creds) {
+ if (!shared && !creds && !exec_needs_ephemeral(context)) {
*ret = NULL;
return 0;
}
+ if (exec_needs_ephemeral(context)) {
+ r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
+ if (r < 0)
+ return r;
+
+ r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
+ if (r < 0)
+ return r;
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
+ return -errno;
+ }
+
rt = new(ExecRuntime, 1);
if (!rt)
return -ENOMEM;
@@ -7683,6 +7846,9 @@ int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntim
*rt = (ExecRuntime) {
.shared = shared,
.dynamic_creds = creds,
+ .ephemeral_copy = TAKE_PTR(ephemeral),
+ .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
+ .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
};
*ret = TAKE_PTR(rt);
@@ -7695,6 +7861,11 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
exec_shared_runtime_unref(rt->shared);
dynamic_creds_unref(rt->dynamic_creds);
+
+ asynchronous_rm_rf(&rt->ephemeral_copy);
+
+ free(rt->ephemeral_copy);
+ safe_close_pair(rt->ephemeral_storage_socket);
return mfree(rt);
}
diff --git a/src/core/execute.h b/src/core/execute.h
index 953dc9e7f7..ee73fb6367 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -129,6 +129,14 @@ struct ExecSharedRuntime {
struct ExecRuntime {
ExecSharedRuntime *shared;
DynamicCreds *dynamic_creds;
+
+ /* The path to the ephemeral snapshot of the root directory or root image if one was requested. */
+ char *ephemeral_copy;
+
+ /* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of
+ * the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot
+ * until we're done using it. */
+ int ephemeral_storage_socket[2];
};
typedef enum ExecDirectoryType {
@@ -195,6 +203,7 @@ struct ExecContext {
void *root_hash, *root_hash_sig;
size_t root_hash_size, root_hash_sig_size;
LIST_HEAD(MountOptions, root_image_options);
+ bool root_ephemeral;
bool working_directory_missing_ok:1;
bool working_directory_home:1;
@@ -506,7 +515,7 @@ int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char
int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
void exec_shared_runtime_vacuum(Manager *m);
-int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
+int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
ExecRuntime* exec_runtime_free(ExecRuntime *rt);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
index ae318dae89..382b60ea90 100644
--- a/src/core/load-fragment-gperf.gperf.in
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -10,6 +10,7 @@
{{type}}.RootHash, config_parse_exec_root_hash, 0, offsetof({{type}}, exec_context)
{{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context)
{{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity)
+{{type}}.RootEphemeral, config_parse_bool, 0, offsetof({{type}}, exec_context.root_ephemeral)
{{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories)
{{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context)
{{type}}.ExtensionImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.extension_image_policy)
diff --git a/src/core/namespace.c b/src/core/namespace.c
index fbcc4505b5..bf01c5e028 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -1967,62 +1967,6 @@ static bool home_read_only(
return false;
}
-static int verity_settings_prepare(
- VeritySettings *verity,
- const char *root_image,
- const void *root_hash,
- size_t root_hash_size,
- const char *root_hash_path,
- const void *root_hash_sig,
- size_t root_hash_sig_size,
- const char *root_hash_sig_path,
- const char *verity_data_path) {
-
- int r;
-
- assert(verity);
-
- if (root_hash) {
- void *d;
-
- d = memdup(root_hash, root_hash_size);
- if (!d)
- return -ENOMEM;
-
- free_and_replace(verity->root_hash, d);
- verity->root_hash_size = root_hash_size;
- verity->designator = PARTITION_ROOT;
- }
-
- if (root_hash_sig) {
- void *d;
-
- d = memdup(root_hash_sig, root_hash_sig_size);
- if (!d)
- return -ENOMEM;
-
- free_and_replace(verity->root_hash_sig, d);
- verity->root_hash_sig_size = root_hash_sig_size;
- verity->designator = PARTITION_ROOT;
- }
-
- if (verity_data_path) {
- r = free_and_strdup(&verity->data_path, verity_data_path);
- if (r < 0)
- return r;
- }
-
- r = verity_settings_load(
- verity,
- root_image,
- root_hash_path,
- root_hash_sig_path);
- if (r < 0)
- return log_debug_errno(r, "Failed to load root hash: %m");
-
- return 0;
-}
-
int setup_namespace(
const char* root_directory,
const char* root_image,
@@ -2048,13 +1992,7 @@ int setup_namespace(
const char *creds_path,
const char *log_namespace,
unsigned long mount_propagation_flag,
- const void *root_hash,
- size_t root_hash_size,
- const char *root_hash_path,
- const void *root_hash_sig,
- size_t root_hash_sig_size,
- const char *root_hash_sig_path,
- const char *verity_data_path,
+ VeritySettings *verity,
const MountImage *extension_images,
size_t n_extension_images,
const ImagePolicy *extension_image_policy,
@@ -2067,7 +2005,6 @@ int setup_namespace(
_cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
- _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **hierarchies = NULL;
MountEntry *m = NULL, *mounts = NULL;
bool require_prefix = false, setup_propagate = false;
@@ -2107,16 +2044,7 @@ int setup_namespace(
strv_isempty(read_write_paths))
dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
- r = verity_settings_prepare(
- &verity,
- root_image,
- root_hash, root_hash_size, root_hash_path,
- root_hash_sig, root_hash_sig_size, root_hash_sig_path,
- verity_data_path);
- if (r < 0)
- return r;
-
- SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path);
+ SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity && verity->data_path);
r = loop_device_make_by_path(
root_image,
@@ -2130,7 +2058,7 @@ int setup_namespace(
r = dissect_loop_device(
loop_device,
- &verity,
+ verity,
root_image_mount_options,
root_image_policy,
dissect_image_flags,
@@ -2141,14 +2069,14 @@ int setup_namespace(
r = dissected_image_load_verity_sig_partition(
dissected_image,
loop_device->fd,
- &verity);
+ verity);
if (r < 0)
return r;
r = dissected_image_decrypt(
dissected_image,
NULL,
- &verity,
+ verity,
dissect_image_flags);
if (r < 0)
return log_debug_errno(r, "Failed to decrypt dissected image: %m");
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 39b510f41d..4ddd6a7d58 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -124,13 +124,7 @@ int setup_namespace(
const char *creds_path,
const char *log_namespace,
unsigned long mount_propagation_flag,
- const void *root_hash,
- size_t root_hash_size,
- const char *root_hash_path,
- const void *root_hash_sig,
- size_t root_hash_sig_size,
- const char *root_hash_sig_path,
- const char *root_verity,
+ VeritySettings *verity,
const MountImage *extension_images,
size_t n_extension_images,
const ImagePolicy *extension_image_policy,
diff --git a/src/core/service.c b/src/core/service.c
index cecdd3bf50..146b892e46 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -2029,7 +2029,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
/* Reset NotifyAccess override */
s->notify_access_override = _NOTIFY_ACCESS_INVALID;
- /* We want fresh tmpdirs in case service is started again immediately */
+ /* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */
s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
/* Also, remove the runtime directory */
diff --git a/src/core/unit.c b/src/core/unit.c
index 570234c8f4..f51b5687f8 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -5012,7 +5012,7 @@ int unit_setup_exec_runtime(Unit *u) {
return r;
}
- r = exec_runtime_make(esr, dcreds, rt);
+ r = exec_runtime_make(u, ec, esr, dcreds, rt);
if (r < 0)
return r;
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index 6e93d0ca43..cc287feb8e 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -992,7 +992,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
"LockPersonality",
"ProtectHostname",
"MemoryKSM",
- "RestrictSUIDSGID"))
+ "RestrictSUIDSGID",
+ "RootEphemeral"))
return bus_append_parse_boolean(m, field, eq);
if (STR_IN_SET(field, "ReadWriteDirectories",
diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c
index cf432e2177..a1e4d333c8 100644
--- a/src/shared/dissect-image.c
+++ b/src/shared/dissect-image.c
@@ -3196,7 +3196,7 @@ int dissected_image_load_verity_sig_partition(
return -EINVAL;
if (p->size > 4*1024*1024) /* Signature data cannot possible be larger than 4M, refuse that */
- return -EFBIG;
+ return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), "Verity signature partition is larger than 4M, refusing.");
buf = new(char, p->size+1);
if (!buf)
diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c
index 82be09dd6a..b6ee628533 100644
--- a/src/test/test-namespace.c
+++ b/src/test/test-namespace.c
@@ -197,12 +197,6 @@ TEST(protect_kernel_logs) {
NULL,
0,
NULL,
- 0,
- NULL,
- NULL,
- 0,
- NULL,
- NULL,
NULL,
0,
NULL,
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 485069670b..3a3af3584d 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -99,12 +99,6 @@ int main(int argc, char *argv[]) {
NULL,
0,
NULL,
- 0,
- NULL,
- NULL,
- 0,
- NULL,
- NULL,
NULL,
0,
NULL,
diff --git a/src/tmpfiles/tmpfiles.c b/src/tmpfiles/tmpfiles.c
index 43e1285649..eabac56320 100644
--- a/src/tmpfiles/tmpfiles.c
+++ b/src/tmpfiles/tmpfiles.c
@@ -510,7 +510,9 @@ static DIR* xopendirat_nomod(int dirfd, const char *path) {
if (dir)
return dir;
- log_debug_errno(errno, "Cannot open %sdirectory \"%s\": %m", dirfd == AT_FDCWD ? "" : "sub", path);
+ if (!IN_SET(errno, ENOENT, ELOOP))
+ log_debug_errno(errno, "Cannot open %sdirectory \"%s\": %m", dirfd == AT_FDCWD ? "" : "sub", path);
+
if (errno != EPERM)
return NULL;
@@ -720,7 +722,7 @@ static int dir_cleanup(
}
if (flock(dirfd(sub_dir), LOCK_EX|LOCK_NB) < 0) {
- log_debug_errno(errno, "Couldn't acquire shared BSD lock on directory \"%s\", skipping: %m", p);
+ log_debug_errno(errno, "Couldn't acquire shared BSD lock on directory \"%s\", skipping: %m", sub_path);
continue;
}
@@ -805,10 +807,10 @@ static int dir_cleanup(
O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME,
/* xopen_flags = */ 0,
/* mode = */ 0);
- if (fd < 0 && fd != -ENOENT)
+ if (fd < 0 && !IN_SET(fd, -ENOENT, -ELOOP))
log_warning_errno(fd, "Opening file \"%s\" failed, ignoring: %m", sub_path);
if (fd >= 0 && flock(fd, LOCK_EX|LOCK_NB) < 0 && errno == EAGAIN) {
- log_debug_errno(errno, "Couldn't acquire shared BSD lock on file \"%s\", skipping: %m", p);
+ log_debug_errno(errno, "Couldn't acquire shared BSD lock on file \"%s\", skipping: %m", sub_path);
continue;
}
diff --git a/test/TEST-50-DISSECT/test.sh b/test/TEST-50-DISSECT/test.sh
index bcc81749af..6e5179c842 100755
--- a/test/TEST-50-DISSECT/test.sh
+++ b/test/TEST-50-DISSECT/test.sh
@@ -24,6 +24,7 @@ test_append_files() {
if command -v openssl >/dev/null 2>&1; then
inst_binary openssl
fi
+ inst_binary unsquashfs
install_verity_minimal
}
diff --git a/test/fuzz/fuzz-unit-file/directives-all.service b/test/fuzz/fuzz-unit-file/directives-all.service
index 818fb28dbf..4bdc48a59b 100644
--- a/test/fuzz/fuzz-unit-file/directives-all.service
+++ b/test/fuzz/fuzz-unit-file/directives-all.service
@@ -222,6 +222,7 @@ RootImage=
RootHash=
RootHashSignature=
RootVerity=
+RootEphemeral=
ExtensionDirectories=
ExtensionImages=
RuntimeMaxSec=
diff --git a/test/test-functions b/test/test-functions
index 2b112fbb2a..d57594f17b 100644
--- a/test/test-functions
+++ b/test/test-functions
@@ -691,6 +691,7 @@ install_verity_minimal() {
grep
mount
sleep
+ touch
)
oldinitdir="$initdir"
rm -rfv "$TESTDIR/minimal"
diff --git a/test/units/testsuite-50.sh b/test/units/testsuite-50.sh
index cdb7c0043b..9222fc7010 100755
--- a/test/units/testsuite-50.sh
+++ b/test/units/testsuite-50.sh
@@ -528,6 +528,18 @@ systemd-confext status
systemd-confext unmerge
rm -rf /run/confexts/
+unsquashfs -no-xattrs -d /tmp/img "${image}.raw"
+systemd-run --unit=test-root-ephemeral \
+ -p RootDirectory=/tmp/img \
+ -p RootEphemeral=yes \
+ -p Type=exec \
+ bash -c "touch /abc && sleep infinity"
+test -n "$(ls -A /var/lib/systemd/ephemeral-trees)"
+systemctl stop test-root-ephemeral
+# shellcheck disable=SC2016
+timeout 10 bash -c 'while ! test -z "$(ls -A /var/lib/systemd/ephemeral-trees)"; do sleep .5; done'
+test ! -f /tmp/img/abc
+
echo OK >/testok
exit 0
diff --git a/tmpfiles.d/systemd.conf.in b/tmpfiles.d/systemd.conf.in
index fa838d8d06..3781c579e0 100644
--- a/tmpfiles.d/systemd.conf.in
+++ b/tmpfiles.d/systemd.conf.in
@@ -60,6 +60,10 @@ a+ /var/log/journal/%m/system.journal - - - - group:wheel:r--
d /var/lib/systemd 0755 root root -
d /var/lib/systemd/coredump 0755 root root 3d
+# Files and directories in /var/lib/systemd/ephemeral-trees are locked by pid 1 to prevent tmpfiles from
+# removing them, and tmpfiles is told to unconditionally clean up anything in
+# /var/lib/systemd/ephemeral-trees that isn't locked.
+d /var/lib/systemd/ephemeral-trees 0755 root root 0
d /var/lib/private 0700 root root -
d /var/log/private 0700 root root -