Merge pull request #27942 from DaanDeMeyer/root-ephemeral

core: Add RootEphemeral= setting
This commit is contained in:
Daan De Meyer
2023-06-21 17:24:39 +02:00
committed by GitHub
21 changed files with 313 additions and 144 deletions

View File

@@ -2950,6 +2950,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b RootEphemeral = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -3547,6 +3549,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property RootHashSignaturePath is not documented!-->
<!--property RootEphemeral is not documented!-->
<!--property OOMScoreAdjust is not documented!-->
<!--property CoredumpFilter is not documented!-->
@@ -4189,6 +4193,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
<variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -4972,6 +4978,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b RootEphemeral = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -5581,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property RootHashSignaturePath is not documented!-->
<!--property RootEphemeral is not documented!-->
<!--property OOMScoreAdjust is not documented!-->
<!--property CoredumpFilter is not documented!-->
@@ -6203,6 +6213,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
<variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -6861,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b RootEphemeral = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -7398,6 +7412,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property RootHashSignaturePath is not documented!-->
<!--property RootEphemeral is not documented!-->
<!--property OOMScoreAdjust is not documented!-->
<!--property CoredumpFilter is not documented!-->
@@ -7938,6 +7954,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
<variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -8723,6 +8741,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b RootEphemeral = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -9246,6 +9266,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property RootHashSignaturePath is not documented!-->
<!--property RootEphemeral is not documented!-->
<!--property OOMScoreAdjust is not documented!-->
<!--property CoredumpFilter is not documented!-->
@@ -9772,6 +9794,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
<variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>

View File

@@ -200,6 +200,26 @@
<xi:include href="system-only.xml" xpointer="singular"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>RootEphemeral=</varname></term>
<listitem><para>Takes a boolean argument. If enabled, executed processes will run in an ephemeral
copy of the root directory or root image. The ephemeral copy is placed in
<filename>/var/lib/systemd/ephemeral-trees/</filename> while the service is active and is cleaned up
when the service is stopped or restarted. If <varname>RootDirectory=</varname> is used and the root
directory is a subvolume, the ephemeral copy will be created by making a snapshot of the subvolume.
</para>
<para>To make sure ephemeral copies can be made efficiently, the root directory or root image
should be located on the same filesystem as <filename>/var/lib/systemd/ephemeral-trees/</filename>.
When using <varname>RootEphemeral=</varname> with root directories, btrfs should be used as the
filesystem and the root directory should ideally be a subvolume which <command>systemd</command> can
snapshot to make the ephemeral copy. For root images, a filesystem with support for reflinks should
be used to ensure an efficient ephemeral copy.</para>
<xi:include href="system-only.xml" xpointer="singular"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>RootHash=</varname></term>

View File

@@ -661,6 +661,14 @@ d /tmp/foo/bar - - - bmA:1h -</programlisting></para>
Applications may use this to temporarily exclude certain directory subtrees from the aging algorithm:
the applications can take a BSD file lock themselves, and as long as they keep it aging of the
directory/file and everything below it is disabled.</para>
<para>This behavior can be used to guarantee cleanup of files or directories whose lifetime
should be aligned with the process that created them by having that process create them in a location
monitored by <command>systemd-tmpfiles</command> with an age of <literal>0</literal>, and having the
process immediately lock the directory or file before using it. Because the BSD lock is process
specific, the file is guaranteed to be unlocked as soon as the process exits, meaning that even if the
process crashes, those files and directories will be unlocked and cleaned up by
<command>systemd-tmpfiles</command>.</para>
</refsect2>
<refsect2>

View File

@@ -1231,6 +1231,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1865,6 +1866,9 @@ int bus_exec_context_set_transient_property(
if (streq(name, "RootDirectory"))
return bus_set_transient_path(u, name, &c->root_directory, message, flags, error);
if (streq(name, "RootEphemeral"))
return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error);
if (streq(name, "SyslogIdentifier"))
return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error);

View File

@@ -15,6 +15,8 @@
#include <unistd.h>
#include <utmpx.h>
#include <linux/fs.h> /* Must be included after <sys/mount.h> */
#if HAVE_PAM
#include <security/pam_appl.h>
#endif
@@ -43,8 +45,10 @@
#include "async.h"
#include "barrier.h"
#include "bpf-lsm.h"
#include "btrfs-util.h"
#include "cap-list.h"
#include "capability-util.h"
#include "chattr-util.h"
#include "cgroup-setup.h"
#include "chase.h"
#include "chown-recursive.h"
@@ -66,6 +70,7 @@
#include "io-util.h"
#include "ioprio-util.h"
#include "label-util.h"
#include "lock-util.h"
#include "log.h"
#include "macro.h"
#include "manager.h"
@@ -2170,6 +2175,10 @@ bool exec_needs_network_namespace(const ExecContext *context) {
return context->private_network || context->network_namespace_path;
}
/* Returns true if an ephemeral copy of the root has to be made for this context, i.e. RootEphemeral= is
 * enabled and there actually is a root image or root directory configured to make a copy of. */
static bool exec_needs_ephemeral(const ExecContext *context) {
        assert(context);

        return (context->root_image || context->root_directory) && context->root_ephemeral;
}
static bool exec_needs_ipc_namespace(const ExecContext *context) {
assert(context);
@@ -3823,21 +3832,134 @@ static bool insist_on_sandboxing(
return false;
}
/* Creates the ephemeral copy of the root image or root directory at runtime->ephemeral_copy, unless that
 * already happened.
 *
 * The ephemeral storage socket pair doubles as the "already set up" marker: once the copy exists, a file
 * descriptor referring to it is queued on the socket pair. Setup is serialized by taking a POSIX lock on
 * ephemeral_storage_socket[0] for the duration of this function.
 *
 * Returns 0 if there is nothing to do (no runtime, no ephemeral copy requested, or the copy was already
 * made), 1 if the copy was made by this call, and a negative errno-style value on failure. */
static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
        _cleanup_close_ int fd = -EBADF;
        int r;

        if (!runtime || !runtime->ephemeral_copy)
                return 0;

        r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
        if (r < 0)
                return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");

        CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);

        /* Only peek: the fd has to stay queued on the socket so that later invocations see it too. */
        fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
        if (fd >= 0)
                /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
                return 0;

        if (fd != -EAGAIN)
                return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");

        log_debug("Making ephemeral snapshot of %s to %s",
                  context->root_image ?: context->root_directory, runtime->ephemeral_copy);

        /* NOTE(review): the *_LOCK_BSD flags presumably make the returned fd carry a BSD lock on the copy,
         * which is what keeps tmpfiles away from it — confirm against copy_file()/btrfs_subvol_snapshot_at(). */
        if (context->root_image)
                fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
                               COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
        else
                fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
                                              AT_FDCWD, runtime->ephemeral_copy,
                                              BTRFS_SNAPSHOT_FALLBACK_COPY |
                                              BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
                                              BTRFS_SNAPSHOT_RECURSIVE |
                                              BTRFS_SNAPSHOT_LOCK_BSD);
        if (fd < 0)
                return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
                                       context->root_image ?: context->root_directory, runtime->ephemeral_copy);

        if (context->root_image) {
                /* A root image might be subject to lots of random writes so let's try to disable COW on it
                 * which tends to not perform well in combination with lots of random writes.
                 *
                 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
                 * copy, but we at least want to make the intention clear.
                 */
                r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
                if (r < 0)
                        /* Log the error chattr_fd() returned, not the (valid) file descriptor. */
                        log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
        }

        /* Queue the fd on the socket pair; our local reference is closed on return via _cleanup_close_. */
        r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
        if (r < 0)
                return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");

        return 1;
}
/* Fills in 'verity' from the explicitly configured root hash, root hash signature and verity data path,
 * then lets verity_settings_load() process the image together with the hash/signature paths.
 *
 * Returns 0 on success, -ENOMEM if duplicating one of the buffers fails, or the negative error returned
 * by free_and_strdup()/verity_settings_load(). */
static int verity_settings_prepare(
                VeritySettings *verity,
                const char *root_image,
                const void *root_hash,
                size_t root_hash_size,
                const char *root_hash_path,
                const void *root_hash_sig,
                size_t root_hash_sig_size,
                const char *root_hash_sig_path,
                const char *verity_data_path) {

        int r;

        assert(verity);

        /* Explicitly configured root hash: store a private copy. */
        if (root_hash) {
                void *hash_copy = memdup(root_hash, root_hash_size);
                if (!hash_copy)
                        return -ENOMEM;

                free_and_replace(verity->root_hash, hash_copy);
                verity->root_hash_size = root_hash_size;
                verity->designator = PARTITION_ROOT;
        }

        /* Explicitly configured root hash signature: same treatment. */
        if (root_hash_sig) {
                void *sig_copy = memdup(root_hash_sig, root_hash_sig_size);
                if (!sig_copy)
                        return -ENOMEM;

                free_and_replace(verity->root_hash_sig, sig_copy);
                verity->root_hash_sig_size = root_hash_sig_size;
                verity->designator = PARTITION_ROOT;
        }

        /* Explicitly configured verity data path. */
        if (verity_data_path) {
                r = free_and_strdup(&verity->data_path, verity_data_path);
                if (r < 0)
                        return r;
        }

        r = verity_settings_load(verity, root_image, root_hash_path, root_hash_sig_path);
        if (r < 0)
                return log_debug_errno(r, "Failed to load root hash: %m");

        return 0;
}
static int apply_mount_namespace(
const Unit *u,
ExecCommandFlags command_flags,
const ExecContext *context,
const ExecParameters *params,
const ExecRuntime *runtime,
ExecRuntime *runtime,
const char *memory_pressure_path,
char **error_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
**read_write_paths_cleanup = NULL;
const char *tmp_dir = NULL, *var_tmp_dir = NULL;
const char *root_dir = NULL, *root_image = NULL;
_cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
*extension_dir = NULL;
const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
char **read_write_paths;
NamespaceInfo ns_info;
bool needs_sandboxing;
@@ -3850,10 +3972,14 @@ static int apply_mount_namespace(
CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
if (params->flags & EXEC_APPLY_CHROOT) {
root_image = context->root_image;
r = setup_ephemeral(context, runtime);
if (r < 0)
return r;
if (!root_image)
root_dir = context->root_directory;
if (context->root_image)
root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
else
root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
}
r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
@@ -3956,6 +4082,17 @@ static int apply_mount_namespace(
if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
return -ENOMEM;
if (root_image) {
r = verity_settings_prepare(
&verity,
root_image,
context->root_hash, context->root_hash_size, context->root_hash_path,
context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
context->root_verity);
if (r < 0)
return r;
}
r = setup_namespace(
root_dir,
root_image,
@@ -3981,9 +4118,7 @@ static int apply_mount_namespace(
creds_path,
context->log_namespace,
context->mount_propagation_flag,
context->root_hash, context->root_hash_size, context->root_hash_path,
context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
context->root_verity,
&verity,
context->extension_images,
context->n_extension_images,
context->extension_image_policy ?: &image_policy_sysext,
@@ -4025,6 +4160,7 @@ static int apply_mount_namespace(
static int apply_working_directory(
const ExecContext *context,
const ExecParameters *params,
ExecRuntime *runtime,
const char *home,
int *exit_status) {
@@ -4048,7 +4184,7 @@ static int apply_working_directory(
if (params->flags & EXEC_APPLY_CHROOT)
d = wd;
else
d = prefix_roota(context->root_directory, wd);
d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
if (chdir(d) < 0 && !context->working_directory_missing_ok) {
*exit_status = EXIT_CHDIR;
@@ -4061,6 +4197,7 @@ static int apply_working_directory(
static int apply_root_directory(
const ExecContext *context,
const ExecParameters *params,
ExecRuntime *runtime,
const bool needs_mount_ns,
int *exit_status) {
@@ -4069,7 +4206,7 @@ static int apply_root_directory(
if (params->flags & EXEC_APPLY_CHROOT)
if (!needs_mount_ns && context->root_directory)
if (chroot(context->root_directory) < 0) {
if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
*exit_status = EXIT_CHROOT;
return -errno;
}
@@ -4206,7 +4343,7 @@ static int close_remaining_fds(
const int *fds, size_t n_fds) {
size_t n_dont_close = 0;
int dont_close[n_fds + 12];
int dont_close[n_fds + 14];
assert(params);
@@ -4224,6 +4361,9 @@ static int close_remaining_fds(
n_dont_close += n_fds;
}
if (runtime)
append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
if (runtime && runtime->shared) {
append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
@@ -5519,7 +5659,7 @@ static int exec_child(
}
/* chroot to root directory first, before we lose the ability to chroot */
r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
if (r < 0)
return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
@@ -5545,7 +5685,7 @@ static int exec_child(
/* Apply working directory here, because the working directory might be on NFS and only the user running
* this service might have the correct privilege to change to the working directory */
r = apply_working_directory(context, params, home, exit_status);
r = apply_working_directory(context, params, runtime, home, exit_status);
if (r < 0)
return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
@@ -6357,6 +6497,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sUMask: %04o\n"
"%sWorkingDirectory: %s\n"
"%sRootDirectory: %s\n"
"%sRootEphemeral: %s\n"
"%sNonBlocking: %s\n"
"%sPrivateTmp: %s\n"
"%sPrivateDevices: %s\n"
@@ -6381,6 +6522,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, c->umask,
prefix, empty_to_root(c->working_directory),
prefix, empty_to_root(c->root_directory),
prefix, yes_no(c->root_ephemeral),
prefix, yes_no(c->non_blocking),
prefix, yes_no(c->private_tmp),
prefix, yes_no(c->private_devices),
@@ -7176,13 +7318,30 @@ int exec_command_append(ExecCommand *c, const char *path, ...) {
return 0;
}
static void *remove_tmpdir_thread(void *p) {
static void *rm_rf_thread(void *p) {
_cleanup_free_ char *path = p;
(void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
(void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
return NULL;
}
/* Hands *path over to a background job that removes it recursively. On success the job owns the string
 * and *path is reset to NULL; on failure *path is left untouched and stays owned by the caller.
 * Unset paths and the RUN_SYSTEMD_EMPTY placeholder are skipped. */
static void asynchronous_rm_rf(char **path) {
        int r;

        assert(path);

        if (!*path)
                return;

        if (streq(*path, RUN_SYSTEMD_EMPTY))
                return;

        log_debug("Spawning thread to nuke %s", *path);

        r = asynchronous_job(rm_rf_thread, *path);
        if (r >= 0) {
                /* The thread frees the string once it is done with it. */
                *path = NULL;
                return;
        }

        log_warning_errno(r, "Failed to nuke %s: %m", *path);
}
static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
if (!rt)
return NULL;
@@ -7202,8 +7361,6 @@ DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_ru
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
int r;
if (!rt)
return NULL;
@@ -7213,25 +7370,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
if (rt->n_ref > 0)
return NULL;
if (rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
log_debug("Spawning thread to nuke %s", rt->tmp_dir);
r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
if (r < 0)
log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
else
rt->tmp_dir = NULL;
}
if (rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
if (r < 0)
log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
else
rt->var_tmp_dir = NULL;
}
asynchronous_rm_rf(&rt->tmp_dir);
asynchronous_rm_rf(&rt->var_tmp_dir);
return exec_shared_runtime_free(rt);
}
@@ -7666,16 +7806,39 @@ void exec_shared_runtime_vacuum(Manager *m) {
}
}
int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret) {
int exec_runtime_make(
const Unit *unit,
const ExecContext *context,
ExecSharedRuntime *shared,
DynamicCreds *creds,
ExecRuntime **ret) {
_cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
_cleanup_free_ char *ephemeral = NULL;
_cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
int r;
assert(unit);
assert(context);
assert(ret);
if (!shared && !creds) {
if (!shared && !creds && !exec_needs_ephemeral(context)) {
*ret = NULL;
return 0;
}
if (exec_needs_ephemeral(context)) {
r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
if (r < 0)
return r;
r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
if (r < 0)
return r;
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
return -errno;
}
rt = new(ExecRuntime, 1);
if (!rt)
return -ENOMEM;
@@ -7683,6 +7846,9 @@ int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntim
*rt = (ExecRuntime) {
.shared = shared,
.dynamic_creds = creds,
.ephemeral_copy = TAKE_PTR(ephemeral),
.ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
.ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
};
*ret = TAKE_PTR(rt);
@@ -7695,6 +7861,11 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
exec_shared_runtime_unref(rt->shared);
dynamic_creds_unref(rt->dynamic_creds);
asynchronous_rm_rf(&rt->ephemeral_copy);
free(rt->ephemeral_copy);
safe_close_pair(rt->ephemeral_storage_socket);
return mfree(rt);
}

View File

@@ -129,6 +129,14 @@ struct ExecSharedRuntime {
/* Runtime state assembled by exec_runtime_make(): an optional shared runtime, optional dynamic
 * credentials, and the bookkeeping for the ephemeral root copy. */
struct ExecRuntime {
/* May be NULL; exec_runtime_make() accepts a NULL shared runtime. */
ExecSharedRuntime *shared;
/* May be NULL; exec_runtime_make() accepts NULL dynamic credentials. */
DynamicCreds *dynamic_creds;
/* The path to the ephemeral snapshot of the root directory or root image if one was requested. */
char *ephemeral_copy;
/* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of
* the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot
* until we're done using it. */
int ephemeral_storage_socket[2];
};
typedef enum ExecDirectoryType {
@@ -195,6 +203,7 @@ struct ExecContext {
void *root_hash, *root_hash_sig;
size_t root_hash_size, root_hash_sig_size;
LIST_HEAD(MountOptions, root_image_options);
bool root_ephemeral;
bool working_directory_missing_ok:1;
bool working_directory_home:1;
@@ -506,7 +515,7 @@ int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char
int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
void exec_shared_runtime_vacuum(Manager *m);
int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
ExecRuntime* exec_runtime_free(ExecRuntime *rt);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);

View File

@@ -10,6 +10,7 @@
{{type}}.RootHash, config_parse_exec_root_hash, 0, offsetof({{type}}, exec_context)
{{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context)
{{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity)
{{type}}.RootEphemeral, config_parse_bool, 0, offsetof({{type}}, exec_context.root_ephemeral)
{{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories)
{{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context)
{{type}}.ExtensionImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.extension_image_policy)

View File

@@ -1967,62 +1967,6 @@ static bool home_read_only(
return false;
}
static int verity_settings_prepare(
VeritySettings *verity,
const char *root_image,
const void *root_hash,
size_t root_hash_size,
const char *root_hash_path,
const void *root_hash_sig,
size_t root_hash_sig_size,
const char *root_hash_sig_path,
const char *verity_data_path) {
int r;
assert(verity);
if (root_hash) {
void *d;
d = memdup(root_hash, root_hash_size);
if (!d)
return -ENOMEM;
free_and_replace(verity->root_hash, d);
verity->root_hash_size = root_hash_size;
verity->designator = PARTITION_ROOT;
}
if (root_hash_sig) {
void *d;
d = memdup(root_hash_sig, root_hash_sig_size);
if (!d)
return -ENOMEM;
free_and_replace(verity->root_hash_sig, d);
verity->root_hash_sig_size = root_hash_sig_size;
verity->designator = PARTITION_ROOT;
}
if (verity_data_path) {
r = free_and_strdup(&verity->data_path, verity_data_path);
if (r < 0)
return r;
}
r = verity_settings_load(
verity,
root_image,
root_hash_path,
root_hash_sig_path);
if (r < 0)
return log_debug_errno(r, "Failed to load root hash: %m");
return 0;
}
int setup_namespace(
const char* root_directory,
const char* root_image,
@@ -2048,13 +1992,7 @@ int setup_namespace(
const char *creds_path,
const char *log_namespace,
unsigned long mount_propagation_flag,
const void *root_hash,
size_t root_hash_size,
const char *root_hash_path,
const void *root_hash_sig,
size_t root_hash_sig_size,
const char *root_hash_sig_path,
const char *verity_data_path,
VeritySettings *verity,
const MountImage *extension_images,
size_t n_extension_images,
const ImagePolicy *extension_image_policy,
@@ -2067,7 +2005,6 @@ int setup_namespace(
_cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **hierarchies = NULL;
MountEntry *m = NULL, *mounts = NULL;
bool require_prefix = false, setup_propagate = false;
@@ -2107,16 +2044,7 @@ int setup_namespace(
strv_isempty(read_write_paths))
dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
r = verity_settings_prepare(
&verity,
root_image,
root_hash, root_hash_size, root_hash_path,
root_hash_sig, root_hash_sig_size, root_hash_sig_path,
verity_data_path);
if (r < 0)
return r;
SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path);
SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity && verity->data_path);
r = loop_device_make_by_path(
root_image,
@@ -2130,7 +2058,7 @@ int setup_namespace(
r = dissect_loop_device(
loop_device,
&verity,
verity,
root_image_mount_options,
root_image_policy,
dissect_image_flags,
@@ -2141,14 +2069,14 @@ int setup_namespace(
r = dissected_image_load_verity_sig_partition(
dissected_image,
loop_device->fd,
&verity);
verity);
if (r < 0)
return r;
r = dissected_image_decrypt(
dissected_image,
NULL,
&verity,
verity,
dissect_image_flags);
if (r < 0)
return log_debug_errno(r, "Failed to decrypt dissected image: %m");

View File

@@ -124,13 +124,7 @@ int setup_namespace(
const char *creds_path,
const char *log_namespace,
unsigned long mount_propagation_flag,
const void *root_hash,
size_t root_hash_size,
const char *root_hash_path,
const void *root_hash_sig,
size_t root_hash_sig_size,
const char *root_hash_sig_path,
const char *root_verity,
VeritySettings *verity,
const MountImage *extension_images,
size_t n_extension_images,
const ImagePolicy *extension_image_policy,

View File

@@ -2029,7 +2029,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
/* Reset NotifyAccess override */
s->notify_access_override = _NOTIFY_ACCESS_INVALID;
/* We want fresh tmpdirs in case service is started again immediately */
/* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */
s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
/* Also, remove the runtime directory */

View File

@@ -5012,7 +5012,7 @@ int unit_setup_exec_runtime(Unit *u) {
return r;
}
r = exec_runtime_make(esr, dcreds, rt);
r = exec_runtime_make(u, ec, esr, dcreds, rt);
if (r < 0)
return r;

View File

@@ -992,7 +992,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
"LockPersonality",
"ProtectHostname",
"MemoryKSM",
"RestrictSUIDSGID"))
"RestrictSUIDSGID",
"RootEphemeral"))
return bus_append_parse_boolean(m, field, eq);
if (STR_IN_SET(field, "ReadWriteDirectories",

View File

@@ -3196,7 +3196,7 @@ int dissected_image_load_verity_sig_partition(
return -EINVAL;
if (p->size > 4*1024*1024) /* Signature data cannot possible be larger than 4M, refuse that */
return -EFBIG;
return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), "Verity signature partition is larger than 4M, refusing.");
buf = new(char, p->size+1);
if (!buf)

View File

@@ -197,12 +197,6 @@ TEST(protect_kernel_logs) {
NULL,
0,
NULL,
0,
NULL,
NULL,
0,
NULL,
NULL,
NULL,
0,
NULL,

View File

@@ -99,12 +99,6 @@ int main(int argc, char *argv[]) {
NULL,
0,
NULL,
0,
NULL,
NULL,
0,
NULL,
NULL,
NULL,
0,
NULL,

View File

@@ -510,7 +510,9 @@ static DIR* xopendirat_nomod(int dirfd, const char *path) {
if (dir)
return dir;
log_debug_errno(errno, "Cannot open %sdirectory \"%s\": %m", dirfd == AT_FDCWD ? "" : "sub", path);
if (!IN_SET(errno, ENOENT, ELOOP))
log_debug_errno(errno, "Cannot open %sdirectory \"%s\": %m", dirfd == AT_FDCWD ? "" : "sub", path);
if (errno != EPERM)
return NULL;
@@ -720,7 +722,7 @@ static int dir_cleanup(
}
if (flock(dirfd(sub_dir), LOCK_EX|LOCK_NB) < 0) {
log_debug_errno(errno, "Couldn't acquire shared BSD lock on directory \"%s\", skipping: %m", p);
log_debug_errno(errno, "Couldn't acquire shared BSD lock on directory \"%s\", skipping: %m", sub_path);
continue;
}
@@ -805,10 +807,10 @@ static int dir_cleanup(
O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME,
/* xopen_flags = */ 0,
/* mode = */ 0);
if (fd < 0 && fd != -ENOENT)
if (fd < 0 && !IN_SET(fd, -ENOENT, -ELOOP))
log_warning_errno(fd, "Opening file \"%s\" failed, ignoring: %m", sub_path);
if (fd >= 0 && flock(fd, LOCK_EX|LOCK_NB) < 0 && errno == EAGAIN) {
log_debug_errno(errno, "Couldn't acquire shared BSD lock on file \"%s\", skipping: %m", p);
log_debug_errno(errno, "Couldn't acquire shared BSD lock on file \"%s\", skipping: %m", sub_path);
continue;
}

View File

@@ -24,6 +24,7 @@ test_append_files() {
if command -v openssl >/dev/null 2>&1; then
inst_binary openssl
fi
inst_binary unsquashfs
install_verity_minimal
}

View File

@@ -222,6 +222,7 @@ RootImage=
RootHash=
RootHashSignature=
RootVerity=
RootEphemeral=
ExtensionDirectories=
ExtensionImages=
RuntimeMaxSec=

View File

@@ -691,6 +691,7 @@ install_verity_minimal() {
grep
mount
sleep
touch
)
oldinitdir="$initdir"
rm -rfv "$TESTDIR/minimal"

View File

@@ -528,6 +528,18 @@ systemd-confext status
systemd-confext unmerge
rm -rf /run/confexts/
# RootEphemeral= test: unpack the test image into a plain directory tree to use as RootDirectory=.
unsquashfs -no-xattrs -d /tmp/img "${image}.raw"
# Start a long-running service in an ephemeral copy of /tmp/img that creates /abc inside its root.
systemd-run --unit=test-root-ephemeral \
-p RootDirectory=/tmp/img \
-p RootEphemeral=yes \
-p Type=exec \
bash -c "touch /abc && sleep infinity"
# While the service is running, the ephemeral copy must be present.
test -n "$(ls -A /var/lib/systemd/ephemeral-trees)"
systemctl stop test-root-ephemeral
# After stopping, wait (up to 10s) for the ephemeral copy to be removed again.
# shellcheck disable=SC2016
timeout 10 bash -c 'while ! test -z "$(ls -A /var/lib/systemd/ephemeral-trees)"; do sleep .5; done'
# The file was created in the ephemeral copy, so the original tree must not contain it.
test ! -f /tmp/img/abc
echo OK >/testok
exit 0

View File

@@ -60,6 +60,10 @@ a+ /var/log/journal/%m/system.journal - - - - group:wheel:r--
d /var/lib/systemd 0755 root root -
d /var/lib/systemd/coredump 0755 root root 3d
# Files and directories in /var/lib/systemd/ephemeral-trees are locked by pid 1 to prevent tmpfiles from
# removing them. With an age of 0, tmpfiles is told to unconditionally clean up anything in
# /var/lib/systemd/ephemeral-trees that isn't locked.
d /var/lib/systemd/ephemeral-trees 0755 root root 0
d /var/lib/private 0700 root root -
d /var/log/private 0700 root root -