From 571867ffa76c7829d3901386aa43294852a0363c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 2 Jul 2025 11:12:23 +0200 Subject: [PATCH 01/22] pidref: add pidref_set_pid_and_pidfd_id() This new helper takes both a PID and and a pidfd ID, and initializes a PidRef from it. It ensures they actually belong together and returns an error if not. --- src/basic/pidref.c | 25 +++++++++++++++++++++++++ src/basic/pidref.h | 1 + 2 files changed, 26 insertions(+) diff --git a/src/basic/pidref.c b/src/basic/pidref.c index e4c51dd952..7f66496dd3 100644 --- a/src/basic/pidref.c +++ b/src/basic/pidref.c @@ -118,6 +118,31 @@ int pidref_set_pid(PidRef *pidref, pid_t pid) { return 0; } +int pidref_set_pid_and_pidfd_id( + PidRef *pidref, + pid_t pid, + uint64_t pidfd_id) { + + int r; + + assert(pidref); + + _cleanup_(pidref_done) PidRef n = PIDREF_NULL; + r = pidref_set_pid(&n, pid); + if (r < 0) + return r; + + if (pidfd_id > 0) { + pidref_acquire_pidfd_id(&n); + + if (n.fd_id != pidfd_id) + return -ESRCH; + } + + *pidref = TAKE_PIDREF(n); + return 0; +} + int pidref_set_pidstr(PidRef *pidref, const char *pid) { pid_t nr; int r; diff --git a/src/basic/pidref.h b/src/basic/pidref.h index 724871842b..4d8a084c4b 100644 --- a/src/basic/pidref.h +++ b/src/basic/pidref.h @@ -68,6 +68,7 @@ bool pidref_equal(PidRef *a, PidRef *b); * PIDREF_MAKE_FROM_PID() above, which does not acquire a pidfd.) */ int pidref_set_pid(PidRef *pidref, pid_t pid); int pidref_set_pidstr(PidRef *pidref, const char *pid); +int pidref_set_pid_and_pidfd_id(PidRef *pidref, pid_t pid, uint64_t pidfd_id); int pidref_set_pidfd(PidRef *pidref, int fd); int pidref_set_pidfd_take(PidRef *pidref, int fd); /* takes ownership of the passed pidfd on success */ int pidref_set_pidfd_consume(PidRef *pidref, int fd); /* takes ownership of the passed pidfd in both success and failure */ From 7bb1147b009772c96250e5fef93b96b7023fa36c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 21 May 2025 17:21:36 +0200 Subject: [PATCH 02/22] cgroup-util: add cg_path_get_unit_full() helper and related calls This helper returns not only the unit a cgroup belongs to, but also the cgroup sub-path beyond it. --- src/basic/cgroup-util.c | 51 ++++++++++++++++++++++++------------- src/basic/cgroup-util.h | 15 ++++++++--- src/test/test-cgroup-util.c | 30 ++++++++++++++++++++++ 3 files changed, 76 insertions(+), 20 deletions(-) diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index f3270ff278..0184cbffbe 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -973,15 +973,14 @@ static const char* skip_slices(const char *p) { } } -int cg_path_get_unit(const char *path, char **ret) { - _cleanup_free_ char *unit = NULL; - const char *e; +int cg_path_get_unit_full(const char *path, char **ret_unit, char **ret_subgroup) { int r; assert(path); - e = skip_slices(path); + const char *e = skip_slices(path); + _cleanup_free_ char *unit = NULL; r = cg_path_decode_unit(e, &unit); if (r < 0) return r; @@ -990,8 +989,27 @@ int cg_path_get_unit(const char *path, char **ret) { if (endswith(unit, ".slice")) return -ENXIO; - if (ret) - *ret = TAKE_PTR(unit); + if (ret_subgroup) { + _cleanup_free_ char *subgroup = NULL; + e += strcspn(e, "/"); + e += strspn(e, "/"); + + if (isempty(e)) + subgroup = NULL; + else { + subgroup = strdup(e); + if (!subgroup) + return -ENOMEM; + } + + path_simplify(subgroup); + + *ret_subgroup = TAKE_PTR(subgroup); + } + + if (ret_unit) + *ret_unit = TAKE_PTR(unit); + return 0; } @@ -1017,31 +1035,27 @@ int cg_path_get_unit_path(const char *path, char **ret) { return 0; } -int cg_pid_get_unit(pid_t pid, char **ret_unit) { - _cleanup_free_ char *cgroup = NULL; +int cg_pid_get_unit_full(pid_t pid, char **ret_unit, char **ret_subgroup) { int r; - assert(ret_unit); - + _cleanup_free_ char *cgroup = NULL; r = cg_pid_get_path_shifted(pid, NULL, &cgroup); if (r < 0) return r; - return cg_path_get_unit(cgroup, ret_unit); + return cg_path_get_unit_full(cgroup, ret_unit, ret_subgroup); } -int cg_pidref_get_unit(const PidRef *pidref, char **ret) { - _cleanup_free_ char *unit = NULL; +int cg_pidref_get_unit_full(const PidRef *pidref, char **ret_unit, char **ret_subgroup) { int r; - assert(ret); - if (!pidref_is_set(pidref)) return -ESRCH; if (pidref_is_remote(pidref)) return -EREMOTE; - r = cg_pid_get_unit(pidref->pid, &unit); + _cleanup_free_ char *unit = NULL, *subgroup = NULL; + r = cg_pid_get_unit_full(pidref->pid, &unit, &subgroup); if (r < 0) return r; @@ -1049,7 +1063,10 @@ int cg_pidref_get_unit(const PidRef *pidref, char **ret) { if (r < 0) return r; - *ret = TAKE_PTR(unit); + if (ret_unit) + *ret_unit = TAKE_PTR(unit); + if (ret_subgroup) + *ret_subgroup = TAKE_PTR(subgroup); return 0; } diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index f49fc48832..81424418e0 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -203,7 +203,10 @@ int cg_get_root_path(char **path); int cg_path_get_session(const char *path, char **ret_session); int cg_path_get_owner_uid(const char *path, uid_t *ret_uid); -int cg_path_get_unit(const char *path, char **ret_unit); +int cg_path_get_unit_full(const char *path, char **ret_unit, char **ret_subgroup); +static inline int cg_path_get_unit(const char *path, char **ret_unit) { + return cg_path_get_unit_full(path, ret_unit, NULL); +} int cg_path_get_unit_path(const char *path, char **ret_unit); int cg_path_get_user_unit(const char *path, char **ret_unit); int cg_path_get_machine_name(const char *path, char **ret_machine); @@ -217,8 +220,14 @@ int cg_pid_get_session(pid_t pid, char **ret_session); int cg_pidref_get_session(const PidRef *pidref, char **ret); int cg_pid_get_owner_uid(pid_t pid, uid_t *ret_uid); int cg_pidref_get_owner_uid(const PidRef *pidref, uid_t *ret); -int cg_pid_get_unit(pid_t pid, char **ret_unit); -int cg_pidref_get_unit(const PidRef *pidref, char **ret); +int cg_pid_get_unit_full(pid_t pid, char **ret_unit, char **ret_subgroup); +static inline int cg_pid_get_unit(pid_t pid, char **ret_unit) { + return cg_pid_get_unit_full(pid, ret_unit, NULL); +} +int cg_pidref_get_unit_full(const PidRef *pidref, char **ret_unit, char **ret_subgroup); +static inline int cg_pidref_get_unit(const PidRef *pidref, char **ret_unit) { + return cg_pidref_get_unit_full(pidref, ret_unit, NULL); +} int cg_pid_get_user_unit(pid_t pid, char **ret_unit); int cg_pid_get_machine_name(pid_t pid, char **ret_machine); int cg_pid_get_slice(pid_t pid, char **ret_slice); diff --git a/src/test/test-cgroup-util.c b/src/test/test-cgroup-util.c index 5fa6fe2da3..7c8cfaaa11 100644 --- a/src/test/test-cgroup-util.c +++ b/src/test/test-cgroup-util.c @@ -60,6 +60,36 @@ TEST(path_get_unit) { check_p_g_u("/user.slice/user-1000.slice/user@.service/server.service", -ENXIO, NULL); } +static void check_p_g_u_f(const char *path, int expected_code, const char *expected_unit, const char *expected_subgroup) { + _cleanup_free_ char *unit = NULL, *subgroup = NULL; + int r; + + r = cg_path_get_unit_full(path, &unit, &subgroup); + printf("%s: %s → %s %s %d expected %s %s %d\n", __func__, path, unit, subgroup, r, strnull(expected_unit), strnull(expected_subgroup), expected_code); + ASSERT_EQ(r, expected_code); + ASSERT_STREQ(unit, expected_unit); + ASSERT_STREQ(subgroup, expected_subgroup); +} + +TEST(path_get_unit_full) { + check_p_g_u_f("/system.slice/foobar.service/sdfdsaf", 0, "foobar.service", "sdfdsaf"); + check_p_g_u_f("/system.slice/foobar.service//sdfdsaf", 0, "foobar.service", "sdfdsaf"); + check_p_g_u_f("/system.slice/foobar.service/sdfdsaf/", 0, "foobar.service", "sdfdsaf"); + check_p_g_u_f("/system.slice/foobar.service//sdfdsaf/", 0, "foobar.service", "sdfdsaf"); + check_p_g_u_f("/system.slice/foobar.service//sdfdsaf//", 0, "foobar.service", "sdfdsaf"); + check_p_g_u_f("/system.slice/foobar.service/sdfdsaf/urks", 0, "foobar.service", "sdfdsaf/urks"); + check_p_g_u_f("/system.slice/foobar.service//sdfdsaf//urks", 0, "foobar.service", "sdfdsaf/urks"); + check_p_g_u_f("/system.slice/foobar.service/sdfdsaf/urks/", 0, "foobar.service", "sdfdsaf/urks"); + check_p_g_u_f("/system.slice/foobar.service//sdfdsaf//urks//", 0, "foobar.service", "sdfdsaf/urks"); + check_p_g_u_f("/system.slice/foobar.service", 0, "foobar.service", NULL); + check_p_g_u_f("/system.slice/foobar.service/", 0, "foobar.service", NULL); + check_p_g_u_f("/system.slice/foobar.service//", 0, "foobar.service", NULL); + check_p_g_u_f("/system.slice/", -ENXIO, NULL, NULL); + check_p_g_u_f("/system.slice/piff", -ENXIO, NULL, NULL); + check_p_g_u_f("/system.service/piff", 0, "system.service", "piff"); + check_p_g_u_f("//system.service//piff", 0, "system.service", "piff"); +} + static void check_p_g_u_p(const char *path, int code, const char *result) { _cleanup_free_ char *unit_path = NULL; int r; From d5feeb373cc13d96fa66967a6bdb7461df32c920 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 21 May 2025 17:23:47 +0200 Subject: [PATCH 03/22] machined: optionally track machines in cgroup subgroups --- man/org.freedesktop.machine1.xml | 11 +++++-- src/machine/machine-dbus.c | 1 + src/machine/machine-varlink.c | 2 +- src/machine/machine.c | 41 +++++++++++++++++-------- src/machine/machine.h | 1 + src/machine/machinectl.c | 26 ++++++++++++++-- src/machine/machined-core.c | 37 ++++++++++++++-------- src/machine/machined-dbus.c | 9 ++++-- src/machine/machined-varlink.c | 1 + src/machine/machined.h | 6 ++-- src/shared/bus-unit-procs.c | 8 ++++- src/shared/output-mode.h | 1 + src/shared/varlink-io.systemd.Machine.c | 4 ++- 13 files changed, 110 insertions(+), 38 deletions(-) diff --git a/man/org.freedesktop.machine1.xml b/man/org.freedesktop.machine1.xml index 35b2d64cc8..ea3a706215 100644 --- a/man/org.freedesktop.machine1.xml +++ b/man/org.freedesktop.machine1.xml @@ -506,6 +506,8 @@ node /org/freedesktop/machine1/machine/rawhide { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s Unit = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s Subgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly u Leader = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t LeaderPIDFDId = ...; @@ -598,6 +600,8 @@ node /org/freedesktop/machine1/machine/rawhide { + + @@ -676,6 +680,9 @@ node /org/freedesktop/machine1/machine/rawhide { running, or closing. Note that the state machine is not considered part of the API and states might be removed or added without this being considered API breakage. + + Subgroup contains the sub-control-group path this machine's processes reside + in, relative to the specified unit's control group. @@ -717,9 +724,9 @@ $ gdbus introspect --system \ Machine Objects CopyFromWithFlags() and CopyToWithFlags() were added in version 252. - GetSSHInfo(), VSockCID, SSHAddress + GetSSHInfo(), VSockCID, SSHAddress, and SSHPrivateKeyPath were added in version 256. - LeaderPIDFDId was added in version 258. + LeaderPIDFDId and Subgroup were added in version 258. diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c index f4676106ac..8935b59465 100644 --- a/src/machine/machine-dbus.c +++ b/src/machine/machine-dbus.c @@ -717,6 +717,7 @@ static const sd_bus_vtable machine_vtable[] = { SD_BUS_PROPERTY("Service", "s", NULL, offsetof(Machine, service), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Unit", "s", NULL, offsetof(Machine, unit), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Scope", "s", NULL, offsetof(Machine, unit), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("Subgroup", "s", NULL, offsetof(Machine, subgroup), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Leader", "u", bus_property_get_pid, offsetof(Machine, leader.pid), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("LeaderPIDFDId", "t", bus_property_get_pidfdid, offsetof(Machine, leader), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Class", "s", property_get_class, offsetof(Machine, class), SD_BUS_VTABLE_PROPERTY_CONST), diff --git a/src/machine/machine-varlink.c b/src/machine/machine-varlink.c index 1c2013de51..b5754e2c49 100644 --- a/src/machine/machine-varlink.c +++ b/src/machine/machine-varlink.c @@ -175,7 +175,7 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink return r; if (!machine->allocate_unit) { - r = cg_pidref_get_unit(&machine->leader, &machine->unit); + r = cg_pidref_get_unit_full(&machine->leader, &machine->unit, &machine->subgroup); if (r < 0) return r; } diff --git a/src/machine/machine.c b/src/machine/machine.c index 3975bc1754..7ce512dd62 100644 --- a/src/machine/machine.c +++ b/src/machine/machine.c @@ -134,13 +134,19 @@ Machine* machine_free(Machine *m) { sd_bus_message_unref(m->create_message); free(m->name); - free(m->scope_job); + free(m->state_file); free(m->service); free(m->root_directory); + + free(m->unit); + free(m->subgroup); + free(m->scope_job); + free(m->netif); free(m->ssh_address); free(m->ssh_private_key_path); + return mfree(m); } @@ -156,7 +162,7 @@ int machine_save(Machine *m) { return 0; _cleanup_(unlink_and_freep) char *sl = NULL; /* auto-unlink! */ - if (m->unit) { + if (m->unit && !m->subgroup) { sl = strjoin("/run/systemd/machines/unit:", m->unit); if (!sl) return log_oom(); @@ -244,7 +250,7 @@ int machine_save(Machine *m) { static void machine_unlink(Machine *m) { assert(m); - if (m->unit) { + if (m->unit && !m->subgroup) { const char *sl = strjoina("/run/systemd/machines/unit:", m->unit); (void) unlink(sl); } @@ -266,6 +272,7 @@ int machine_load(Machine *m) { r = parse_env_file(NULL, m->state_file, "NAME", &name, "SCOPE", &m->unit, + "SUBGROUP", &m->subgroup, "SCOPE_JOB", &m->scope_job, "SERVICE", &m->service, "ROOT", &m->root_directory, @@ -380,6 +387,7 @@ static int machine_start_scope( assert(machine); assert(pidref_is_set(&machine->leader)); assert(!machine->unit); + assert(!machine->subgroup); escaped = unit_name_escape(machine->name); if (!escaped) @@ -476,9 +484,11 @@ static int machine_ensure_scope(Machine *m, sd_bus_message *properties, sd_bus_e assert(m->unit); - r = hashmap_ensure_put(&m->manager->machines_by_unit, &string_hash_ops, m->unit, m); - if (r < 0) - return r; + if (!m->subgroup) { + r = hashmap_ensure_put(&m->manager->machines_by_unit, &string_hash_ops, m->unit, m); + if (r < 0) + return r; + } return 0; } @@ -566,7 +576,7 @@ int machine_stop(Machine *m) { if (!IN_SET(m->class, MACHINE_CONTAINER, MACHINE_VM)) return -EOPNOTSUPP; - if (m->unit) { + if (m->unit && !m->subgroup) { _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; char *job = NULL; @@ -637,7 +647,7 @@ bool machine_may_gc(Machine *m, bool drop_not_started) { return false; } - if (m->unit) { + if (m->unit && !m->subgroup) { _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; r = manager_unit_is_active(m->manager, m->unit, &error); @@ -683,14 +693,14 @@ int machine_kill(Machine *m, KillWhom whom, int signo) { if (!IN_SET(m->class, MACHINE_VM, MACHINE_CONTAINER)) return -EOPNOTSUPP; - if (!m->unit) - return -ESRCH; - if (whom == KILL_LEADER) /* If we shall simply kill the leader, do so directly */ return pidref_kill(&m->leader, signo); + if (!m->unit) + return -ESRCH; + /* Otherwise, make PID 1 do it for us, for the entire cgroup */ - return manager_kill_unit(m->manager, m->unit, signo, NULL); + return manager_kill_unit(m->manager, m->unit, m->subgroup, signo, /* error= */ NULL); } int machine_openpt(Machine *m, int flags, char **ret_peer) { @@ -1124,8 +1134,13 @@ void machine_release_unit(Machine *m) { m->referenced = false; } - (void) hashmap_remove_value(m->manager->machines_by_unit, m->unit, m); + if (!m->subgroup) + (void) hashmap_remove_value(m->manager->machines_by_unit, m->unit, m); + m->unit = mfree(m->unit); + + /* Also free the subgroup, because it only makes sense in the context of the unit */ + m->subgroup = mfree(m->subgroup); } int machine_get_uid_shift(Machine *m, uid_t *ret) { diff --git a/src/machine/machine.h b/src/machine/machine.h index 870146fc22..8762263a8b 100644 --- a/src/machine/machine.h +++ b/src/machine/machine.h @@ -45,6 +45,7 @@ typedef struct Machine { char *root_directory; char *unit; + char *subgroup; char *scope_job; PidRef leader; diff --git a/src/machine/machinectl.c b/src/machine/machinectl.c index 116d5ef74d..7731fae708 100644 --- a/src/machine/machinectl.c +++ b/src/machine/machinectl.c @@ -412,9 +412,15 @@ static int list_images(int argc, char *argv[], void *userdata) { return show_table(table, "images"); } -static int show_unit_cgroup(sd_bus *bus, const char *unit, pid_t leader) { +static int show_unit_cgroup( + sd_bus *bus, + const char *unit, + const char *subgroup, + pid_t leader) { + _cleanup_free_ char *cgroup = NULL; _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + OutputFlags extra_flags = 0; int r; assert(bus); @@ -427,8 +433,16 @@ static int show_unit_cgroup(sd_bus *bus, const char *unit, pid_t leader) { if (isempty(cgroup)) return 0; + if (!empty_or_root(subgroup)) { + if (!path_extend(&cgroup, subgroup)) + return log_oom(); + + /* If we have a subcgroup, then hide all processes outside of it */ + extra_flags |= OUTPUT_HIDE_EXTRA; + } + unsigned c = MAX(LESS_BY(columns(), 18U), 10U); - r = unit_show_processes(bus, unit, cgroup, "\t\t ", c, get_output_flags(), &error); + r = unit_show_processes(bus, unit, cgroup, "\t\t ", c, get_output_flags() | extra_flags, &error); if (r == -EBADR) { if (arg_transport == BUS_TRANSPORT_REMOTE) @@ -494,6 +508,7 @@ typedef struct MachineStatusInfo { const char *class; const char *service; const char *unit; + const char *subgroup; const char *root_directory; pid_t leader; struct dual_timestamp timestamp; @@ -589,7 +604,11 @@ static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) { if (i->unit) { printf("\t Unit: %s\n", i->unit); - show_unit_cgroup(bus, i->unit, i->leader); + + if (!empty_or_root(i->subgroup)) + printf("\tSubgroup: %s\n", i->subgroup); + + show_unit_cgroup(bus, i->unit, i->subgroup, i->leader); if (arg_transport == BUS_TRANSPORT_LOCAL) @@ -636,6 +655,7 @@ static int show_machine_info(const char *verb, sd_bus *bus, const char *path, bo { "Class", "s", NULL, offsetof(MachineStatusInfo, class) }, { "Service", "s", NULL, offsetof(MachineStatusInfo, service) }, { "Unit", "s", NULL, offsetof(MachineStatusInfo, unit) }, + { "Subgroup", "s", NULL, offsetof(MachineStatusInfo, subgroup) }, { "RootDirectory", "s", NULL, offsetof(MachineStatusInfo, root_directory) }, { "Leader", "u", NULL, offsetof(MachineStatusInfo, leader) }, { "Timestamp", "t", NULL, offsetof(MachineStatusInfo, timestamp.realtime) }, diff --git a/src/machine/machined-core.c b/src/machine/machined-core.c index 93f1cc5ee8..075ea4a803 100644 --- a/src/machine/machined-core.c +++ b/src/machine/machined-core.c @@ -22,28 +22,41 @@ #include "user-util.h" int manager_get_machine_by_pidref(Manager *m, const PidRef *pidref, Machine **ret) { - Machine *mm; - int r; + _cleanup_(pidref_done) PidRef current = PIDREF_NULL; + Machine *mm = NULL; assert(m); assert(pidref_is_set(pidref)); assert(ret); - mm = hashmap_get(m->machines_by_leader, pidref); - if (!mm) { - _cleanup_free_ char *unit = NULL; + for (;;) { + /* First, compare by leader */ + mm = hashmap_get(m->machines_by_leader, pidref); + if (mm) + break; - r = cg_pidref_get_unit(pidref, &unit); - if (r >= 0) + /* Then look for the unit */ + _cleanup_free_ char *unit = NULL; + if (cg_pidref_get_unit(pidref, &unit) >= 0) { mm = hashmap_get(m->machines_by_unit, unit); - } - if (!mm) { - *ret = NULL; - return 0; + if (mm) + break; + } + + /* Maybe this process is in per-user unit? If so, let's go up the process tree, and check + * that, we should eventually hit PID 1 of the container tree, which we should be able to + * recognize. */ + _cleanup_(pidref_done) PidRef parent = PIDREF_NULL; + if (pidref_get_ppid_as_pidref(pidref, &parent) < 0) + break; + + pidref_done(¤t); + current = TAKE_PIDREF(parent); + pidref = ¤t; } *ret = mm; - return 1; + return !!mm; } int manager_add_machine(Manager *m, const char *name, Machine **ret) { diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c index 6a31e6aa3b..8aa00ba3ed 100644 --- a/src/machine/machined-dbus.c +++ b/src/machine/machined-dbus.c @@ -411,7 +411,7 @@ static int method_register_machine_internal(sd_bus_message *message, bool read_n if (r == 0) return 1; /* Will call us back */ - r = cg_pidref_get_unit(&m->leader, &m->unit); + r = cg_pidref_get_unit_full(&m->leader, &m->unit, &m->subgroup); if (r < 0) { r = sd_bus_error_set_errnof(error, r, "Failed to determine unit of process "PID_FMT" : %m", @@ -1276,11 +1276,14 @@ int manager_stop_unit(Manager *manager, const char *unit, sd_bus_error *error, c return 1; } -int manager_kill_unit(Manager *manager, const char *unit, int signo, sd_bus_error *error) { +int manager_kill_unit(Manager *manager, const char *unit, const char *subgroup, int signo, sd_bus_error *reterr_error) { assert(manager); assert(unit); - return bus_call_method(manager->bus, bus_systemd_mgr, "KillUnit", error, NULL, "ssi", unit, "all", signo); + if (empty_or_root(subgroup)) + return bus_call_method(manager->bus, bus_systemd_mgr, "KillUnit", reterr_error, NULL, "ssi", unit, "all", signo); + + return bus_call_method(manager->bus, bus_systemd_mgr, "KillUnitSubgroup", reterr_error, NULL, "sssi", unit, "cgroup", subgroup, signo); } int manager_unit_is_active(Manager *manager, const char *unit, sd_bus_error *reterr_error) { diff --git a/src/machine/machined-varlink.c b/src/machine/machined-varlink.c index f3351724b4..8551d703e7 100644 --- a/src/machine/machined-varlink.c +++ b/src/machine/machined-varlink.c @@ -477,6 +477,7 @@ static int list_machine_one_and_maybe_read_metadata(sd_varlink *link, Machine *m JSON_BUILD_PAIR_STRING_NON_EMPTY("service", m->service), JSON_BUILD_PAIR_STRING_NON_EMPTY("rootDirectory", m->root_directory), JSON_BUILD_PAIR_STRING_NON_EMPTY("unit", m->unit), + JSON_BUILD_PAIR_STRING_NON_EMPTY("subgroup", m->subgroup), SD_JSON_BUILD_PAIR_CONDITION(pidref_is_set(&m->leader), "leader", JSON_BUILD_PIDREF(&m->leader)), SD_JSON_BUILD_PAIR_CONDITION(dual_timestamp_is_set(&m->timestamp), "timestamp", JSON_BUILD_DUAL_TIMESTAMP(&m->timestamp)), JSON_BUILD_PAIR_UNSIGNED_NOT_EQUAL("vSockCid", m->vsock_cid, VMADDR_CID_ANY), diff --git a/src/machine/machined.h b/src/machine/machined.h index c4753b8bfa..d03577be3c 100644 --- a/src/machine/machined.h +++ b/src/machine/machined.h @@ -10,7 +10,9 @@ typedef struct Manager { sd_bus *bus; Hashmap *machines; - Hashmap *machines_by_unit; + Hashmap *machines_by_unit; /* This hashmap only tracks machines where a system-level encapsulates + * the machine fully, and exclusively. It's not used if a machine is + * run in a cgroup further down the tree. */ Hashmap *machines_by_leader; sd_event_source *deferred_gc_event_source; @@ -44,7 +46,7 @@ int match_properties_changed(sd_bus_message *message, void *userdata, sd_bus_err int match_job_removed(sd_bus_message *message, void *userdata, sd_bus_error *error); int manager_stop_unit(Manager *manager, const char *unit, sd_bus_error *error, char **job); -int manager_kill_unit(Manager *manager, const char *unit, int signo, sd_bus_error *error); +int manager_kill_unit(Manager *manager, const char *unit, const char *subgroup, int signo, sd_bus_error *error); int manager_unref_unit(Manager *m, const char *unit, sd_bus_error *error); int manager_unit_is_active(Manager *manager, const char *unit, sd_bus_error *reterr_errno); int manager_job_is_active(Manager *manager, const char *path, sd_bus_error *reterr_errno); diff --git a/src/shared/bus-unit-procs.c b/src/shared/bus-unit-procs.c index 9aea7db785..bd510efebd 100644 --- a/src/shared/bus-unit-procs.c +++ b/src/shared/bus-unit-procs.c @@ -394,7 +394,13 @@ int unit_show_processes( if (r < 0) goto finish; - r = dump_extra_processes(cgroups, prefix, n_columns, flags); + if (!FLAGS_SET(flags, OUTPUT_HIDE_EXTRA)) { + r = dump_extra_processes(cgroups, prefix, n_columns, flags); + if (r < 0) + goto finish; + } + + r = 0; finish: while ((cg = hashmap_first(cgroups))) diff --git a/src/shared/output-mode.h b/src/shared/output-mode.h index ffd2e4fe80..7ee17df938 100644 --- a/src/shared/output-mode.h +++ b/src/shared/output-mode.h @@ -48,6 +48,7 @@ typedef enum OutputFlags { OUTPUT_KERNEL_THREADS = 1 << 9, OUTPUT_CGROUP_XATTRS = 1 << 10, OUTPUT_CGROUP_ID = 1 << 11, + OUTPUT_HIDE_EXTRA = 1 << 12, } OutputFlags; sd_json_format_flags_t output_mode_to_json_format_flags(OutputMode m); diff --git a/src/shared/varlink-io.systemd.Machine.c b/src/shared/varlink-io.systemd.Machine.c index b80cbc65d0..1bb7c00fa1 100644 --- a/src/shared/varlink-io.systemd.Machine.c +++ b/src/shared/varlink-io.systemd.Machine.c @@ -96,7 +96,9 @@ static SD_VARLINK_DEFINE_METHOD_FULL( SD_VARLINK_FIELD_COMMENT("OS release information of the machine. It contains an array of key value pairs read from the os-release(5) file in the image."), SD_VARLINK_DEFINE_OUTPUT(OSRelease, SD_VARLINK_STRING, SD_VARLINK_NULLABLE|SD_VARLINK_ARRAY), SD_VARLINK_FIELD_COMMENT("Return the base UID/GID of the machine"), - SD_VARLINK_DEFINE_OUTPUT(UIDShift, SD_VARLINK_INT, SD_VARLINK_NULLABLE)); + SD_VARLINK_DEFINE_OUTPUT(UIDShift, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("Subcgroup path of the machine, relative to the unit's cgroup path"), + SD_VARLINK_DEFINE_OUTPUT(Subgroup, SD_VARLINK_STRING, SD_VARLINK_NULLABLE)); static SD_VARLINK_DEFINE_ENUM_TYPE( MachineOpenMode, From 276d20018623ef14956ce87975be48da5de63f29 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 May 2025 15:30:22 +0200 Subject: [PATCH 04/22] machined: track UID owner of machines Now that unpriv clients can register machines, let's register their UID too. This allows us to do two things: 1. make sure the scope delegation is assigned to the right UID (so that the unpriv user can actually create cgroups below the delegated scope) 2. permit certain types of access (i.e. killing, or pty access) to the client without auth if it owns the machine. --- man/org.freedesktop.machine1.xml | 9 +++++- src/machine/machine-dbus.c | 28 ++++++++++++++---- src/machine/machine-varlink.c | 23 ++++++++++++--- src/machine/machine.c | 39 +++++++++++++++++++------ src/machine/machine.h | 2 ++ src/machine/machinectl.c | 5 ++++ src/machine/machined-dbus.c | 11 +++++++ src/machine/machined-varlink.c | 3 +- src/shared/varlink-io.systemd.Machine.c | 4 ++- 9 files changed, 102 insertions(+), 22 deletions(-) diff --git a/man/org.freedesktop.machine1.xml b/man/org.freedesktop.machine1.xml index ea3a706215..c52aed0dbc 100644 --- a/man/org.freedesktop.machine1.xml +++ b/man/org.freedesktop.machine1.xml @@ -525,6 +525,8 @@ node /org/freedesktop/machine1/machine/rawhide { readonly s SSHPrivateKeyPath = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s State = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly u UID = ...; }; interface org.freedesktop.DBus.Peer { ... }; interface org.freedesktop.DBus.Introspectable { ... }; @@ -620,6 +622,8 @@ node /org/freedesktop/machine1/machine/rawhide { + + @@ -683,6 +687,8 @@ node /org/freedesktop/machine1/machine/rawhide { Subgroup contains the sub-control-group path this machine's processes reside in, relative to the specified unit's control group. + + UID contains the numeric UNIX UID of the user who registered the machine. @@ -726,7 +732,8 @@ $ gdbus introspect --system \ CopyToWithFlags() were added in version 252. GetSSHInfo(), VSockCID, SSHAddress, and SSHPrivateKeyPath were added in version 256. - LeaderPIDFDId and Subgroup were added in version 258. + LeaderPIDFDId, Subgroup, and UID were + added in version 258. diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c index 8935b59465..321ddbfdcc 100644 --- a/src/machine/machine-dbus.c +++ b/src/machine/machine-dbus.c @@ -60,10 +60,12 @@ int bus_machine_method_unregister(sd_bus_message *message, void *userdata, sd_bu NULL }; - r = bus_verify_polkit_async( + r = bus_verify_polkit_async_full( message, "org.freedesktop.machine1.manage-machines", details, + m->uid, + /* flags= */ 0, &m->manager->polkit_registry, error); if (r < 0) @@ -90,10 +92,12 @@ int bus_machine_method_terminate(sd_bus_message *message, void *userdata, sd_bus NULL }; - r = bus_verify_polkit_async( + r = bus_verify_polkit_async_full( message, "org.freedesktop.machine1.manage-machines", details, + m->uid, + /* flags= */ 0, &m->manager->polkit_registry, error); if (r < 0) @@ -138,10 +142,12 @@ int bus_machine_method_kill(sd_bus_message *message, void *userdata, sd_bus_erro NULL }; - r = bus_verify_polkit_async( + r = bus_verify_polkit_async_full( message, "org.freedesktop.machine1.manage-machines", details, + m->uid, + /* flags= */ 0, &m->manager->polkit_registry, error); if (r < 0) @@ -258,10 +264,12 @@ int bus_machine_method_open_pty(sd_bus_message *message, void *userdata, sd_bus_ NULL }; - r = bus_verify_polkit_async( + r = bus_verify_polkit_async_full( message, m->class == MACHINE_HOST ? "org.freedesktop.machine1.host-open-pty" : "org.freedesktop.machine1.open-pty", details, + m->uid, + /* flags= */ 0, &m->manager->polkit_registry, error); if (r < 0) @@ -299,10 +307,12 @@ int bus_machine_method_open_login(sd_bus_message *message, void *userdata, sd_bu NULL }; - r = bus_verify_polkit_async( + r = bus_verify_polkit_async_full( message, m->class == MACHINE_HOST ? "org.freedesktop.machine1.host-login" : "org.freedesktop.machine1.login", details, + m->uid, + /* flags= */ 0, &m->manager->polkit_registry, error); if (r < 0) @@ -383,10 +393,12 @@ int bus_machine_method_open_shell(sd_bus_message *message, void *userdata, sd_bu NULL }; - r = bus_verify_polkit_async( + r = bus_verify_polkit_async_full( message, m->class == MACHINE_HOST ? "org.freedesktop.machine1.host-shell" : "org.freedesktop.machine1.shell", details, + m->uid, + /* flags= */ 0, &m->manager->polkit_registry, error); if (r < 0) @@ -446,6 +458,7 @@ int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bu NULL }; + /* NB: For now not opened up to owner of machine without auth */ r = bus_verify_polkit_async( message, "org.freedesktop.machine1.manage-machines", @@ -531,6 +544,7 @@ int bus_machine_method_copy(sd_bus_message *message, void *userdata, sd_bus_erro NULL }; + /* NB: For now not opened up to owner of machine without auth */ r = bus_verify_polkit_async( message, "org.freedesktop.machine1.manage-machines", @@ -574,6 +588,7 @@ int bus_machine_method_open_root_directory(sd_bus_message *message, void *userda NULL }; + /* NB: For now not opened up to owner of machine without auth */ r = bus_verify_polkit_async( message, "org.freedesktop.machine1.manage-machines", @@ -727,6 +742,7 @@ static const sd_bus_vtable machine_vtable[] = { SD_BUS_PROPERTY("SSHAddress", "s", NULL, offsetof(Machine, ssh_address), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SSHPrivateKeyPath", "s", NULL, offsetof(Machine, ssh_private_key_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("State", "s", property_get_state, 0, 0), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Machine, uid), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_METHOD("Terminate", NULL, diff --git a/src/machine/machine-varlink.c b/src/machine/machine-varlink.c index b5754e2c49..8c437efc17 100644 --- a/src/machine/machine-varlink.c +++ b/src/machine/machine-varlink.c @@ -168,6 +168,10 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink return r; } + r = sd_varlink_get_peer_uid(link, &machine->uid); + if (r < 0) + return r; + r = machine_link(manager, machine); if (r == -EEXIST) return sd_varlink_error(link, VARLINK_ERROR_MACHINE_EXISTS, NULL); @@ -278,12 +282,14 @@ int vl_method_unregister_internal(sd_varlink *link, sd_json_variant *parameters, Manager *manager = ASSERT_PTR(machine->manager); int r; - r = varlink_verify_polkit_async( + r = varlink_verify_polkit_async_full( link, manager->bus, "org.freedesktop.machine1.manage-machines", (const char**) STRV_MAKE("name", machine->name, "verb", "unregister"), + machine->uid, + /* flags= */ 0, &manager->polkit_registry); if (r <= 0) return r; @@ -300,12 +306,14 @@ int vl_method_terminate_internal(sd_varlink *link, sd_json_variant *parameters, Manager *manager = ASSERT_PTR(machine->manager); int r; - r = varlink_verify_polkit_async( + r = varlink_verify_polkit_async_full( link, manager->bus, "org.freedesktop.machine1.manage-machines", (const char**) STRV_MAKE("name", machine->name, "verb", "terminate"), + machine->uid, + /* flags= */ 0, &manager->polkit_registry); if (r <= 0) return r; @@ -368,12 +376,14 @@ int vl_method_kill(sd_varlink *link, sd_json_variant *parameters, sd_varlink_met return sd_varlink_error_invalid_parameter_name(link, "whom"); } - r = varlink_verify_polkit_async( + r = varlink_verify_polkit_async_full( link, manager->bus, "org.freedesktop.machine1.manage-machines", (const char**) STRV_MAKE("name", machine->name, "verb", "kill"), + machine->uid, + /* flags= */ 0, &manager->polkit_registry); if (r <= 0) return r; @@ -510,11 +520,13 @@ int vl_method_open(sd_varlink *link, sd_json_variant *parameters, sd_varlink_met return r; polkit_details = machine_open_polkit_details(p.mode, machine->name, user, path, command_line); - r = varlink_verify_polkit_async( + r = varlink_verify_polkit_async_full( link, manager->bus, machine_open_polkit_action(p.mode, machine->class), (const char**) polkit_details, + machine->uid, + /* flags= */ 0, &manager->polkit_registry); if (r <= 0) return r; @@ -788,6 +800,7 @@ int vl_method_bind_mount(sd_varlink *link, sd_json_variant *parameters, sd_varli if (machine->class != MACHINE_CONTAINER) return sd_varlink_error(link, VARLINK_ERROR_MACHINE_NOT_SUPPORTED, NULL); + /* NB: For now not opened up to owner of machine without auth */ r = varlink_verify_polkit_async( link, manager->bus, @@ -899,6 +912,7 @@ int vl_method_copy_internal(sd_varlink *link, sd_json_variant *parameters, sd_va if (machine->class != MACHINE_CONTAINER) return sd_varlink_error(link, VARLINK_ERROR_MACHINE_NOT_SUPPORTED, NULL); + /* NB: For now not opened up to owner of machine without auth */ r = varlink_verify_polkit_async( link, manager->bus, @@ -928,6 +942,7 @@ int vl_method_open_root_directory_internal(sd_varlink *link, sd_json_variant *pa Manager *manager = ASSERT_PTR(machine->manager); int r; + /* NB: For now not opened up to owner of machine without auth */ r = varlink_verify_polkit_async( link, manager->bus, diff --git a/src/machine/machine.c b/src/machine/machine.c index 7ce512dd62..91c1450184 100644 --- a/src/machine/machine.c +++ b/src/machine/machine.c @@ -183,8 +183,10 @@ int machine_save(Machine *m) { fprintf(f, "# This is private data. Do not parse.\n" - "NAME=%s\n", - m->name); + "NAME=%s\n" + "UID=" UID_FMT "\n", + m->name, + m->uid); /* We continue to call this "SCOPE=" because it is internal only, and we want to stay compatible with old files */ env_file_fputs_assignment(f, "SCOPE=", m->unit); @@ -261,7 +263,7 @@ static void machine_unlink(Machine *m) { int machine_load(Machine *m) { _cleanup_free_ char *name = NULL, *realtime = NULL, *monotonic = NULL, *id = NULL, *leader = NULL, *leader_pidfdid = NULL, - *class = NULL, *netif = NULL, *vsock_cid = NULL; + *class = NULL, *netif = NULL, *vsock_cid = NULL, *uid = NULL; int r; assert(m); @@ -285,7 +287,8 @@ int machine_load(Machine *m) { "NETIF", &netif, "VSOCK_CID", &vsock_cid, "SSH_ADDRESS", &m->ssh_address, - "SSH_PRIVATE_KEY_PATH", &m->ssh_private_key_path); + "SSH_PRIVATE_KEY_PATH", &m->ssh_private_key_path, + "UID", &uid); if (r == -ENOENT) return 0; if (r < 0) @@ -369,6 +372,10 @@ int machine_load(Machine *m) { log_warning_errno(r, "Failed to parse AF_VSOCK CID, ignoring: %s", vsock_cid); } + r = parse_uid(uid, &m->uid); + if (r < 0) + log_warning_errno(r, "Failed to parse owning UID, ignoring: %s", uid); + return r; } @@ -426,14 +433,28 @@ static int machine_start_scope( if (r < 0) return r; - r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)", - "Delegate", "b", 1, - "CollectMode", "s", "inactive-or-failed", - "AddRef", "b", 1, - "TasksMax", "t", UINT64_C(16384)); + r = sd_bus_message_append( + m, "(sv)(sv)(sv)(sv)", + "Delegate", "b", 1, + "CollectMode", "s", "inactive-or-failed", + "AddRef", "b", 1, + "TasksMax", "t", UINT64_C(16384)); if (r < 0) return r; + if (machine->uid != 0) { + _cleanup_free_ char *u = NULL; + + if (asprintf(&u, UID_FMT, machine->uid) < 0) + return -ENOMEM; + + r = sd_bus_message_append( + m, "(sv)", + "User", "s", u); + if (r < 0) + return r; + } + if (more_properties) { r = sd_bus_message_copy(m, more_properties, true); if (r < 0) diff --git a/src/machine/machine.h b/src/machine/machine.h index 8762263a8b..dddc0c8000 100644 --- a/src/machine/machine.h +++ b/src/machine/machine.h @@ -38,6 +38,8 @@ typedef struct Machine { char *name; sd_id128_t id; + uid_t uid; + MachineClass class; char *state_file; diff --git a/src/machine/machinectl.c b/src/machine/machinectl.c index 7731fae708..d31ef407f4 100644 --- a/src/machine/machinectl.c +++ b/src/machine/machinectl.c @@ -514,6 +514,7 @@ typedef struct MachineStatusInfo { struct dual_timestamp timestamp; int *netif; size_t n_netif; + uid_t uid; } MachineStatusInfo; static void machine_status_info_clear(MachineStatusInfo *info) { @@ -567,6 +568,9 @@ static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) { } else if (i->class) printf("\t Class: %s\n", i->class); + if (i->uid != 0) + printf("\t UID: " UID_FMT "\n", i->uid); + if (i->root_directory) printf("\t Root: %s\n", i->root_directory); @@ -662,6 +666,7 @@ static int show_machine_info(const char *verb, sd_bus *bus, const char *path, bo { "TimestampMonotonic", "t", NULL, offsetof(MachineStatusInfo, timestamp.monotonic) }, { "Id", "ay", bus_map_id128, offsetof(MachineStatusInfo, id) }, { "NetworkInterfaces", "ai", map_netif, 0 }, + { "UID", "u", NULL, offsetof(MachineStatusInfo, uid) }, {} }; diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c index 8aa00ba3ed..30f722a449 100644 --- a/src/machine/machined-dbus.c +++ b/src/machine/machined-dbus.c @@ -300,6 +300,16 @@ static int method_create_or_register_machine( if (hashmap_get(manager->machines, name)) return sd_bus_error_setf(error, BUS_ERROR_MACHINE_EXISTS, "Machine '%s' already exists", name); + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + uid_t uid; + r = sd_bus_creds_get_euid(creds, &uid); + if (r < 0) + return r; + const char *details[] = { "name", name, "class", machine_class_to_string(c), @@ -324,6 +334,7 @@ static int method_create_or_register_machine( m->leader = TAKE_PIDREF(pidref); m->class = c; m->id = id; + m->uid = uid; if (!isempty(service)) { m->service = strdup(service); diff --git a/src/machine/machined-varlink.c b/src/machine/machined-varlink.c index 8551d703e7..7dac3cb0d2 100644 --- a/src/machine/machined-varlink.c +++ b/src/machine/machined-varlink.c @@ -485,7 +485,8 @@ static int list_machine_one_and_maybe_read_metadata(sd_varlink *link, Machine *m JSON_BUILD_PAIR_STRING_NON_EMPTY("sshPrivateKeyPath", m->ssh_private_key_path), JSON_BUILD_PAIR_VARIANT_NON_NULL("addresses", addr_array), JSON_BUILD_PAIR_STRV_ENV_PAIR_NON_EMPTY("OSRelease", os_release), - JSON_BUILD_PAIR_UNSIGNED_NOT_EQUAL("UIDShift", shift, UID_INVALID)); + JSON_BUILD_PAIR_UNSIGNED_NOT_EQUAL("UIDShift", shift, UID_INVALID), + SD_JSON_BUILD_PAIR_UNSIGNED("UID", m->uid)); if (r < 0) return r; diff --git a/src/shared/varlink-io.systemd.Machine.c b/src/shared/varlink-io.systemd.Machine.c index 1bb7c00fa1..ad7dff228f 100644 --- a/src/shared/varlink-io.systemd.Machine.c +++ b/src/shared/varlink-io.systemd.Machine.c @@ -98,7 +98,9 @@ static SD_VARLINK_DEFINE_METHOD_FULL( SD_VARLINK_FIELD_COMMENT("Return the base UID/GID of the machine"), SD_VARLINK_DEFINE_OUTPUT(UIDShift, SD_VARLINK_INT, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("Subcgroup path of the machine, relative to the unit's cgroup path"), - SD_VARLINK_DEFINE_OUTPUT(Subgroup, SD_VARLINK_STRING, SD_VARLINK_NULLABLE)); + SD_VARLINK_DEFINE_OUTPUT(Subgroup, SD_VARLINK_STRING, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("Numeric UNIX UID of the user who registered the machine"), + SD_VARLINK_DEFINE_OUTPUT(UID, SD_VARLINK_INT, SD_VARLINK_NULLABLE)); static SD_VARLINK_DEFINE_ENUM_TYPE( MachineOpenMode, From adaff8eb35d9c471af81fddaa4403bc5843a256f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 May 2025 22:10:36 +0200 Subject: [PATCH 05/22] machined: use different polkit actions for registering and creating a machine The difference between these two operations are large: one is relatively superficial: for "registration" all resources remain associated with the invoking user, only the cgroup is reported to machined which then keeps track of the machine, too. OTOH "creation" a scope is allocated in system context, hence the invoked code will be owned by the system, and its resource usage charged against the system. Hence, use two distinct polkit actions for this, so that we can relax access to registration, but keep access to creation tough. --- src/machine/machine-varlink.c | 2 +- src/machine/machined-dbus.c | 7 ++++--- src/machine/org.freedesktop.machine1.policy | 12 +++++++++++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/machine/machine-varlink.c b/src/machine/machine-varlink.c index 8c437efc17..a773094cfe 100644 --- a/src/machine/machine-varlink.c +++ b/src/machine/machine-varlink.c @@ -155,7 +155,7 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink r = varlink_verify_polkit_async( link, manager->bus, - "org.freedesktop.machine1.create-machine", + machine->allocate_unit ? "org.freedesktop.machine1.create-machine" : "org.freedesktop.machine1.register-machine", (const char**) STRV_MAKE("name", machine->name, "class", machine_class_to_string(machine->class)), &manager->polkit_registry); diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c index 30f722a449..82c0addefb 100644 --- a/src/machine/machined-dbus.c +++ b/src/machine/machined-dbus.c @@ -228,6 +228,7 @@ static int method_list_machines(sd_bus_message *message, void *userdata, sd_bus_ static int method_create_or_register_machine( Manager *manager, sd_bus_message *message, + const char *polkit_action, bool read_network, Machine **ret, sd_bus_error *error) { @@ -318,7 +319,7 @@ static int method_create_or_register_machine( r = bus_verify_polkit_async( message, - "org.freedesktop.machine1.create-machine", + polkit_action, details, &manager->polkit_registry, error); @@ -378,7 +379,7 @@ static int method_create_machine_internal(sd_bus_message *message, bool read_net assert(message); - r = method_create_or_register_machine(manager, message, read_network, &m, error); + r = method_create_or_register_machine(manager, message, "org.freedesktop.machine1.create-machine", read_network, &m, error); if (r < 0) return r; if (r == 0) @@ -416,7 +417,7 @@ static int method_register_machine_internal(sd_bus_message *message, bool read_n assert(message); - r = method_create_or_register_machine(manager, message, read_network, &m, error); + r = method_create_or_register_machine(manager, message, "org.freedesktop.machine1.register-machine", read_network, &m, error); if (r < 0) return r; if (r == 0) diff --git a/src/machine/org.freedesktop.machine1.policy b/src/machine/org.freedesktop.machine1.policy index fe125ed0db..d5b8d83d2a 100644 --- a/src/machine/org.freedesktop.machine1.policy +++ b/src/machine/org.freedesktop.machine1.policy @@ -99,7 +99,17 @@ auth_admin auth_admin_keep - org.freedesktop.login1.shell org.freedesktop.login1.login + org.freedesktop.login1.shell org.freedesktop.login1.login org.freedesktop.machine1.register-machine + + + + Register a local virtual machine or container + Authentication is required to register a local virtual machine or container. + + auth_admin + auth_admin + yes + From 97754cd14dc7b3630585383ecba92191667860e4 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 2 Jul 2025 11:20:23 +0200 Subject: [PATCH 06/22] machined: also track 'supervisor' process of a machine So far, machined strictly tracked the "leader" process of a machine, i.e. the topmost process that is actually the payload of the machine. Its runtime also defines the runtime of the machine, and we can directly interact with it if we need to, for example for containers to join the namespaces, or kill it. Let's optionally also track the "supervisor" process of a machine, i.e. the host process that manages the payload if there is one. This is generally useful info, but in particular is useful because we might need to communicate with it to shutdown a machine without cooperation of the payload. Traditionally we did this by simply stopping the unit of the machine, but this is not doable now that the host machined can be used to track per-user machines. In the long run we probably want a more bespoke protocol between machined and supervisors (so that we can execute other commands too, such as request cooperative reboots/shutdowns), but that's for later. Some environments call the concept "monitor" rather than "supervisor" or use some other term. I stuck to "supervisor" because nspawn uses this, and ultimately one name is as good as another. And of course, in other implementations of VM managers of containers there might not be a single process tracking each VM/container. Because of this, the concept of a supervisor is optional. --- man/machinectl.xml | 10 +- man/org.freedesktop.machine1.xml | 24 ++++- src/machine/machine-dbus.c | 2 + src/machine/machine-varlink.c | 52 +++++---- src/machine/machine.c | 134 +++++++++++++++++------- src/machine/machine.h | 13 ++- src/machine/machined-dbus.c | 19 +++- src/machine/machined-varlink.c | 1 + src/shared/varlink-io.systemd.Machine.c | 8 +- 9 files changed, 191 insertions(+), 72 deletions(-) diff --git a/man/machinectl.xml b/man/machinectl.xml index 13fa0eae71..e64a20bb1d 100644 --- a/man/machinectl.xml +++ b/man/machinectl.xml @@ -595,12 +595,10 @@ - When used with kill, choose - which processes to kill. Must be one of - , or to select - whether to kill only the leader process of the machine or all - processes of the machine. If omitted, defaults to - . + When used with kill, choose which processes to kill. Must be one + of , , or to select whether to + kill only the leader process of the machine, the supervisor process of the machine, or all processes + of the machine. If omitted, defaults to . diff --git a/man/org.freedesktop.machine1.xml b/man/org.freedesktop.machine1.xml index c52aed0dbc..ccd81be659 100644 --- a/man/org.freedesktop.machine1.xml +++ b/man/org.freedesktop.machine1.xml @@ -512,6 +512,10 @@ node /org/freedesktop/machine1/machine/rawhide { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t LeaderPIDFDId = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly u Supervisor = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly t SupervisorPIDFDId = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s Class = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootDirectory = '...'; @@ -608,6 +612,10 @@ node /org/freedesktop/machine1/machine/rawhide { + + + + @@ -655,11 +663,20 @@ node /org/freedesktop/machine1/machine/rawhide { Unit is the systemd scope or service unit name for the machine. - Leader is the PID of the leader process of the machine. + Leader is the PID of the leader process of the machine. (The leader process is the + top-level process of the payload of the machine, i.e. for containers PID 1 inside the container, and + for VMs the process encapsulating the VM. Its lifetime defines the lifetime of the machine.) LeaderPIDFDId encodes the Linux pidfd inode ID of the leader process of the machine. + Supervisor is the PID of the supervisor process of the machine. (The + supervisor process is the process that manages the machine, if there's a separate one for it. A + supervisor process is not always defined, and does not define the machine's lifetime.) + + SupervisorPIDFDId encodes the Linux pidfd inode ID of the supervisor process of the + machine. + Class is the class of the machine and is either the string "vm" (for real VMs based on virtualized hardware) or "container" (for lightweight userspace virtualization sharing the same kernel as the host). @@ -732,8 +749,9 @@ $ gdbus introspect --system \ CopyToWithFlags() were added in version 252. GetSSHInfo(), VSockCID, SSHAddress, and SSHPrivateKeyPath were added in version 256. - LeaderPIDFDId, Subgroup, and UID were - added in version 258. + LeaderPIDFDId, Supervisor, + SupervisorPIDFDId, Subgroup, and UID were added + in version 258. diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c index 321ddbfdcc..f903a1371e 100644 --- a/src/machine/machine-dbus.c +++ b/src/machine/machine-dbus.c @@ -735,6 +735,8 @@ static const sd_bus_vtable machine_vtable[] = { SD_BUS_PROPERTY("Subgroup", "s", NULL, offsetof(Machine, subgroup), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Leader", "u", bus_property_get_pid, offsetof(Machine, leader.pid), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("LeaderPIDFDId", "t", bus_property_get_pidfdid, offsetof(Machine, leader), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Supervisor", "u", bus_property_get_pid, offsetof(Machine, supervisor.pid), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SupervisorPIDFDId", "t", bus_property_get_pidfdid, offsetof(Machine, supervisor), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Class", "s", property_get_class, offsetof(Machine, class), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(Machine, root_directory), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("NetworkInterfaces", "ai", property_get_netif, 0, SD_BUS_VTABLE_PROPERTY_CONST), diff --git a/src/machine/machine-varlink.c b/src/machine/machine-varlink.c index a773094cfe..776654b51d 100644 --- a/src/machine/machine-varlink.c +++ b/src/machine/machine-varlink.c @@ -46,8 +46,8 @@ static int machine_name(const char *name, sd_json_variant *variant, sd_json_disp return 0; } -static int machine_leader(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) { - PidRef *leader = ASSERT_PTR(userdata); +static int machine_pidref(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) { + PidRef *pidref = ASSERT_PTR(userdata); _cleanup_(pidref_done) PidRef temp = PIDREF_NULL; int r; @@ -56,14 +56,14 @@ static int machine_leader(const char *name, sd_json_variant *variant, sd_json_di return r; if (temp.pid == 1) /* refuse PID 1 */ - return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid leader PID.", strna(name)); + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid PID.", strna(name)); /* When both leader and leaderProcessId are specified, they must be consistent with each other. */ - if (pidref_is_set(leader) && !pidref_equal(leader, &temp)) - return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' conflicts with already dispatched leader PID.", strna(name)); + if (pidref_is_set(pidref) && !pidref_equal(pidref, &temp)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' conflicts with already dispatched PID.", strna(name)); - pidref_done(leader); - *leader = TAKE_PIDREF(temp); + pidref_done(pidref); + *pidref = TAKE_PIDREF(temp); return 0; } @@ -128,18 +128,20 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink int r; static const sd_json_dispatch_field dispatch_table[] = { - { "name", SD_JSON_VARIANT_STRING, machine_name, offsetof(Machine, name), SD_JSON_MANDATORY }, - { "id", SD_JSON_VARIANT_STRING, sd_json_dispatch_id128, offsetof(Machine, id), 0 }, - { "service", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Machine, service), 0 }, - { "class", SD_JSON_VARIANT_STRING, dispatch_machine_class, offsetof(Machine, class), SD_JSON_MANDATORY }, - { "leader", _SD_JSON_VARIANT_TYPE_INVALID, machine_leader, offsetof(Machine, leader), SD_JSON_STRICT }, - { "leaderProcessId", SD_JSON_VARIANT_OBJECT, machine_leader, offsetof(Machine, leader), SD_JSON_STRICT }, - { "rootDirectory", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Machine, root_directory), 0 }, - { "ifIndices", SD_JSON_VARIANT_ARRAY, machine_ifindices, 0, 0 }, - { "vSockCid", _SD_JSON_VARIANT_TYPE_INVALID, machine_cid, offsetof(Machine, vsock_cid), 0 }, - { "sshAddress", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Machine, ssh_address), SD_JSON_STRICT }, - { "sshPrivateKeyPath", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Machine, ssh_private_key_path), 0 }, - { "allocateUnit", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(Machine, allocate_unit), 0 }, + { "name", SD_JSON_VARIANT_STRING, machine_name, offsetof(Machine, name), SD_JSON_MANDATORY }, + { "id", SD_JSON_VARIANT_STRING, sd_json_dispatch_id128, offsetof(Machine, id), 0 }, + { "service", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Machine, service), 0 }, + { "class", SD_JSON_VARIANT_STRING, dispatch_machine_class, offsetof(Machine, class), SD_JSON_MANDATORY }, + { "leader", _SD_JSON_VARIANT_TYPE_INVALID, machine_pidref, offsetof(Machine, leader), SD_JSON_STRICT }, + { "leaderProcessId", SD_JSON_VARIANT_OBJECT, machine_pidref, offsetof(Machine, leader), SD_JSON_STRICT }, + { "supervisor", _SD_JSON_VARIANT_TYPE_INVALID, machine_pidref, offsetof(Machine, supervisor), SD_JSON_STRICT }, + { "supervisorProcessId", SD_JSON_VARIANT_OBJECT, machine_pidref, offsetof(Machine, supervisor), SD_JSON_STRICT }, + { "rootDirectory", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Machine, root_directory), 0 }, + { "ifIndices", SD_JSON_VARIANT_ARRAY, machine_ifindices, 0, 0 }, + { "vSockCid", _SD_JSON_VARIANT_TYPE_INVALID, machine_cid, offsetof(Machine, vsock_cid), 0 }, + { "sshAddress", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Machine, ssh_address), SD_JSON_STRICT }, + { "sshPrivateKeyPath", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Machine, ssh_private_key_path), 0 }, + { "allocateUnit", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(Machine, allocate_unit), 0 }, VARLINK_DISPATCH_POLKIT_FIELD, {} }; @@ -168,6 +170,18 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink return r; } + if (!pidref_is_set(&machine->supervisor)) { + _cleanup_(pidref_done) PidRef client_pidref = PIDREF_NULL; + + r = varlink_get_peer_pidref(link, &client_pidref); + if (r < 0) + return r; + + /* If the client process is not the leader, then make it the supervisor */ + if (!pidref_equal(&client_pidref, &machine->leader)) + machine->supervisor = TAKE_PIDREF(client_pidref); + } + r = sd_varlink_get_peer_uid(link, &machine->uid); if (r < 0) return r; diff --git a/src/machine/machine.c b/src/machine/machine.c index 91c1450184..0a8db2dfa8 100644 --- a/src/machine/machine.c +++ b/src/machine/machine.c @@ -62,6 +62,7 @@ int machine_new(MachineClass class, const char *name, Machine **ret) { *m = (Machine) { .class = class, .leader = PIDREF_NULL, + .supervisor = PIDREF_NULL, .vsock_cid = VMADDR_CID_ANY, }; @@ -131,6 +132,9 @@ Machine* machine_free(Machine *m) { pidref_done(&m->leader); } + m->supervisor_pidfd_event_source = sd_event_source_disable_unref(m->supervisor_pidfd_event_source); + pidref_done(&m->supervisor); + sd_bus_message_unref(m->create_message); free(m->name); @@ -205,6 +209,13 @@ int machine_save(Machine *m) { fprintf(f, "LEADER_PIDFDID=%" PRIu64 "\n", m->leader.fd_id); } + if (pidref_is_set(&m->supervisor)) { + fprintf(f, "SUPERVISOR=" PID_FMT "\n", m->supervisor.pid); + (void) pidref_acquire_pidfd_id(&m->supervisor); + if (m->supervisor.fd_id != 0) + fprintf(f, "SUPERVISOR_PIDFDID=%" PRIu64 "\n", m->supervisor.fd_id); + } + if (m->class != _MACHINE_CLASS_INVALID) fprintf(f, "CLASS=%s\n", machine_class_to_string(m->class)); @@ -261,8 +272,41 @@ static void machine_unlink(Machine *m) { (void) unlink(m->state_file); } +static void parse_pid_and_pidfdid( + PidRef *pidref, + const char *pid, + const char *pidfdid, + const char *name) { + + int r; + + assert(pidref); + assert(name); + + pidref_done(pidref); + + if (!pid) + return; + r = pidref_set_pidstr(pidref, pid); + if (r < 0) + return (void) log_debug_errno(r, "Failed to set %s PID to '%s', ignoring: %m", name, pid); + + if (!pidfdid) + return; + uint64_t fd_id; + r = safe_atou64(pidfdid, &fd_id); + if (r < 0) + return (void) log_warning_errno(r, "Failed to parse %s pidfd ID, ignoring: %s", name, pidfdid); + (void) pidref_acquire_pidfd_id(pidref); + if (fd_id != pidref->fd_id) { + log_debug("PID of %s got recycled, ignoring.", name); + pidref_done(pidref); + } +} + int machine_load(Machine *m) { - _cleanup_free_ char *name = NULL, *realtime = NULL, *monotonic = NULL, *id = NULL, *leader = NULL, *leader_pidfdid = NULL, + _cleanup_free_ char *name = NULL, *realtime = NULL, *monotonic = NULL, *id = NULL, + *leader = NULL, *leader_pidfdid = NULL, *supervisor = NULL, *supervisor_pidfdid = NULL, *class = NULL, *netif = NULL, *vsock_cid = NULL, *uid = NULL; int r; @@ -281,6 +325,8 @@ int machine_load(Machine *m) { "ID", &id, "LEADER", &leader, "LEADER_PIDFDID", &leader_pidfdid, + "SUPERVISOR", &supervisor, + "SUPERVISOR_PIDFDID", &supervisor_pidfdid, "CLASS", &class, "REALTIME", &realtime, "MONOTONIC", &monotonic, @@ -300,26 +346,8 @@ int machine_load(Machine *m) { if (id) (void) sd_id128_from_string(id, &m->id); - pidref_done(&m->leader); - if (leader) { - r = pidref_set_pidstr(&m->leader, leader); - if (r < 0) - log_debug_errno(r, "Failed to set leader PID to '%s', ignoring: %m", leader); - else if (leader_pidfdid) { - uint64_t fd_id; - r = safe_atou64(leader_pidfdid, &fd_id); - if (r < 0) - log_warning_errno(r, "Failed to parse leader pidfd ID, ignoring: %s", leader_pidfdid); - else { - (void) pidref_acquire_pidfd_id(&m->leader); - - if (fd_id != m->leader.fd_id) { - log_debug("Leader PID got recycled, ignoring."); - pidref_done(&m->leader); - } - } - } - } + parse_pid_and_pidfdid(&m->leader, leader, leader_pidfdid, "leader"); + parse_pid_and_pidfdid(&m->supervisor, supervisor, supervisor_pidfdid, "supervisor"); if (class) { MachineClass c = machine_class_from_string(class); @@ -523,25 +551,35 @@ static int machine_dispatch_leader_pidfd(sd_event_source *s, int fd, unsigned re return 0; } -static int machine_watch_pidfd(Machine *m) { +static int machine_dispatch_supervisor_pidfd(sd_event_source *s, int fd, unsigned revents, void *userdata) { + Machine *m = ASSERT_PTR(userdata); + + m->supervisor_pidfd_event_source = sd_event_source_disable_unref(m->supervisor_pidfd_event_source); + machine_add_to_gc_queue(m); + + return 0; +} + +static int machine_watch_pidfd(Machine *m, PidRef *pidref, sd_event_source **source, sd_event_io_handler_t cb) { int r; assert(m); assert(m->manager); - assert(pidref_is_set(&m->leader)); - assert(!m->leader_pidfd_event_source); + assert(source); + assert(!*source); + assert(cb); - if (m->leader.fd < 0) + if (!pidref_is_set(pidref) || pidref->fd < 0) return 0; - /* If we have a pidfd for the leader, let's also track it for POLLIN, and GC the machine + /* If we have a pidfd for the leader or supervisor, let's also track it for POLLIN, and GC the machine * automatically if it dies */ - r = sd_event_add_io(m->manager->event, &m->leader_pidfd_event_source, m->leader.fd, EPOLLIN, machine_dispatch_leader_pidfd, m); + r = sd_event_add_io(m->manager->event, source, pidref->fd, EPOLLIN, cb, m); if (r < 0) return r; - (void) sd_event_source_set_description(m->leader_pidfd_event_source, "machine-pidfd"); + (void) sd_event_source_set_description(*source, "machine-pidfd"); return 0; } @@ -561,7 +599,11 @@ int machine_start(Machine *m, sd_bus_message *properties, sd_bus_error *error) { if (r < 0) return r; - r = machine_watch_pidfd(m); + r = machine_watch_pidfd(m, &m->leader, &m->leader_pidfd_event_source, machine_dispatch_leader_pidfd); + if (r < 0) + return r; + + r = machine_watch_pidfd(m, &m->supervisor, &m->supervisor_pidfd_event_source, machine_dispatch_supervisor_pidfd); if (r < 0) return r; @@ -598,6 +640,7 @@ int machine_stop(Machine *m) { return -EOPNOTSUPP; if (m->unit && !m->subgroup) { + /* If the machine runs as its own unit, then we'll terminate that */ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; char *job = NULL; @@ -606,7 +649,14 @@ int machine_stop(Machine *m) { return log_error_errno(r, "Failed to stop machine unit: %s", bus_error_message(&error, r)); free_and_replace(m->scope_job, job); - } + + } else if (pidref_is_set(&m->supervisor)) { + /* Otherwise, send a friendly SIGTERM to the supervisor */ + r = pidref_kill(&m->supervisor, SIGTERM); + if (r < 0) + return log_error_errno(r, "Failed to kill supervisor process " PID_FMT " of machine '%s': %m", m->supervisor.pid, m->name); + } else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Don't know how to terminate machine '%s'.", m->name); m->stopping = true; @@ -714,14 +764,23 @@ int machine_kill(Machine *m, KillWhom whom, int signo) { if (!IN_SET(m->class, MACHINE_VM, MACHINE_CONTAINER)) return -EOPNOTSUPP; - if (whom == KILL_LEADER) /* If we shall simply kill the leader, do so directly */ + switch (whom) { + + case KILL_LEADER: return pidref_kill(&m->leader, signo); - if (!m->unit) - return -ESRCH; + case KILL_SUPERVISOR: + return pidref_kill(&m->supervisor, signo); - /* Otherwise, make PID 1 do it for us, for the entire cgroup */ - return manager_kill_unit(m->manager, m->unit, m->subgroup, signo, /* error= */ NULL); + case KILL_ALL: + if (!m->unit) + return -ESRCH; + + return manager_kill_unit(m->manager, m->unit, m->subgroup, signo, /* error= */ NULL); + + default: + assert_not_reached(); + } } int machine_openpt(Machine *m, int flags, char **ret_peer) { @@ -1504,8 +1563,9 @@ static const char* const machine_state_table[_MACHINE_STATE_MAX] = { DEFINE_STRING_TABLE_LOOKUP(machine_state, MachineState); static const char* const kill_whom_table[_KILL_WHOM_MAX] = { - [KILL_LEADER] = "leader", - [KILL_ALL] = "all" + [KILL_LEADER] = "leader", + [KILL_SUPERVISOR] = "supervisor", + [KILL_ALL] = "all", }; DEFINE_STRING_TABLE_LOOKUP(kill_whom, KillWhom); diff --git a/src/machine/machine.h b/src/machine/machine.h index dddc0c8000..dda1612916 100644 --- a/src/machine/machine.h +++ b/src/machine/machine.h @@ -27,6 +27,7 @@ typedef enum MachineClass { typedef enum KillWhom { KILL_LEADER, + KILL_SUPERVISOR, KILL_ALL, _KILL_WHOM_MAX, _KILL_WHOM_INVALID = -EINVAL, @@ -50,8 +51,16 @@ typedef struct Machine { char *subgroup; char *scope_job; - PidRef leader; - sd_event_source *leader_pidfd_event_source; + /* Leader: the top-level process that encapsulates the machine itself. For containers that's PID 1, + * for VMs that's qemu or whatever process wraps the actual VM code. This process defines the runtime + * lifecycle of the machine. In case of containers we can use this reference to enter namespaces, + * send signals and so on. + * + * Supervisor: the process that supervises the machine, if there is any and if that process is + * responsible for a single machine. Sending SIGTERM to this process should (non-cooperatively) + * terminate the machine. */ + PidRef leader, supervisor; + sd_event_source *leader_pidfd_event_source, *supervisor_pidfd_event_source; dual_timestamp timestamp; diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c index 82c0addefb..72141b8116 100644 --- a/src/machine/machined-dbus.c +++ b/src/machine/machined-dbus.c @@ -233,7 +233,7 @@ static int method_create_or_register_machine( Machine **ret, sd_bus_error *error) { - _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + _cleanup_(pidref_done) PidRef leader_pidref = PIDREF_NULL, supervisor_pidref = PIDREF_NULL; const char *name, *service, *class, *root_directory; const int32_t *netif = NULL; MachineClass c; @@ -289,13 +289,23 @@ static int method_create_or_register_machine( return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Root directory must be empty or an absolute path"); if (leader == 0) { - r = bus_query_sender_pidref(message, &pidref); + /* If no PID is specified, the client is the leader */ + r = bus_query_sender_pidref(message, &leader_pidref); if (r < 0) return sd_bus_error_set_errnof(error, r, "Failed to pin client process: %m"); } else { - r = pidref_set_pid(&pidref, leader); + /* If a PID is specified that's the leader, but if the client process is different from it, than that's the supervisor */ + r = pidref_set_pid(&leader_pidref, leader); if (r < 0) return sd_bus_error_set_errnof(error, r, "Failed to pin process " PID_FMT ": %m", (pid_t) leader); + + _cleanup_(pidref_done) PidRef client_pidref = PIDREF_NULL; + r = bus_query_sender_pidref(message, &client_pidref); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to pin client process: %m"); + + if (!pidref_equal(&client_pidref, &leader_pidref)) + supervisor_pidref = TAKE_PIDREF(client_pidref); } if (hashmap_get(manager->machines, name)) @@ -332,7 +342,8 @@ static int method_create_or_register_machine( if (r < 0) return r; - m->leader = TAKE_PIDREF(pidref); + m->leader = TAKE_PIDREF(leader_pidref); + m->supervisor = TAKE_PIDREF(supervisor_pidref); m->class = c; m->id = id; m->uid = uid; diff --git a/src/machine/machined-varlink.c b/src/machine/machined-varlink.c index 7dac3cb0d2..52b1fc12d2 100644 --- a/src/machine/machined-varlink.c +++ b/src/machine/machined-varlink.c @@ -479,6 +479,7 @@ static int list_machine_one_and_maybe_read_metadata(sd_varlink *link, Machine *m JSON_BUILD_PAIR_STRING_NON_EMPTY("unit", m->unit), JSON_BUILD_PAIR_STRING_NON_EMPTY("subgroup", m->subgroup), SD_JSON_BUILD_PAIR_CONDITION(pidref_is_set(&m->leader), "leader", JSON_BUILD_PIDREF(&m->leader)), + SD_JSON_BUILD_PAIR_CONDITION(pidref_is_set(&m->supervisor), "supervisor", JSON_BUILD_PIDREF(&m->supervisor)), SD_JSON_BUILD_PAIR_CONDITION(dual_timestamp_is_set(&m->timestamp), "timestamp", JSON_BUILD_DUAL_TIMESTAMP(&m->timestamp)), JSON_BUILD_PAIR_UNSIGNED_NOT_EQUAL("vSockCid", m->vsock_cid, VMADDR_CID_ANY), JSON_BUILD_PAIR_STRING_NON_EMPTY("sshAddress", m->ssh_address), diff --git a/src/shared/varlink-io.systemd.Machine.c b/src/shared/varlink-io.systemd.Machine.c index ad7dff228f..d37f672d05 100644 --- a/src/shared/varlink-io.systemd.Machine.c +++ b/src/shared/varlink-io.systemd.Machine.c @@ -38,6 +38,10 @@ static SD_VARLINK_DEFINE_METHOD( SD_VARLINK_DEFINE_INPUT(leader, SD_VARLINK_INT, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("The leader PID as ProcessId structure. If both the leader and leaderProcessId parameters are specified they must reference the same process. Typically one would only specify one or the other however. It's generally recommended to specify leaderProcessId as it references a process in a robust way without risk of identifier recycling."), SD_VARLINK_DEFINE_INPUT_BY_TYPE(leaderProcessId, ProcessId, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("The supervisor PID as simple positive integer."), + SD_VARLINK_DEFINE_INPUT(supervisor, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("The supervisor PID as ProcessId structure. If both the supervisor and supervisorProcessId parameters are specified they must reference the same process. Typically only one or the other would be specified. It's generally recommended to specify supervisorProcessId as it references a process in a robust way without risk of identifier recycling."), + SD_VARLINK_DEFINE_INPUT_BY_TYPE(supervisorProcessId, ProcessId, SD_VARLINK_NULLABLE), SD_VARLINK_DEFINE_INPUT(rootDirectory, SD_VARLINK_STRING, SD_VARLINK_NULLABLE), SD_VARLINK_DEFINE_INPUT(ifIndices, SD_VARLINK_INT, SD_VARLINK_ARRAY|SD_VARLINK_NULLABLE), SD_VARLINK_DEFINE_INPUT(vSockCid, SD_VARLINK_INT, SD_VARLINK_NULLABLE), @@ -58,7 +62,7 @@ static SD_VARLINK_DEFINE_METHOD( static SD_VARLINK_DEFINE_METHOD( Kill, VARLINK_DEFINE_MACHINE_LOOKUP_AND_POLKIT_INPUT_FIELDS, - SD_VARLINK_FIELD_COMMENT("Identifier that specifies what precisely to send the signal to (either 'leader' or 'all')."), + SD_VARLINK_FIELD_COMMENT("Identifier that specifies what precisely to send the signal to (either 'leader', 'supervisor', or 'all')."), SD_VARLINK_DEFINE_INPUT(whom, SD_VARLINK_STRING, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("Numeric UNIX signal integer."), SD_VARLINK_DEFINE_INPUT(signal, SD_VARLINK_INT, 0)); @@ -79,6 +83,8 @@ static SD_VARLINK_DEFINE_METHOD_FULL( SD_VARLINK_DEFINE_OUTPUT(class, SD_VARLINK_STRING, 0), SD_VARLINK_FIELD_COMMENT("Leader process PID of this machine"), SD_VARLINK_DEFINE_OUTPUT_BY_TYPE(leader, ProcessId, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("Supervisor process PID of this machine"), + SD_VARLINK_DEFINE_OUTPUT_BY_TYPE(supervisor, ProcessId, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("Root directory of this machine, if known, relative to host file system"), SD_VARLINK_DEFINE_OUTPUT(rootDirectory, SD_VARLINK_STRING, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("The service manager unit this machine resides in"), From 74546a7e292228f21511d8ea49d781dbea694234 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 9 Jul 2025 09:35:12 +0200 Subject: [PATCH 07/22] machined: explicitly watch machine cgroup for getting empty --- src/machine/machine.c | 52 +++++++++++++++++++++++++++++++++++++ src/machine/machine.h | 3 +++ src/machine/machined-dbus.c | 13 ++++++++++ 3 files changed, 68 insertions(+) diff --git a/src/machine/machine.c b/src/machine/machine.c index 0a8db2dfa8..26252ff9cd 100644 --- a/src/machine/machine.c +++ b/src/machine/machine.c @@ -137,6 +137,8 @@ Machine* machine_free(Machine *m) { sd_bus_message_unref(m->create_message); + m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source); + free(m->name); free(m->state_file); @@ -146,6 +148,7 @@ Machine* machine_free(Machine *m) { free(m->unit); free(m->subgroup); free(m->scope_job); + free(m->cgroup); free(m->netif); free(m->ssh_address); @@ -584,6 +587,43 @@ static int machine_watch_pidfd(Machine *m, PidRef *pidref, sd_event_source **sou return 0; } +static int machine_dispatch_cgroup_empty(sd_event_source *s, const struct inotify_event *event, void *userdata) { + Machine *m = ASSERT_PTR(userdata); + int r; + + assert(m->cgroup); + + r = cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, m->cgroup); + if (r < 0) + return log_error_errno(r, "Failed to determine if cgroup '%s' is empty: %m", m->cgroup); + + if (r > 0) + machine_add_to_gc_queue(m); + + return 0; +} + +static int machine_watch_cgroup(Machine *m) { + int r; + + assert(m); + assert(!m->cgroup_empty_event_source); + + if (!m->cgroup) + return 0; + + _cleanup_free_ char *p = NULL; + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup, "cgroup.events", &p); + if (r < 0) + return log_error_errno(r, "Failed to get cgroup path for cgroup '%s': %m", m->cgroup); + + r = sd_event_add_inotify(m->manager->event, &m->cgroup_empty_event_source, p, IN_MODIFY, machine_dispatch_cgroup_empty, m); + if (r < 0) + return log_error_errno(r, "Failed to watch %s events: %m", p); + + return 0; +} + int machine_start(Machine *m, sd_bus_message *properties, sd_bus_error *error) { int r; @@ -607,6 +647,10 @@ int machine_start(Machine *m, sd_bus_message *properties, sd_bus_error *error) { if (r < 0) return r; + r = machine_watch_cgroup(m); + if (r < 0) + return r; + /* Create cgroup */ r = machine_ensure_scope(m, properties, error); if (r < 0) @@ -728,6 +772,14 @@ bool machine_may_gc(Machine *m, bool drop_not_started) { return false; } + if (m->cgroup) { + r = cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, m->cgroup); + if (IN_SET(r, 0, -ENOENT)) + return true; + if (r < 0) + log_debug_errno(r, "Failed to determine if cgroup '%s' is empty, ignoring: %m", m->cgroup); + } + return true; } diff --git a/src/machine/machine.h b/src/machine/machine.h index dda1612916..8f1a044f7a 100644 --- a/src/machine/machine.h +++ b/src/machine/machine.h @@ -50,6 +50,7 @@ typedef struct Machine { char *unit; char *subgroup; char *scope_job; + char *cgroup; /* Leader: the top-level process that encapsulates the machine itself. For containers that's PID 1, * for VMs that's qemu or whatever process wraps the actual VM code. This process defines the runtime @@ -64,6 +65,8 @@ typedef struct Machine { dual_timestamp timestamp; + sd_event_source *cgroup_empty_event_source; + bool in_gc_queue:1; bool started:1; bool stopping:1; diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c index 72141b8116..883c8c1338 100644 --- a/src/machine/machined-dbus.c +++ b/src/machine/machined-dbus.c @@ -442,6 +442,19 @@ static int method_register_machine_internal(sd_bus_message *message, bool read_n goto fail; } + if (!empty_or_root(m->subgroup)) { + /* If this is not a top-level cgroup, then we need the cgroup path to be able to watch when + * it empties */ + + r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, &m->leader, &m->cgroup); + if (r < 0) { + r = sd_bus_error_set_errnof(error, r, + "Failed to determine cgroup of process "PID_FMT" : %m", + m->leader.pid); + goto fail; + } + } + r = machine_start(m, NULL, error); if (r < 0) goto fail; From 596c596d0926b4c3bc183335b3ce626cc1ac4840 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 9 Jul 2025 09:35:25 +0200 Subject: [PATCH 08/22] machined: add a bit more debug logging --- src/machine/machine.c | 5 +++++ src/machine/machined-core.c | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/machine/machine.c b/src/machine/machine.c index 26252ff9cd..e127ddcb0e 100644 --- a/src/machine/machine.c +++ b/src/machine/machine.c @@ -33,6 +33,7 @@ #include "path-util.h" #include "process-util.h" #include "serialize.h" +#include "signal-util.h" #include "socket-util.h" #include "special.h" #include "stdio-util.h" @@ -680,6 +681,8 @@ int machine_stop(Machine *m) { assert(m); + log_debug("Stopping machine '%s'.", m->name); + if (!IN_SET(m->class, MACHINE_CONTAINER, MACHINE_VM)) return -EOPNOTSUPP; @@ -813,6 +816,8 @@ MachineState machine_get_state(Machine *s) { int machine_kill(Machine *m, KillWhom whom, int signo) { assert(m); + log_debug("Killing machine '%s' (%s) with signal %s.", m->name, kill_whom_to_string(whom), signal_to_string(signo)); + if (!IN_SET(m->class, MACHINE_VM, MACHINE_CONTAINER)) return -EOPNOTSUPP; diff --git a/src/machine/machined-core.c b/src/machine/machined-core.c index 075ea4a803..61078d51ec 100644 --- a/src/machine/machined-core.c +++ b/src/machine/machined-core.c @@ -162,13 +162,16 @@ void manager_gc(Manager *m, bool drop_not_started) { /* First, if we are not closing yet, initiate stopping */ if (machine_may_gc(machine, drop_not_started) && - machine_get_state(machine) != MACHINE_CLOSING) + machine_get_state(machine) != MACHINE_CLOSING) { + log_debug("Stopping machine '%s' due to GC.", machine->name); machine_stop(machine); + } /* Now, the stop probably made this referenced * again, but if it didn't, then it's time to let it * go entirely. */ if (machine_may_gc(machine, drop_not_started)) { + log_debug("Finalizing machine '%s' due to GC.", machine->name); machine_finalize(machine); machine_free(machine); } From ca1daebdd6a93f57c4a70936ef6e02d0306ce69b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 2 Jul 2025 13:12:06 +0200 Subject: [PATCH 09/22] machinectl: output supervisor info in status output --- src/machine/machinectl.c | 59 ++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/src/machine/machinectl.c b/src/machine/machinectl.c index d31ef407f4..9eddc53990 100644 --- a/src/machine/machinectl.c +++ b/src/machine/machinectl.c @@ -28,9 +28,9 @@ #include "cgroup-util.h" #include "edit-util.h" #include "env-util.h" -#include "format-util.h" #include "format-ifname.h" #include "format-table.h" +#include "format-util.h" #include "hostname-util.h" #include "import-util.h" #include "in-addr-util.h" @@ -46,6 +46,7 @@ #include "parse-argument.h" #include "parse-util.h" #include "path-util.h" +#include "pidref.h" #include "polkit-agent.h" #include "pretty-print.h" #include "process-util.h" @@ -511,17 +512,47 @@ typedef struct MachineStatusInfo { const char *subgroup; const char *root_directory; pid_t leader; + uint64_t leader_pidfdid; + pid_t supervisor; + uint64_t supervisor_pidfdid; struct dual_timestamp timestamp; int *netif; size_t n_netif; uid_t uid; } MachineStatusInfo; -static void machine_status_info_clear(MachineStatusInfo *info) { - if (info) { - free(info->netif); - zero(*info); +static void machine_status_info_done(MachineStatusInfo *info) { + if (!info) + return; + + free(info->netif); + zero(*info); +} + +static void print_process_info(const char *field, pid_t pid, uint64_t pidfdid) { + int r; + + assert(field); + + if (pid <= 0) + return; + + printf("%s: " PID_FMT, field, pid); + + _cleanup_(pidref_done) PidRef pr = PIDREF_NULL; + r = pidref_set_pid_and_pidfd_id(&pr, pid, pidfdid); + if (r < 0) + log_debug_errno(r, "Failed to acquire reference to process, ignoring: %m"); + else { + _cleanup_free_ char *t = NULL; + r = pidref_get_comm(&pr, &t); + if (r < 0) + log_debug_errno(r, "Failed to acquire name of process, ignoring: %m"); + else + printf(" (%s)", t); } + + putchar('\n'); } static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) { @@ -546,17 +577,8 @@ static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) { else if (!isempty(s2)) printf("\t Since: %s\n", s2); - if (i->leader > 0) { - _cleanup_free_ char *t = NULL; - - printf("\t Leader: %u", (unsigned) i->leader); - - (void) pid_get_comm(i->leader, &t); - if (t) - printf(" (%s)", t); - - putchar('\n'); - } + print_process_info("\t Leader", i->leader, i->leader_pidfdid); + print_process_info("\t Superv.", i->supervisor, i->supervisor_pidfdid); if (i->service) { printf("\t Service: %s", i->service); @@ -662,6 +684,9 @@ static int show_machine_info(const char *verb, sd_bus *bus, const char *path, bo { "Subgroup", "s", NULL, offsetof(MachineStatusInfo, subgroup) }, { "RootDirectory", "s", NULL, offsetof(MachineStatusInfo, root_directory) }, { "Leader", "u", NULL, offsetof(MachineStatusInfo, leader) }, + { "LeaderPIDFDId", "t", NULL, offsetof(MachineStatusInfo, leader_pidfdid) }, + { "Supervisor", "u", NULL, offsetof(MachineStatusInfo, supervisor) }, + { "SupervisorPIDFDId", "t", NULL, offsetof(MachineStatusInfo, supervisor_pidfdid) }, { "Timestamp", "t", NULL, offsetof(MachineStatusInfo, timestamp.realtime) }, { "TimestampMonotonic", "t", NULL, offsetof(MachineStatusInfo, timestamp.monotonic) }, { "Id", "ay", bus_map_id128, offsetof(MachineStatusInfo, id) }, @@ -672,7 +697,7 @@ static int show_machine_info(const char *verb, sd_bus *bus, const char *path, bo _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; - _cleanup_(machine_status_info_clear) MachineStatusInfo info = {}; + _cleanup_(machine_status_info_done) MachineStatusInfo info = {}; int r; assert(verb); From f2f26f1527529b1ea7dcb0dba85456ac98800627 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 May 2025 22:04:56 +0200 Subject: [PATCH 10/22] nspawn: reorganize scope allocation/registration logic This cleans up allocation of a scope unit for the container: when invoked in user context we'll now allocate a scope through the per-user service manager instead of the per-system manager. This makes a ton more sense, since it's the user that invokes things after all. And given that machined now can register containers in the user manager there's nothing stopping us to clean this up. Note that this means we'll connect to two busses if run unpriv: once to the per-user bus to allocate the scope unit, and once to the per-system bus to register it with machined. --- src/nspawn/nspawn.c | 136 ++++++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 48 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 4ccfab8a88..805dd91389 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1667,11 +1667,6 @@ static int verify_arguments(void) { if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) arg_read_only = true; - if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) - /* Save the user from accidentally registering either user-$SESSION.scope or user@.service. - * The latter is not technically a user session, but we don't need to labour the point. */ - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session."); - if (arg_directory && arg_image) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined."); @@ -5136,7 +5131,6 @@ static int run_container( _cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; - _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; _cleanup_free_ uid_t *bind_user_uid = NULL; size_t n_bind_user_uid = 0; ContainerStatus container_status = 0; @@ -5459,19 +5453,35 @@ static int run_container( return r; } - if (arg_register || !arg_keep_unit) { - if (arg_privileged || arg_register) - r = sd_bus_default_system(&bus); - else - r = sd_bus_default_user(&bus); + /* Registration always happens on the system bus */ + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *system_bus = NULL; + if (arg_register || arg_privileged) { + r = sd_bus_default_system(&system_bus); if (r < 0) - return log_error_errno(r, "Failed to open bus: %m"); + return log_error_errno(r, "Failed to open system bus: %m"); - r = sd_bus_set_close_on_exit(bus, false); + r = sd_bus_set_close_on_exit(system_bus, false); if (r < 0) return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m"); - (void) sd_bus_set_allow_interactive_authorization(bus, arg_ask_password); + (void) sd_bus_set_allow_interactive_authorization(system_bus, arg_ask_password); + } + + /* Scope allocation happens on the user bus if we are unpriv, otherwise system bus. */ + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *user_bus = NULL; + _cleanup_(sd_bus_unrefp) sd_bus *runtime_bus = NULL; + if (arg_privileged) + runtime_bus = sd_bus_ref(system_bus); + else { + r = sd_bus_default_user(&user_bus); + if (r < 0) + return log_error_errno(r, "Failed to open user bus: %m"); + + r = sd_bus_set_close_on_exit(user_bus, false); + if (r < 0) + return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m"); + + runtime_bus = sd_bus_ref(user_bus); } if (!arg_keep_unit) { @@ -5480,7 +5490,7 @@ static int run_container( * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */ r = sd_bus_match_signal_async( - bus, + runtime_bus, /* ret= */ NULL, "org.freedesktop.systemd1", /* path= */ NULL, @@ -5493,11 +5503,46 @@ static int run_container( return log_error_errno(r, "Failed to request RequestStop match: %m"); } + if (arg_keep_unit) { + /* If we are not supposed to allocate a unit, then let's move the process now, so that we can + * register things while being in the right cgroup location already. Otherwise, let's move + * the process later, once we have unit and hence cgroup. */ + r = create_subcgroup( + pid, + arg_keep_unit, + arg_uid_shift, + userns_fd, + arg_userns_mode); + if (r < 0) + return r; + } + + bool scope_allocated = false; + if (!arg_keep_unit && (!arg_register || !arg_privileged)) { + AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD; + r = allocate_scope( + runtime_bus, + arg_machine, + pid, + arg_slice, + arg_custom_mounts, arg_n_custom_mounts, + arg_kill_signal, + arg_property, + arg_property_message, + arg_start_mode, + flags); + if (r < 0) + return r; + + scope_allocated = true; + } + + bool registered = false; if (arg_register) { RegisterMachineFlags flags = 0; - SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit); + SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit || !arg_privileged); r = register_machine( - bus, + system_bus, arg_machine, pid, arg_directory, @@ -5514,33 +5559,22 @@ static int run_container( if (r < 0) return r; - } else if (!arg_keep_unit) { - AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD; - r = allocate_scope( - bus, - arg_machine, - pid, - arg_slice, - arg_custom_mounts, arg_n_custom_mounts, - arg_kill_signal, - arg_property, - arg_property_message, - arg_start_mode, - flags); - if (r < 0) - return r; + registered = true; + } - } else if (arg_slice || arg_property) + if (arg_keep_unit && (arg_slice || arg_property)) log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect."); - r = create_subcgroup( - pid, - arg_keep_unit, - arg_uid_shift, - userns_fd, - arg_userns_mode); - if (r < 0) - return r; + if (!arg_keep_unit) { + r = create_subcgroup( + pid, + arg_keep_unit, + arg_uid_shift, + userns_fd, + arg_userns_mode); + if (r < 0) + return r; + } /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and * that the child can now hand over control to the code to run inside the container. */ @@ -5561,10 +5595,16 @@ static int run_container( (void) sd_event_set_watchdog(event, true); - if (bus) { - r = sd_bus_attach_event(bus, event, 0); + if (system_bus) { + r = sd_bus_attach_event(system_bus, event, 0); if (r < 0) - return log_error_errno(r, "Failed to attach bus to event loop: %m"); + return log_error_errno(r, "Failed to attach system bus to event loop: %m"); + } + + if (user_bus) { + r = sd_bus_attach_event(user_bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach user bus to event loop: %m"); } r = setup_notify_parent(event, notify_socket, pid, ¬ify_event_source); @@ -5710,8 +5750,8 @@ static int run_container( return log_error_errno(r, "Failed to run event loop: %m"); /* Kill if it is not dead yet anyway */ - if (!arg_register && !arg_keep_unit && bus) - terminate_scope(bus, arg_machine); + if (scope_allocated) + terminate_scope(runtime_bus, arg_machine); /* Normally redundant, but better safe than sorry */ (void) pidref_kill(pid, SIGKILL); @@ -5731,8 +5771,8 @@ static int run_container( r = wait_for_container(pid, &container_status); /* Tell machined that we are gone. */ - if (arg_register && bus) - (void) unregister_machine(bus, arg_machine); + if (registered) + (void) unregister_machine(system_bus, arg_machine); if (r < 0) /* We failed to wait for the container, or the container exited abnormally. */ From f63ca4fc1489590b1d70e8dba13d63762f7eb5b5 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Jun 2025 10:46:37 +0200 Subject: [PATCH 11/22] nspawn: slightly beef up READY= logic in nspawn Let's also send out a STATUS= message when we get READY=1 if it didn't come with a STATUS= message itself. Also, let's initially say the container is "started", and only once the READY=1 is seen claim it was "running". --- src/nspawn/nspawn.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 805dd91389..8f18d2fc60 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -4612,15 +4612,18 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r log_debug("Got sd_notify() message: %s", strnull(joined)); } + char *status = strv_find_startswith(tags, "STATUS="); + if (status) + (void) sd_notifyf(/* unset_environment= */ false, "STATUS=Container running: %s", status); + if (strv_contains(tags, "READY=1")) { - r = sd_notify(false, "READY=1\n"); + r = sd_notify(/* unset_environment= */ false, "READY=1\n"); if (r < 0) log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); - } - char *p = strv_find_startswith(tags, "STATUS="); - if (p) - (void) sd_notifyf(false, "STATUS=Container running: %s", p); + if (!status) + (void) sd_notifyf(/* unset_environment= */ false, "STATUS=Container running."); + } return 0; } @@ -5632,11 +5635,11 @@ static int run_container( * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */ etc_passwd_lock = safe_close(etc_passwd_lock); - (void) sd_notifyf(false, - "STATUS=Container running.\n" + (void) sd_notifyf(/* unset_environment= */ false, + "STATUS=Container started.\n" "X_NSPAWN_LEADER_PID=" PID_FMT, pid->pid); if (!arg_notify_ready) { - r = sd_notify(false, "READY=1\n"); + r = sd_notify(/* unset_environment= */ false, "READY=1\n"); if (r < 0) log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); } From 0c250b39192dc44412478dba33ba14634cf21db4 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 9 Jul 2025 09:36:25 +0200 Subject: [PATCH 12/22] nspawn: tweak logging/notifications when processing exit requests --- src/nspawn/nspawn.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 8f18d2fc60..e21d0ca29b 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -40,6 +40,7 @@ #include "common-signal.h" #include "copy.h" #include "cpu-set-util.h" +#include "daemon-util.h" #include "dev-setup.h" #include "devnum-util.h" #include "discover-image.h" @@ -2955,13 +2956,19 @@ static int wait_for_container(PidRef *pid, ContainerStatus *container) { static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { PidRef *pid = ASSERT_PTR(userdata); + assert(si); + if (pidref_is_set(pid)) if (pidref_kill(pid, arg_kill_signal) >= 0) { - log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination."); + log_info("Trying to halt container by sending %s to container PID 1. Send SIGTERM again to trigger immediate termination.", + signal_to_string(si->ssi_signo)); sd_event_source_set_userdata(s, NULL); + sd_notify(/* unset_environment= */ false, NOTIFY_STOPPING_MESSAGE); return 0; } + log_debug("Got %s, exiting.", signal_to_string(si->ssi_signo)); + sd_event_exit(sd_event_source_get_event(s), 0); return 0; } From 6ef1fc6d02d2cf3be48dc5b62e12a4e95a86b679 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 11 Jul 2025 14:25:40 +0200 Subject: [PATCH 13/22] nspawn: properly order include of constants.h --- src/nspawn/nspawn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index e21d0ca29b..64b5ca87a5 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -12,7 +12,6 @@ #include #include #include -#include "constants.h" #if HAVE_SELINUX #include @@ -38,6 +37,7 @@ #include "cgroup-util.h" #include "chase.h" #include "common-signal.h" +#include "constants.h" #include "copy.h" #include "cpu-set-util.h" #include "daemon-util.h" From 6dc6e6459bb4dba7cd88c4f6688b1aea9159c9d2 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 15 May 2025 11:00:17 +0200 Subject: [PATCH 14/22] vmspawn: use VM leader PID not vmspawn PID to register machine Let's make vmspawn machine registration more like nspawn machine registration, and register the payload, not vmspawn/nspawn itself. --- src/vmspawn/vmspawn-register.c | 8 ++++-- src/vmspawn/vmspawn-register.h | 1 + src/vmspawn/vmspawn.c | 52 +++++++++++++++++----------------- 3 files changed, 33 insertions(+), 28 deletions(-) diff --git a/src/vmspawn/vmspawn-register.c b/src/vmspawn/vmspawn-register.c index ae7edd0fdb..d9925f171d 100644 --- a/src/vmspawn/vmspawn-register.c +++ b/src/vmspawn/vmspawn-register.c @@ -10,7 +10,9 @@ #include "bus-error.h" #include "bus-locator.h" #include "errno-util.h" +#include "json-util.h" #include "log.h" +#include "pidref.h" #include "process-util.h" #include "socket-util.h" #include "string-util.h" @@ -23,6 +25,7 @@ int register_machine( const char *machine_name, sd_id128_t uuid, const char *service, + const PidRef *pidref, const char *directory, unsigned cid, const char *address, @@ -54,7 +57,7 @@ int register_machine( SD_BUS_MESSAGE_APPEND_ID128(uuid), service, "vm", - (uint32_t) getpid_cached(), + (uint32_t) (pidref_is_set(pidref) ? pidref->pid : 0), strempty(directory)); if (r < 0) return log_error_errno(r, "Failed to register machine: %s", bus_error_message(&error, r)); @@ -77,7 +80,8 @@ int register_machine( SD_JSON_BUILD_PAIR_CONDITION(!!address, "sshAddress", SD_JSON_BUILD_STRING(address)), SD_JSON_BUILD_PAIR_CONDITION(!!key_path, "sshPrivateKeyPath", SD_JSON_BUILD_STRING(key_path)), SD_JSON_BUILD_PAIR_CONDITION(isatty_safe(STDIN_FILENO), "allowInteractiveAuthentication", SD_JSON_BUILD_BOOLEAN(true)), - SD_JSON_BUILD_PAIR_CONDITION(!keep_unit, "allocateUnit", SD_JSON_BUILD_BOOLEAN(true))); + SD_JSON_BUILD_PAIR_CONDITION(!keep_unit, "allocateUnit", SD_JSON_BUILD_BOOLEAN(true)), + SD_JSON_BUILD_PAIR_CONDITION(pidref_is_set(pidref), "leaderProcessId", JSON_BUILD_PIDREF(pidref))); } int unregister_machine(sd_bus *bus, const char *machine_name) { diff --git a/src/vmspawn/vmspawn-register.h b/src/vmspawn/vmspawn-register.h index 58cae542ce..2adfabdc03 100644 --- a/src/vmspawn/vmspawn-register.h +++ b/src/vmspawn/vmspawn-register.h @@ -7,6 +7,7 @@ int register_machine( const char *machine_name, sd_id128_t uuid, const char *service, + const PidRef *pidref, const char *directory, unsigned cid, const char *address, diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c index 0ddc1cdb5a..bf16056b15 100644 --- a/src/vmspawn/vmspawn.c +++ b/src/vmspawn/vmspawn.c @@ -2326,34 +2326,8 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { log_debug("Executing: %s", joined); } - if (arg_register) { - char vm_address[STRLEN("vsock/") + DECIMAL_STR_MAX(unsigned)]; - - xsprintf(vm_address, "vsock/%u", child_cid); - r = register_machine( - bus, - arg_machine, - arg_uuid, - "systemd-vmspawn", - arg_directory, - child_cid, - child_cid != VMADDR_CID_ANY ? vm_address : NULL, - ssh_private_key_path, - arg_keep_unit); - if (r < 0) - return r; - } - assert_se(sigprocmask_many(SIG_BLOCK, /* ret_old_mask=*/ NULL, SIGCHLD) >= 0); - _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; - _cleanup_(sd_event_unrefp) sd_event *event = NULL; - r = sd_event_new(&event); - if (r < 0) - return log_error_errno(r, "Failed to get default event source: %m"); - - (void) sd_event_set_watchdog(event, true); - _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL; r = pidref_safe_fork_full( @@ -2386,6 +2360,32 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { child_vsock_fd = safe_close(child_vsock_fd); tap_fd = safe_close(tap_fd); + if (arg_register) { + char vm_address[STRLEN("vsock/") + DECIMAL_STR_MAX(unsigned)]; + xsprintf(vm_address, "vsock/%u", child_cid); + r = register_machine( + bus, + arg_machine, + arg_uuid, + "systemd-vmspawn", + &child_pidref, + arg_directory, + child_cid, + child_cid != VMADDR_CID_ANY ? vm_address : NULL, + ssh_private_key_path, + arg_keep_unit); + if (r < 0) + return r; + } + + _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + r = sd_event_new(&event); + if (r < 0) + return log_error_errno(r, "Failed to get default event source: %m"); + + (void) sd_event_set_watchdog(event, true); + int exit_status = INT_MAX; if (use_vsock) { r = setup_notify_parent(event, notify_sock_fd, &exit_status, ¬ify_event_source); From 3cfa7826d281d27a7aba3fe4c08bdcf54ddfb89b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 12 Jun 2025 11:48:37 +0200 Subject: [PATCH 15/22] vmspawn: spawn polkit during registration phase Just like in nspawn, there's a chance we need to PK authenticate the registration, hence let's spawn off the agent for that during that phase, and terminate it once we don't need it anymore. --- src/vmspawn/vmspawn.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c index bf16056b15..94453356b6 100644 --- a/src/vmspawn/vmspawn.c +++ b/src/vmspawn/vmspawn.c @@ -52,6 +52,7 @@ #include "path-lookup.h" #include "path-util.h" #include "pidref.h" +#include "polkit-agent.h" #include "pretty-print.h" #include "process-util.h" #include "ptyfwd.h" @@ -1546,6 +1547,8 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { const char *accel, *shm; int r; + polkit_agent_open(); + if (arg_privileged) r = sd_bus_default_system(&bus); else @@ -2378,6 +2381,11 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { return r; } + /* All operations that might need Polkit authorizations (i.e. machine registration, netif + * acquisition, …) are complete now, get rid of the agent again, so that we retain exclusive control + * of the TTY from now on. */ + polkit_agent_close(); + _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; r = sd_event_new(&event); From 0fc45c8d20ad46ab9be0d8f29b16e606e0dd44ca Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Jun 2025 10:29:01 +0200 Subject: [PATCH 16/22] vmspawn: substantially beef up cgroup logic, to match more closely what nspawn does This beefs up the cgroup logic, adding --slice=, --property= to vmspawn the same way it already exists in nspawn. There are a bunch of differences though: we don't delegate the cgroup access in the allocated unit (since qemu wouldn't need that), and we do registration via varlink not dbus. Hence, while this follows a similar logic now, it differs in a lot of details. This makes in particular one change: when invoked on the command line we'll only add the qemu instance to the allocated scope, not the vmspawn process itself (this follows more closely how nspawn does this where only the container payload has its scope, not nspawn itself). This is quite tricky to implement: unlike in nspawn we have auxiliary services to start, with depencies to the scope. This means we need to start the scope early, so that we know the scope's name. But the command line to invoke is only assembled from the data we learn about the auxiliary services, hence much later. To addres we'll now fork off the child that eventually will become early, then move it to a scope, prepare the cmdline and then very late send the cmdline (and the fds we want to pass) to the prepared child, which then execs it. --- man/systemd-vmspawn.xml | 25 +++ src/vmspawn/vmspawn-scope.c | 115 ++++++++-- src/vmspawn/vmspawn-scope.h | 5 +- src/vmspawn/vmspawn.c | 430 ++++++++++++++++++++++++++++++------ 4 files changed, 492 insertions(+), 83 deletions(-) diff --git a/man/systemd-vmspawn.xml b/man/systemd-vmspawn.xml index cbf7d20e64..304bbb44a3 100644 --- a/man/systemd-vmspawn.xml +++ b/man/systemd-vmspawn.xml @@ -343,6 +343,30 @@ Property Options + + + + + Make the VM part of the specified slice, instead of the default + machine.slice. This applies only if the machine is run in its own scope unit, + i.e. if is not used. + + + + + + + + + Set a unit property on the scope unit to register for the machine. This applies + only if the machine is run in its own scope unit, i.e. if is not + used. Takes unit property assignments in the same format as systemctl + set-property. This is useful to set memory limits and similar for the VM. + + + + + @@ -532,6 +556,7 @@ + diff --git a/src/vmspawn/vmspawn-scope.c b/src/vmspawn/vmspawn-scope.c index 624bd7730d..2aa2aaa232 100644 --- a/src/vmspawn/vmspawn-scope.c +++ b/src/vmspawn/vmspawn-scope.c @@ -12,11 +12,40 @@ #include "pidref.h" #include "random-util.h" #include "socket-util.h" +#include "special.h" #include "string-util.h" #include "strv.h" +#include "unit-def.h" +#include "unit-name.h" #include "vmspawn-scope.h" -int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidfd, char **ret_scope) { +static int append_controller_property(sd_bus *bus, sd_bus_message *m) { + const char *unique; + int r; + + assert(bus); + assert(m); + + r = sd_bus_get_unique_name(bus, &unique); + if (r < 0) + return log_error_errno(r, "Failed to get unique name: %m"); + + r = sd_bus_message_append(m, "(sv)", "Controller", "s", unique); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +int allocate_scope( + sd_bus *bus, + const char *machine_name, + const PidRef *pid, + const char *slice, + char **properties, + bool allow_pidfd, + char **ret_scope) { + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL; @@ -33,8 +62,9 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf if (r < 0) return log_error_errno(r, "Could not watch job: %m"); - if (asprintf(&scope, "machine-%"PRIu64"-%s.scope", random_u64(), machine_name) < 0) - return log_oom(); + r = unit_name_mangle_with_suffix(machine_name, "as machine name", /* flags= */ 0, ".scope", &scope); + if (r < 0) + return log_error_errno(r, "Failed to mangle scope name: %m"); description = strjoin("Virtual Machine ", machine_name); if (!description) @@ -53,22 +83,26 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf if (r < 0) return bus_log_create_error(r); - r = sd_bus_message_append(m, "(sv)(sv)(sv)", - "Description", "s", description, - "AddRef", "b", 1, - "CollectMode", "s", "inactive-or-failed"); + r = bus_append_scope_pidref(m, pid, allow_pidfd); if (r < 0) return bus_log_create_error(r); - _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - r = pidref_set_self(&pidref); - if (r < 0) - return log_error_errno(r, "Failed to allocate PID reference: %m"); - - r = bus_append_scope_pidref(m, &pidref, allow_pidfd); + r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)", + "Description", "s", description, + "CollectMode", "s", "inactive-or-failed", + "AddRef", "b", 1, + "Slice", "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice); if (r < 0) return bus_log_create_error(r); + r = append_controller_property(bus, m); + if (r < 0) + return r; + + r = bus_append_unit_property_assignment_many(m, UNIT_SCOPE, properties); + if (r < 0) + return r; + r = sd_bus_message_close_container(m); if (r < 0) return bus_log_create_error(r); @@ -87,7 +121,14 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf * doesn't support PIDFDs yet, let's try without. */ if (allow_pidfd && sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY, SD_BUS_ERROR_PROPERTY_READ_ONLY)) - return start_transient_scope(bus, machine_name, false, ret_scope); + return allocate_scope( + bus, + machine_name, + pid, + slice, + properties, + /* allow_pidfd= */ false, + ret_scope); return log_error_errno(r, "Failed to start transient scope unit: %s", bus_error_message(&error, r)); } @@ -96,7 +137,11 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf if (r < 0) return bus_log_parse_error(r); - r = bus_wait_for_jobs_one(w, object, /* quiet */ false, NULL); + r = bus_wait_for_jobs_one( + w, + object, + BUS_WAIT_JOBS_LOG_ERROR, + /* extra_args= */ NULL); if (r < 0) return r; @@ -106,6 +151,46 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf return 0; } +int terminate_scope( + sd_bus *bus, + const char *machine_name) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *scope = NULL; + int r; + + r = unit_name_mangle_with_suffix(machine_name, "to terminate", /* flags= */ 0, ".scope", &scope); + if (r < 0) + return log_error_errno(r, "Failed to mangle scope name: %m"); + + r = bus_call_method(bus, bus_systemd_mgr, "AbandonScope", &error, /* ret_reply= */ NULL, "s", scope); + if (r < 0) { + log_debug_errno(r, "Failed to abandon scope '%s', ignoring: %s", scope, bus_error_message(&error, r)); + sd_bus_error_free(&error); + } + + r = bus_call_method( + bus, + bus_systemd_mgr, + "KillUnit", + &error, + NULL, + "ssi", + scope, + "all", + (int32_t) SIGKILL); + if (r < 0) { + log_debug_errno(r, "Failed to SIGKILL scope '%s', ignoring: %s", scope, bus_error_message(&error, r)); + sd_bus_error_free(&error); + } + + r = bus_call_method(bus, bus_systemd_mgr, "UnrefUnit", &error, /* ret_reply= */ NULL, "s", scope); + if (r < 0) + log_debug_errno(r, "Failed to drop reference to scope '%s', ignoring: %s", scope, bus_error_message(&error, r)); + + return 0; +} + static int message_add_commands(sd_bus_message *m, const char *exec_type, char ***commands, size_t n_commands) { int r; diff --git a/src/vmspawn/vmspawn-scope.h b/src/vmspawn/vmspawn-scope.h index c29325a314..e456498f2e 100644 --- a/src/vmspawn/vmspawn-scope.h +++ b/src/vmspawn/vmspawn-scope.h @@ -14,5 +14,8 @@ typedef struct SocketServicePair { void socket_service_pair_done(SocketServicePair *p); -int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidfd, char **ret_scope); +int allocate_scope(sd_bus *bus, const char *machine_name, const PidRef *pid, const char *slice, char **properties, bool allow_pidfd, char **ret_scope); + +int terminate_scope(sd_bus *bus, const char *machine_name); + int start_socket_service_pair(sd_bus *bus, const char *scope, SocketServicePair *p); diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c index 94453356b6..ce623cfd99 100644 --- a/src/vmspawn/vmspawn.c +++ b/src/vmspawn/vmspawn.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,8 @@ #include "hostname-setup.h" #include "hostname-util.h" #include "id128-util.h" +#include "io-util.h" +#include "iovec-util.h" #include "log.h" #include "machine-credential.h" #include "main-func.h" @@ -45,6 +48,7 @@ #include "namespace-util.h" #include "netif-util.h" #include "nsresource.h" +#include "nulstr-util.h" #include "osc-context.h" #include "pager.h" #include "parse-argument.h" @@ -100,6 +104,8 @@ static PagerFlags arg_pager_flags = 0; static char *arg_directory = NULL; static char *arg_image = NULL; static char *arg_machine = NULL; +static char *arg_slice = NULL; +static char **arg_property = NULL; static char *arg_cpus = NULL; static uint64_t arg_ram = UINT64_C(2) * U64_GB; static int arg_kvm = -1; @@ -118,7 +124,7 @@ static SettingsMask arg_settings_mask = 0; static char *arg_firmware = NULL; static char *arg_forward_journal = NULL; static bool arg_privileged = false; -static bool arg_register = false; +static bool arg_register = true; static bool arg_keep_unit = false; static sd_id128_t arg_uuid = {}; static char **arg_kernel_cmdline_extra = NULL; @@ -132,10 +138,12 @@ static char **arg_smbios11 = NULL; static uint64_t arg_grow_image = 0; static char *arg_tpm_state_path = NULL; static TpmStateMode arg_tpm_state_mode = TPM_STATE_AUTO; +static bool arg_ask_password = true; STATIC_DESTRUCTOR_REGISTER(arg_directory, freep); STATIC_DESTRUCTOR_REGISTER(arg_image, freep); STATIC_DESTRUCTOR_REGISTER(arg_machine, freep); +STATIC_DESTRUCTOR_REGISTER(arg_slice, freep); STATIC_DESTRUCTOR_REGISTER(arg_cpus, freep); STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done); STATIC_DESTRUCTOR_REGISTER(arg_firmware, freep); @@ -149,6 +157,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_background, freep); STATIC_DESTRUCTOR_REGISTER(arg_ssh_key_type, freep); STATIC_DESTRUCTOR_REGISTER(arg_smbios11, strv_freep); STATIC_DESTRUCTOR_REGISTER(arg_tpm_state_path, freep); +STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep); static int help(void) { _cleanup_free_ char *link = NULL; @@ -166,6 +175,7 @@ static int help(void) { " --version Print version string\n" " -q --quiet Do not show status information\n" " --no-pager Do not pipe output into a pager\n" + " --no-ask-password Do not prompt for password\n" "\n%3$sImage:%4$s\n" " -D --directory=PATH Root directory for the VM\n" " -i --image=FILE|DEVICE Root file system disk image or device for the VM\n" @@ -191,8 +201,11 @@ static int help(void) { " -M --machine=NAME Set the machine name for the VM\n" " --uuid=UUID Set a specific machine UUID for the VM\n" "\n%3$sProperties:%4$s\n" - " --register=BOOLEAN Register VM with systemd-machined\n" - " --keep-unit Don't let systemd-machined allocate scope unit for us\n" + " -S --slice=SLICE Place the VM in the specified slice\n" + " --property=NAME=VALUE Set scope unit property\n" + " --register=BOOLEAN Register VM as machine\n" + " --keep-unit Do not register a scope for the machine, reuse\n" + " the service unit vmspawn is running in\n" "\n%3$sUser Namespacing:%4$s\n" " --private-users=UIDBASE[:NUIDS]\n" " Configure the UID/GID range to map into the\n" @@ -274,6 +287,8 @@ static int parse_argv(int argc, char *argv[]) { ARG_CONSOLE, ARG_BACKGROUND, ARG_TPM_STATE, + ARG_NO_ASK_PASSWORD, + ARG_PROPERTY, }; static const struct option options[] = { @@ -284,6 +299,7 @@ static int parse_argv(int argc, char *argv[]) { { "image", required_argument, NULL, 'i' }, { "directory", required_argument, NULL, 'D' }, { "machine", required_argument, NULL, 'M' }, + { "slice", required_argument, NULL, 'S' }, { "cpus", required_argument, NULL, ARG_CPUS }, { "qemu-smp", required_argument, NULL, ARG_CPUS }, /* Compat alias */ { "ram", required_argument, NULL, ARG_RAM }, @@ -319,6 +335,8 @@ static int parse_argv(int argc, char *argv[]) { { "smbios11", required_argument, NULL, 's' }, { "grow-image", required_argument, NULL, 'G' }, { "tpm-state", required_argument, NULL, ARG_TPM_STATE }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "property", required_argument, NULL, ARG_PROPERTY }, {} }; @@ -328,7 +346,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argv); optind = 0; - while ((c = getopt_long(argc, argv, "+hD:i:M:nqs:G:", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "+hD:i:M:nqs:G:S:", options, NULL)) >= 0) switch (c) { case 'h': return help(); @@ -634,6 +652,27 @@ static int parse_argv(int argc, char *argv[]) { arg_tpm_state_path = mfree(arg_tpm_state_path); break; + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case 'S': { + _cleanup_free_ char *mangled = NULL; + + r = unit_name_mangle_with_suffix(optarg, /* operation= */ NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled); + if (r < 0) + return log_error_errno(r, "Failed to turn '%s' into unit name: %m", optarg); + + free_and_replace(arg_slice, mangled); + break; + } + + case ARG_PROPERTY: + if (strv_extend(&arg_property, optarg) < 0) + return log_oom(); + + break; + case '?': return -EINVAL; @@ -1535,30 +1574,261 @@ static int grow_image(const char *path, uint64_t size) { return 1; } +static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) { + assert(m); + + log_info("VM termination requested. Exiting."); + sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0); + + return 0; +} + +static int datagram_read_cmdline_and_exec(int _fd /* always taking possession, even on error */) { + _cleanup_close_ int fd = TAKE_FD(_fd); + int r; + + assert(fd >= 0); + + /* The first datagram contains the cmdline */ + r = fd_wait_for_event(fd, POLLIN, USEC_INFINITY); + if (r < 0) + return log_error_errno(r, "Failed to wait for command line: %m"); + + ssize_t n = next_datagram_size_fd(fd); + if (n < 0) + return log_error_errno(n, "Failed to determine datagram size: %m"); + n += 1; /* extra byte to validate that the size we determined here was correct */ + + _cleanup_free_ char *p = malloc(n); + if (!p) + return log_oom(); + + ssize_t m = recv(fd, p, n, /* flags= */ 0); + if (m < 0) + return log_error_errno(errno, "Failed to read datagram: %m"); + if (m >= n) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected message size."); + + _cleanup_strv_free_ char **a = strv_parse_nulstr(p, m); + if (!a) + return log_oom(); + if (strv_isempty(a)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Invalid command line."); + + /* The second datagram contains an integer array of the intended fd numbers, and the an SCM_RIGHTS fd + * list along with it, matching that. */ + r = fd_wait_for_event(fd, POLLIN, USEC_INFINITY); + if (r < 0) + return log_error_errno(r, "Failed to wait for command line: %m"); + + n = next_datagram_size_fd(fd); + if (n < 0) + return log_error_errno(n, "Failed to determine datagram size: %m"); + n += 1; /* extra byte to validate that the size we determined here was correct */ + + _cleanup_free_ int *f = malloc(n); + if (!p) + return log_oom(); + + struct iovec iov = { + .iov_base = f, + .iov_len = n, + }; + + int *fds = NULL; + size_t n_fds = 0; + CLEANUP_ARRAY(fds, n_fds, close_many_and_free); + + m = receive_many_fds_iov( + fd, + &iov, /* iovlen= */ 1, + &fds, + &n_fds, + /* flags= */ MSG_TRUNC); + if (m < 0) + return log_error_errno(m, "Failed to read datagram: %m"); + if (m >= n || (size_t) m != n_fds * sizeof(int)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected message size."); + + fd = safe_close(fd); + + /* At this point the fds[] contains the file descriptors we got, and f[] contains the numbers we want + * for them. Let's rearrange things. */ + + /* 1. Determine largest number we want */ + int max_fd = 2; + for (size_t k = 0; k < n_fds; k++) + max_fd = MAX(max_fd, f[k]); + + /* 2. Move all fds we got above that */ + for (size_t k = 0; k < n_fds; k++) { + if (fds[k] > max_fd) + continue; + + _cleanup_close_ int copy = fcntl(fds[k], F_DUPFD_CLOEXEC, max_fd+1); + if (copy < 0) + return log_error_errno(errno, "Failed to duplicate file descriptor: %m"); + + safe_close(fds[k]); + fds[k] = TAKE_FD(copy); + + assert(fds[k] > max_fd); + } + + log_close(); + + r = close_all_fds(fds, n_fds); + if (r < 0) + return log_error_errno(r, "Failed to close remaining file descriptors: %m"); + + /* 3. Move into place (this also disables O_CLOEXEC) */ + for (size_t k = 0; k < n_fds; k++) { + if (dup2(fds[k], f[k]) < 0) + return log_error_errno(errno, "Failed to move file descriptor: %m"); + + safe_close(fds[k]); + fds[k] = f[k]; + } + + execv(a[0], a); + return log_error_errno(errno, "Failed to execve %s: %m", a[0]); +} + +_noreturn_ static void child(int cmdline_fd) { + assert(cmdline_fd >= 0); + + /* set TERM and LANG if they are missing */ + if (setenv("TERM", "vt220", 0) < 0) { + log_oom(); + goto fail; + } + + if (setenv("LANG", "C.UTF-8", 0) < 0) { + log_oom(); + goto fail; + } + + /* Now wait for the command line from the parent, and then execute it */ + + (void) datagram_read_cmdline_and_exec(TAKE_FD(cmdline_fd)); + +fail: + _exit(EXIT_FAILURE); +} + static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL; - _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; - _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL, *kernel = NULL; + _cleanup_free_ char *qemu_binary = NULL, *mem = NULL, *kernel = NULL; _cleanup_(rm_rf_physical_and_freep) char *ssh_private_key_path = NULL, *ssh_public_key_path = NULL; _cleanup_close_ int notify_sock_fd = -EBADF; _cleanup_strv_free_ char **cmdline = NULL; _cleanup_free_ int *pass_fds = NULL; size_t n_pass_fds = 0; - const char *accel, *shm; + const char *accel; int r; polkit_agent_open(); - if (arg_privileged) - r = sd_bus_default_system(&bus); - else - r = sd_bus_default_user(&bus); - if (r < 0) - return log_error_errno(r, "Failed to connect to systemd bus: %m"); + /* Registration always happens on the system bus */ + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *system_bus = NULL; + if (arg_register || arg_privileged) { + r = sd_bus_default_system(&system_bus); + if (r < 0) + return log_error_errno(r, "Failed to open system bus: %m"); - r = start_transient_scope(bus, arg_machine, /* allow_pidfd= */ true, &trans_scope); + r = sd_bus_set_close_on_exit(system_bus, false); + if (r < 0) + return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m"); + + (void) sd_bus_set_allow_interactive_authorization(system_bus, arg_ask_password); + } + + /* Scope allocation happens on the user bus if we are unpriv, otherwise system bus. */ + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *user_bus = NULL; + _cleanup_(sd_bus_unrefp) sd_bus *runtime_bus = NULL; + if (arg_privileged) + runtime_bus = sd_bus_ref(system_bus); + else { + r = sd_bus_default_user(&user_bus); + if (r < 0) + return log_error_errno(r, "Failed to open system bus: %m"); + + r = sd_bus_set_close_on_exit(user_bus, false); + if (r < 0) + return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m"); + + runtime_bus = sd_bus_ref(user_bus); + } + + assert_se(sigprocmask_many(SIG_BLOCK, /* ret_old_mask=*/ NULL, SIGCHLD) >= 0); + + _cleanup_close_pair_ int cmdline_socket[2] = EBADF_PAIR; + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, cmdline_socket) < 0) + return log_error_errno(errno, "Failed to allocate command line socket pair: %m"); + + /* Fork off child early on, as we need to assign it to a scope unit, which we can generate + * dependencies towards for swtpm, virtiofsd and so on. It's just going to hang until we fully + * prepared a command line */ + _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL; + r = pidref_safe_fork_full( + "(qemu)", + /* stdio_fds= */ NULL, + cmdline_socket + 0, 1, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE, + &child_pidref); if (r < 0) return r; + if (r == 0) { + cmdline_socket[1] = -EBADF; /* closed due to FORK_CLOEXEC_ALL_FDS */ + + child(cmdline_socket[0]); + assert_not_reached(); + } + + cmdline_socket[0] = safe_close(cmdline_socket[0]); + + if (!arg_keep_unit) { + /* When a new scope is created for this container, then we'll be registered as its controller, in which + * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the + * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */ + + r = sd_bus_match_signal_async( + runtime_bus, + /* ret= */ NULL, + "org.freedesktop.systemd1", + /* path= */ NULL, + "org.freedesktop.systemd1.Scope", + "RequestStop", + on_request_stop, + /* install_callback= */ NULL, + /* userdata= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to request RequestStop match: %m"); + } + + _cleanup_free_ char *unit = NULL; + bool scope_allocated = false; + if (!arg_keep_unit && (!arg_register || !arg_privileged)) { + r = allocate_scope( + runtime_bus, + arg_machine, + &child_pidref, + arg_slice, + arg_property, + /* allow_pidfd= */ true, + &unit); + if (r < 0) + return r; + + scope_allocated = true; + } else { + if (arg_privileged) + r = cg_pid_get_unit(0, &unit); + else + r = cg_pid_get_user_unit(0, &unit); + if (r < 0) + return log_error_errno(r, "Failed to get our own unit: %m"); + } bool use_kvm = arg_kvm > 0; if (arg_kvm < 0) { @@ -1580,7 +1850,8 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { log_warning("Couldn't find OVMF firmware blob with Secure Boot support, " "falling back to OVMF firmware blobs without Secure Boot support."); - shm = arg_directory || arg_runtime_mounts.n_mounts != 0 ? ",memory-backend=mem" : ""; + _cleanup_free_ char *machine = NULL; + const char *shm = arg_directory || arg_runtime_mounts.n_mounts != 0 ? ",memory-backend=mem" : ""; const char *hpet = ARCHITECTURE_SUPPORTS_HPET ? ",hpet=off" : ""; if (ARCHITECTURE_SUPPORTS_SMM) machine = strjoin("type=" QEMU_MACHINE_TYPE ",smm=", on_off(ovmf_config->supports_sb), shm, hpet); @@ -1993,7 +2264,13 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { if (arg_directory) { _cleanup_free_ char *listen_address = NULL; - r = start_virtiofsd(bus, trans_scope, arg_directory, /* uidmap= */ true, runtime_dir, &listen_address); + r = start_virtiofsd( + runtime_bus, + unit, + arg_directory, + /* uidmap= */ true, + runtime_dir, + &listen_address); if (r < 0) return r; @@ -2060,7 +2337,13 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { FOREACH_ARRAY(mount, arg_runtime_mounts.mounts, arg_runtime_mounts.n_mounts) { _cleanup_free_ char *listen_address = NULL; - r = start_virtiofsd(bus, trans_scope, mount->source, /* uidmap= */ false, runtime_dir, &listen_address); + r = start_virtiofsd( + runtime_bus, + unit, + mount->source, + /* uidmap= */ false, + runtime_dir, + &listen_address); if (r < 0) return r; @@ -2151,7 +2434,11 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { _cleanup_free_ char *tpm_socket_address = NULL; if (swtpm) { - r = start_tpm(bus, trans_scope, swtpm, runtime_dir, &tpm_socket_address); + r = start_tpm(runtime_bus, + unit, + swtpm, + runtime_dir, + &tpm_socket_address); if (r < 0) { /* only bail if the user asked for a tpm */ if (arg_tpm > 0) @@ -2217,7 +2504,12 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { if (r < 0) return log_error_errno(r, "Failed to find systemd-journal-remote binary: %m"); - r = start_systemd_journal_remote(bus, trans_scope, child_cid, sd_journal_remote, &listen_address); + r = start_systemd_journal_remote( + runtime_bus, + unit, + child_cid, + sd_journal_remote, + &listen_address); if (r < 0) return r; @@ -2234,7 +2526,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { _cleanup_free_ char *scope_prefix = NULL, *privkey_path = NULL, *pubkey_path = NULL; const char *key_type = arg_ssh_key_type ?: "ed25519"; - r = unit_name_to_prefix(trans_scope, &scope_prefix); + r = unit_name_to_prefix(unit, &scope_prefix); if (r < 0) return log_error_errno(r, "Failed to strip .scope suffix from scope: %m"); @@ -2265,7 +2557,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { if (r < 0) return log_error_errno(r, "Failed to load credential %s: %m", cred_path); - r = unit_name_to_prefix(trans_scope, &scope_prefix); + r = unit_name_to_prefix(unit, &scope_prefix); if (r < 0) return log_error_errno(r, "Failed to strip .scope suffix from scope: %m"); @@ -2329,45 +2621,12 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { log_debug("Executing: %s", joined); } - assert_se(sigprocmask_many(SIG_BLOCK, /* ret_old_mask=*/ NULL, SIGCHLD) >= 0); - - _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL; - - r = pidref_safe_fork_full( - qemu_binary, - /* stdio_fds= */ NULL, - pass_fds, n_pass_fds, - FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE, - &child_pidref); - if (r < 0) - return r; - if (r == 0) { - /* set TERM and LANG if they are missing */ - if (setenv("TERM", "vt220", 0) < 0) { - log_oom(); - goto fail; - } - - if (setenv("LANG", "C.UTF-8", 0) < 0) { - log_oom(); - goto fail; - } - - execv(qemu_binary, cmdline); - log_error_errno(errno, "Failed to execve %s: %m", qemu_binary); - fail: - _exit(EXIT_FAILURE); - } - - /* Close relevant fds we passed to qemu in the parent. We don't need them anymore. */ - child_vsock_fd = safe_close(child_vsock_fd); - tap_fd = safe_close(tap_fd); - + bool registered = false; if (arg_register) { char vm_address[STRLEN("vsock/") + DECIMAL_STR_MAX(unsigned)]; xsprintf(vm_address, "vsock/%u", child_cid); r = register_machine( - bus, + system_bus, arg_machine, arg_uuid, "systemd-vmspawn", @@ -2376,11 +2635,40 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { child_cid, child_cid != VMADDR_CID_ANY ? vm_address : NULL, ssh_private_key_path, - arg_keep_unit); + arg_keep_unit || !arg_privileged); if (r < 0) return r; + + registered = true; } + _cleanup_free_ char *nulstr = NULL; + size_t nulstr_size = 0; + if (strv_make_nulstr(cmdline, &nulstr, &nulstr_size) < 0) + return log_oom(); + + /* First datagram: the command line to execute */ + ssize_t n = send(cmdline_socket[1], nulstr, nulstr_size, /* flags= */ 0); + if (n < 0) + return log_error_errno(errno, "Failed to send command line: %m"); + + /* Second datagram: the file descriptor array and the fds inside it */ + n = send_many_fds_iov( + cmdline_socket[1], + pass_fds, n_pass_fds, /* both as payload … */ + &IOVEC_MAKE(pass_fds, n_pass_fds * sizeof(int)), /* … and as auxiliary fds */ + /* iovlen= */ 1, + /* flags= */ 0); + if (n < 0) + return log_error_errno(n, "Failed to send file descriptors to child: %m"); + + /* We submitted the command line now, qemu is running now */ + cmdline_socket[1] = safe_close(cmdline_socket[1]); + + /* Close relevant fds we passed to qemu in the parent. We don't need them anymore. */ + child_vsock_fd = safe_close(child_vsock_fd); + tap_fd = safe_close(tap_fd); + /* All operations that might need Polkit authorizations (i.e. machine registration, netif * acquisition, …) are complete now, get rid of the agent again, so that we retain exclusive control * of the TTY from now on. */ @@ -2394,6 +2682,18 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { (void) sd_event_set_watchdog(event, true); + if (system_bus) { + r = sd_bus_attach_event(system_bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach system bus to event loop: %m"); + } + + if (user_bus) { + r = sd_bus_attach_event(user_bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach user bus to event loop: %m"); + } + int exit_status = INT_MAX; if (use_vsock) { r = setup_notify_parent(event, notify_sock_fd, &exit_status, ¬ify_event_source); @@ -2459,8 +2759,12 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { if (r < 0) return log_error_errno(r, "Failed to run event loop: %m"); - if (arg_register) - (void) unregister_machine(bus, arg_machine); + /* Kill if it is not dead yet anyway */ + if (scope_allocated) + terminate_scope(runtime_bus, arg_machine); + + if (registered) + (void) unregister_machine(system_bus, arg_machine); if (use_vsock) { if (exit_status == INT_MAX) { @@ -2538,11 +2842,6 @@ static int verify_arguments(void) { if (!strv_isempty(arg_initrds) && !arg_linux) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --initrd= cannot be used without --linux=."); - if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) - /* Save the user from accidentally registering either user-$SESSION.scope or user@.service. - * The latter is not technically a user session, but we don't need to labour the point. */ - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session."); - return 0; } @@ -2554,9 +2853,6 @@ static int run(int argc, char *argv[]) { arg_privileged = getuid() == 0; - /* don't attempt to register as a machine when running as a user */ - arg_register = arg_privileged; - r = parse_environment(); if (r < 0) return r; From f820b275653c0315e31ca9c2100f2280b5582e17 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Jun 2025 10:49:25 +0200 Subject: [PATCH 17/22] vmspawn: introduce --notify-ready= switch This mimics the switch of the same name from nspawn: it controls whether we expect a READY=1 message from the payload or not. Previously we'd always expect that. This makes it configurable, just like it is in nspawn. There's one fundamental difference in behaviour though: in nspawn it defaults to off, in vmspawn it defaults to on. (for historical reasons, ideally we'd default to on in both cases, but changing is quite a compat break both directly and indirectly: since timeouts might get triggered). --- man/systemd-nspawn.xml | 16 +++++++++------- man/systemd-vmspawn.xml | 19 +++++++++++++++++++ src/vmspawn/vmspawn.c | 37 +++++++++++++++++++++++++++++++------ 3 files changed, 59 insertions(+), 13 deletions(-) diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index d9d0de918b..d7d7d17f66 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -651,13 +651,15 @@ Configures support for notifications from the container's init process. - takes a boolean ( and ). - With option systemd-nspawn notifies systemd - with a READY=1 message when the init process is created. - With option systemd-nspawn waits for the - READY=1 message from the init process in the container - before sending its own to systemd. For more details about notifications - see sd_notify3. + takes a boolean. If false systemd-vmpawn + notifies the calling service manager with a READY=1 message when the init process is + created. If true it waits for a READY=1 message from the init process in the VM + before sending its own to the service manager. For more details about notifications see + sd_notify3. + + Defaults to false. (Note that this is unlike the option of the same name to + systemd-vmspawn1 + that defaults to true.) diff --git a/man/systemd-vmspawn.xml b/man/systemd-vmspawn.xml index 304bbb44a3..03a2da649b 100644 --- a/man/systemd-vmspawn.xml +++ b/man/systemd-vmspawn.xml @@ -303,6 +303,25 @@ + + + + + Configures support for notifications from the VM's init process to + systemd-vmspawn. If true, systemd-vmspawn will consider the + machine as ready only when it has received a READY=1 message from the init + process in the VM. If false, systemd-vmspawn will consider the machine as ready + immediately after creation. In either case, systemd-vmspawn sends its own + readiness notification to its manager after the spawned VM is ready. For more details about + notifications see + sd_notify3. + + Defaults to true. (Note that this is unlike the option of the same name to + systemd-vmspawn1 + that defaults to false.) + + + diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c index ce623cfd99..6889734e9c 100644 --- a/src/vmspawn/vmspawn.c +++ b/src/vmspawn/vmspawn.c @@ -139,6 +139,7 @@ static uint64_t arg_grow_image = 0; static char *arg_tpm_state_path = NULL; static TpmStateMode arg_tpm_state_mode = TPM_STATE_AUTO; static bool arg_ask_password = true; +static bool arg_notify_ready = true; STATIC_DESTRUCTOR_REGISTER(arg_directory, freep); STATIC_DESTRUCTOR_REGISTER(arg_image, freep); @@ -196,7 +197,9 @@ static int help(void) { " --firmware=PATH|list Select firmware definition file (or list available)\n" " --discard-disk=BOOL Control processing of discard requests\n" " -G --grow-image=BYTES Grow image file to specified size in bytes\n" + "\n%3$sExecution:%4$s\n" " -s --smbios11=STRING Pass an arbitrary SMBIOS Type #11 string to the VM\n" + " --notify-ready=BOOL Wait for ready notification from the VM\n" "\n%3$sSystem Identity:%4$s\n" " -M --machine=NAME Set the machine name for the VM\n" " --uuid=UUID Set a specific machine UUID for the VM\n" @@ -289,6 +292,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_TPM_STATE, ARG_NO_ASK_PASSWORD, ARG_PROPERTY, + ARG_NOTIFY_READY, }; static const struct option options[] = { @@ -337,6 +341,7 @@ static int parse_argv(int argc, char *argv[]) { { "tpm-state", required_argument, NULL, ARG_TPM_STATE }, { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, { "property", required_argument, NULL, ARG_PROPERTY }, + { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, {} }; @@ -673,6 +678,13 @@ static int parse_argv(int argc, char *argv[]) { break; + case ARG_NOTIFY_READY: + r = parse_boolean_argument("--notify-ready=", optarg, &arg_notify_ready); + if (r < 0) + return r; + + break; + case '?': return -EINVAL; @@ -755,17 +767,20 @@ static int read_vsock_notify(NotifyConnectionData *d, int fd) { log_debug("Received notification message with tags: %s", strnull(j)); } + const char *status = strv_find_startswith(tags, "STATUS="); + if (status) + (void) sd_notifyf(/* unset_environment= */ false, "STATUS=VM running: %s", status); + if (strv_contains(tags, "READY=1")) { - r = sd_notify(false, "READY=1"); + r = sd_notify(/* unset_environment= */ false, "READY=1"); if (r < 0) log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + + if (!status) + (void) sd_notifyf(/* unset_environment= */ false, "STATUS=VM running."); } - const char *p = strv_find_startswith(tags, "STATUS="); - if (p) - (void) sd_notifyf(false, "STATUS=VM running: %s", p); - - p = strv_find_startswith(tags, "EXIT_STATUS="); + const char *p = strv_find_startswith(tags, "EXIT_STATUS="); if (p) { uint8_t k = 0; r = safe_atou8(p, &k); @@ -2669,6 +2684,16 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { child_vsock_fd = safe_close(child_vsock_fd); tap_fd = safe_close(tap_fd); + /* Report that the VM is now set up */ + (void) sd_notifyf(/* unset_environment= */ false, + "STATUS=VM started.\n" + "X_VMSPAWN_LEADER_PID=" PID_FMT, child_pidref.pid); + if (!arg_notify_ready) { + r = sd_notify(/* unset_environment= */ false, "READY=1\n"); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + } + /* All operations that might need Polkit authorizations (i.e. machine registration, netif * acquisition, …) are complete now, get rid of the agent again, so that we retain exclusive control * of the TTY from now on. */ From 12d1f44681378c90def1395b3d4ad947e4ebfa83 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 11 Jul 2025 09:05:29 +0200 Subject: [PATCH 18/22] vmspawn: do not set vt220 We do not let qemu do terminal stuff, hence no point in setting any TERM. --- src/vmspawn/vmspawn.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c index 6889734e9c..c5fe0d54d7 100644 --- a/src/vmspawn/vmspawn.c +++ b/src/vmspawn/vmspawn.c @@ -1712,13 +1712,8 @@ static int datagram_read_cmdline_and_exec(int _fd /* always taking possession, e _noreturn_ static void child(int cmdline_fd) { assert(cmdline_fd >= 0); - /* set TERM and LANG if they are missing */ - if (setenv("TERM", "vt220", 0) < 0) { - log_oom(); - goto fail; - } - - if (setenv("LANG", "C.UTF-8", 0) < 0) { + /* set LANG if they are missing */ + if (setenv("LANG", "C.UTF-8", /* override= */ 0) < 0) { log_oom(); goto fail; } From 48cb009afc07fbc6634612ff66088a09acd554fc Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 May 2025 15:41:58 +0200 Subject: [PATCH 19/22] units: add units for vmspawn/nspawn in --user mode too --- units/machines.target | 2 +- units/user/machine.slice | 12 +++++++ units/user/machines.target | 16 +++++++++ units/user/meson.build | 48 ++++++++++++++++++++++---- units/user/systemd-nspawn@.service.in | 31 +++++++++++++++++ units/user/systemd-vmspawn@.service.in | 24 +++++++++++++ 6 files changed, 126 insertions(+), 7 deletions(-) create mode 100644 units/user/machine.slice create mode 100644 units/user/machines.target create mode 100644 units/user/systemd-nspawn@.service.in create mode 100644 units/user/systemd-vmspawn@.service.in diff --git a/units/machines.target b/units/machines.target index 165839aeb1..b7a84f4814 100644 --- a/units/machines.target +++ b/units/machines.target @@ -8,7 +8,7 @@ # (at your option) any later version. [Unit] -Description=Containers +Description=Virtual Machine and Containers Documentation=man:systemd.special(7) Before=multi-user.target diff --git a/units/user/machine.slice b/units/user/machine.slice new file mode 100644 index 0000000000..fe729d0eb9 --- /dev/null +++ b/units/user/machine.slice @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Unit] +Description=Virtual Machine and Container Slice +Documentation=man:systemd.special(7) diff --git a/units/user/machines.target b/units/user/machines.target new file mode 100644 index 0000000000..35a99666bb --- /dev/null +++ b/units/user/machines.target @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Unit] +Description=Virtual Machines and Containers +Documentation=man:systemd.special(7) +Before=default.target + +[Install] +WantedBy=default.target diff --git a/units/user/meson.build b/units/user/meson.build index c669e4bb1c..d8d3d9fdac 100644 --- a/units/user/meson.build +++ b/units/user/meson.build @@ -10,6 +10,14 @@ units = [ { 'file' : 'exit.target' }, { 'file' : 'graphical-session-pre.target' }, { 'file' : 'graphical-session.target' }, + { + 'file' : 'machine.slice', + 'conditions' : ['ENABLE_MACHINED'], + }, + { + 'file' : 'machines.target', + 'conditions' : ['ENABLE_MACHINED'], + }, { 'file' : 'paths.target' }, { 'file' : 'printer.target' }, { 'file' : 'session.slice' }, @@ -26,6 +34,14 @@ units = [ { 'file' : 'systemd-tmpfiles-clean.service' }, { 'file' : 'systemd-tmpfiles-clean.timer' }, { 'file' : 'systemd-tmpfiles-setup.service' }, + { + 'file' : 'systemd-nspawn@.service.in', + 'conditions' : ['ENABLE_NSPAWN'], + }, + { + 'file' : 'systemd-vmspawn@.service.in', + 'conditions' : ['ENABLE_VMSPAWN'], + }, { 'file' : 'timers.target' }, { 'file' : 'xdg-desktop-autostart.target', @@ -34,7 +50,17 @@ units = [ ] foreach unit : units - file = unit.get('file') + source = unit.get('file') + + if source.endswith('.in') + needs_jinja = true + name = source.substring(0, -3) + assert(name + '.in' == source) + else + needs_jinja = false + name = source + endif + source = files(source) install = true foreach cond : unit.get('conditions', []) @@ -44,20 +70,30 @@ foreach unit : units endif endforeach - if install - install_data(file, + if needs_jinja + t = custom_target( + name, + input : source, + output : name, + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : install, + install_dir : userunitdir) + elif install + install_data(source, install_dir : userunitdir) + endif + if install foreach target : unit.get('symlinks', []) if target.endswith('/') # '/' is only allowed at the end of the target assert(target.replace('/', '') + '/' == target) - install_symlink(file, - pointing_to : '..' / file, + install_symlink(name, + pointing_to : '..' / name, install_dir : userunitdir / target) else install_symlink(target, - pointing_to : file, + pointing_to : name, install_dir : userunitdir) endif endforeach diff --git a/units/user/systemd-nspawn@.service.in b/units/user/systemd-nspawn@.service.in new file mode 100644 index 0000000000..90fdc0cb48 --- /dev/null +++ b/units/user/systemd-nspawn@.service.in @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Unit] +Description=Container %i +Documentation=man:systemd-nspawn(1) +PartOf=machines.target +Before=machines.target + +[Service] +ExecStart=systemd-nspawn --quiet --keep-unit --register=yes --boot --network-veth -U --settings=override --machine=%i +ExecStopPost=systemd-nspawn --cleanup --machine=%i +KillMode=mixed +Type=notify +RestartForceExitStatus=133 +SuccessExitStatus=133 +Slice=machine.slice +Delegate=yes +DelegateSubgroup=supervisor +CoredumpReceive=yes +TasksMax=16384 +{{SERVICE_WATCHDOG}} + +[Install] +WantedBy=machines.target diff --git a/units/user/systemd-vmspawn@.service.in b/units/user/systemd-vmspawn@.service.in new file mode 100644 index 0000000000..aeea839f8c --- /dev/null +++ b/units/user/systemd-vmspawn@.service.in @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Unit] +Description=Virtual Machine %i +Documentation=man:systemd-vmspawn(1) +PartOf=machines.target +Before=machines.target + +[Service] +ExecStart=systemd-vmspawn --quiet --keep-unit --register=yes --network-tap --machine=%i +KillMode=mixed +Type=notify +Slice=machine.slice +{{SERVICE_WATCHDOG}} + +[Install] +WantedBy=machines.target From 3405b84d8c4c145e2d034bfe954da728daf2d589 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 13 Jun 2025 18:21:11 +0200 Subject: [PATCH 20/22] units: systems might take a while to boot vmspawn systems might take quite a while to boot in particular if they go through uefi and wait for a network lease. Hence let's increase the start timeout to 2min (from 45s). We'll do that for both nspawn and vmspawn, even though the UEFI thing certainly doesn't apply there (but the DHCP thing still does). --- units/systemd-nspawn@.service.in | 1 + units/systemd-vmspawn@.service.in | 1 + units/user/systemd-nspawn@.service.in | 1 + units/user/systemd-vmspawn@.service.in | 1 + 4 files changed, 4 insertions(+) diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in index c1426e673b..d540a071a4 100644 --- a/units/systemd-nspawn@.service.in +++ b/units/systemd-nspawn@.service.in @@ -29,6 +29,7 @@ Delegate=yes DelegateSubgroup=supervisor CoredumpReceive=yes TasksMax=16384 +TimeoutSec=2min {{SERVICE_WATCHDOG}} {# Enforce a strict device policy, similar to the one nspawn configures (in diff --git a/units/systemd-vmspawn@.service.in b/units/systemd-vmspawn@.service.in index 454a03e98b..6ae0088048 100644 --- a/units/systemd-vmspawn@.service.in +++ b/units/systemd-vmspawn@.service.in @@ -21,6 +21,7 @@ ExecStart=systemd-vmspawn --quiet --register=yes --keep-unit --network-tap --mac KillMode=mixed Type=notify Slice=machine.slice +TimeoutSec=2min {{SERVICE_WATCHDOG}} {# Enforce a strict device policy. Make sure to keep these policies in sync if you change them! #} diff --git a/units/user/systemd-nspawn@.service.in b/units/user/systemd-nspawn@.service.in index 90fdc0cb48..4f5cf412df 100644 --- a/units/user/systemd-nspawn@.service.in +++ b/units/user/systemd-nspawn@.service.in @@ -25,6 +25,7 @@ Delegate=yes DelegateSubgroup=supervisor CoredumpReceive=yes TasksMax=16384 +TimeoutSec=2min {{SERVICE_WATCHDOG}} [Install] diff --git a/units/user/systemd-vmspawn@.service.in b/units/user/systemd-vmspawn@.service.in index aeea839f8c..bfb321f2bf 100644 --- a/units/user/systemd-vmspawn@.service.in +++ b/units/user/systemd-vmspawn@.service.in @@ -18,6 +18,7 @@ ExecStart=systemd-vmspawn --quiet --keep-unit --register=yes --network-tap --mac KillMode=mixed Type=notify Slice=machine.slice +TimeoutSec=2min {{SERVICE_WATCHDOG}} [Install] From bfd356da63d9fe0720f1b5a61c527c8822c3b808 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 30 Jun 2025 23:13:26 +0200 Subject: [PATCH 21/22] test: add testcase for unpriv machined nspawns reg + killing Let's add a superficial test for the code we just added: spawn a container unpriv, make sure registration fully worked, then kill it via machinectl, to ensure it all works properly. Not too thorough but a good start. --- test/units/TEST-13-NSPAWN.unpriv.sh | 64 +++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 test/units/TEST-13-NSPAWN.unpriv.sh diff --git a/test/units/TEST-13-NSPAWN.unpriv.sh b/test/units/TEST-13-NSPAWN.unpriv.sh new file mode 100755 index 0000000000..db58b09291 --- /dev/null +++ b/test/units/TEST-13-NSPAWN.unpriv.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +# shellcheck disable=SC2016 +set -eux +set -o pipefail + +# shellcheck source=test/units/util.sh +. "$(dirname "$0")"/util.sh + +if [[ ! -f /usr/lib/systemd/system/systemd-mountfsd.socket ]] || + [[ ! -f /usr/lib/systemd/system/systemd-nsresourced.socket ]] || + ! grep -q bpf /sys/kernel/security/lsm || + ! find /usr/lib* -name libbpf.so.1 2>/dev/null | grep . || + systemd-analyze compare-versions "$(uname -r)" lt 6.5 || + systemd-analyze compare-versions "$(pkcheck --version | awk '{print $3}')" lt 124; then + echo "Skipping unpriv nspawn test" + exit 0 +fi + +at_exit() { + rm -rf /home/testuser/.local/state/machines/zurps ||: + machinectl terminate zurps ||: + rm -f /usr/share/polkit-1/rules.d/registermachinetest.rules +} + +trap at_exit EXIT + +systemctl start systemd-mountfsd.socket systemd-nsresourced.socket + +run0 -u testuser mkdir -p .local/state/machines + +create_dummy_container /home/testuser/.local/state/machines/zurps +cat >/home/testuser/.local/state/machines/zurps/sbin/init </usr/share/polkit-1/rules.d/registermachinetest.rules <<'EOF' +polkit.addRule(function(action, subject) { + if (action.id == "org.freedesktop.machine1.register-machine" && + subject.user == "testuser") { + return polkit.Result.YES; + } +}); +EOF + +loginctl enable-linger testuser + +run0 -u testuser systemctl start --user systemd-nspawn@zurps.service + +machinectl status zurps +machinectl status zurps | grep "UID:" | grep "$(id -u testuser)" +machinectl status zurps | grep "Unit: user@" | grep "$(id -u testuser)" +machinectl status zurps | grep "Subgroup: machine.slice/systemd-nspawn@zurps.service/payload" +machinectl terminate zurps + +(! run0 -u testuser systemctl is-active --user systemd-nspawn@zurps.service) + +loginctl disable-linger testuser From 6d44b761ea20be31381df9037e06687da7497fbc Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Jun 2025 12:11:31 +0200 Subject: [PATCH 22/22] update TODO --- TODO | 9 --------- 1 file changed, 9 deletions(-) diff --git a/TODO b/TODO index 1629d43f0d..eda987b3ec 100644 --- a/TODO +++ b/TODO @@ -477,12 +477,6 @@ Features: them. display them in regular output mode (via strip_tab_ansi()), but suppress them in json mode. -* machined: when registering a machine, also take a relative cgroup path, - relative to the machine's unit. This is useful when registering unpriv - machines, as they might sit down the cgroup tree, below a cgroup delegation - boundary. Then, install an inotify watch on that cgroup to track when the - machine's local cgroup goes down. - * resolved: report ttl in resolution replies if we know it. This data is useful for tools such as wireguard which want to periodically re-resolve DNS names, and might want to use the TTL has hint for that. @@ -813,8 +807,6 @@ Features: PCRs. * vmspawn: - - run in scope unit when invoked from command line, and machined registration is off - - sd_notify support - --ephemeral support - --read-only support - automatically suspend/resume the VM if the host suspends. Use logind @@ -2762,7 +2754,6 @@ Features: investigate whether creating the inner child with CLONE_PARENT isn't better. - Reduce the number of sockets that are currently in use and just rely on one or two sockets. - - Support running nspawn as an unprivileged user. * machined: - add an API so that libvirt-lxc can inform us about network interfaces being