diff --git a/TODO b/TODO
index 1629d43f0d..eda987b3ec 100644
--- a/TODO
+++ b/TODO
@@ -477,12 +477,6 @@ Features:
them. display them in regular output mode (via strip_tab_ansi()), but
suppress them in json mode.
-* machined: when registering a machine, also take a relative cgroup path,
- relative to the machine's unit. This is useful when registering unpriv
- machines, as they might sit down the cgroup tree, below a cgroup delegation
- boundary. Then, install an inotify watch on that cgroup to track when the
- machine's local cgroup goes down.
-
* resolved: report ttl in resolution replies if we know it. This data is useful
for tools such as wireguard which want to periodically re-resolve DNS names,
and might want to use the TTL has hint for that.
@@ -813,8 +807,6 @@ Features:
PCRs.
* vmspawn:
- - run in scope unit when invoked from command line, and machined registration is off
- - sd_notify support
- --ephemeral support
- --read-only support
- automatically suspend/resume the VM if the host suspends. Use logind
@@ -2762,7 +2754,6 @@ Features:
investigate whether creating the inner child with CLONE_PARENT isn't better.
- Reduce the number of sockets that are currently in use and just rely on one
or two sockets.
- - Support running nspawn as an unprivileged user.
* machined:
- add an API so that libvirt-lxc can inform us about network interfaces being
diff --git a/man/machinectl.xml b/man/machinectl.xml
index 13fa0eae71..e64a20bb1d 100644
--- a/man/machinectl.xml
+++ b/man/machinectl.xml
@@ -595,12 +595,10 @@
- When used with kill, choose
- which processes to kill. Must be one of
- , or to select
- whether to kill only the leader process of the machine or all
- processes of the machine. If omitted, defaults to
- .
+ When used with kill, choose which processes to kill. Must be one
+ of , , or to select whether to
+ kill only the leader process of the machine, the supervisor process of the machine, or all processes
+ of the machine. If omitted, defaults to .
diff --git a/man/org.freedesktop.machine1.xml b/man/org.freedesktop.machine1.xml
index 35b2d64cc8..ccd81be659 100644
--- a/man/org.freedesktop.machine1.xml
+++ b/man/org.freedesktop.machine1.xml
@@ -506,10 +506,16 @@ node /org/freedesktop/machine1/machine/rawhide {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s Unit = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s Subgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly u Leader = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t LeaderPIDFDId = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly u Supervisor = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly t SupervisorPIDFDId = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s Class = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootDirectory = '...';
@@ -523,6 +529,8 @@ node /org/freedesktop/machine1/machine/rawhide {
readonly s SSHPrivateKeyPath = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s State = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly u UID = ...;
};
interface org.freedesktop.DBus.Peer { ... };
interface org.freedesktop.DBus.Introspectable { ... };
@@ -598,10 +606,16 @@ node /org/freedesktop/machine1/machine/rawhide {
+
+
+
+
+
+
@@ -616,6 +630,8 @@ node /org/freedesktop/machine1/machine/rawhide {
+
+
@@ -647,11 +663,20 @@ node /org/freedesktop/machine1/machine/rawhide {
Unit is the systemd scope or service unit name for the machine.
- Leader is the PID of the leader process of the machine.
+ Leader is the PID of the leader process of the machine. (The leader process is the
+ top-level process of the payload of the machine, i.e. for containers PID 1 inside the container, and
+ for VMs the process encapsulating the VM. Its lifetime defines the lifetime of the machine.)LeaderPIDFDId encodes the Linux pidfd inode ID of the leader process of the
machine.
+ Supervisor is the PID of the supervisor process of the machine. (The
+ supervisor process is the process that manages the machine, if there's a separate one for it. A
+ supervisor process is not always defined, and does not define the machine's lifetime.)
+
+ SupervisorPIDFDId encodes the Linux pidfd inode ID of the supervisor process of the
+ machine.
+
Class is the class of the machine and is either the string "vm" (for real VMs
based on virtualized hardware) or "container" (for lightweight userspace virtualization sharing the
same kernel as the host).
@@ -676,6 +701,11 @@ node /org/freedesktop/machine1/machine/rawhide {
running, or closing. Note that the state machine is not considered
part of the API and states might be removed or added without this being considered API breakage.
+
+ Subgroup contains the sub-control-group path this machine's processes reside
+ in, relative to the specified unit's control group.
+
+ UID contains the numeric UNIX UID of the user who registered the machine.
@@ -717,9 +747,11 @@ $ gdbus introspect --system \
Machine ObjectsCopyFromWithFlags() and
CopyToWithFlags() were added in version 252.
- GetSSHInfo(), VSockCID, SSHAddress
+ GetSSHInfo(), VSockCID, SSHAddress,
and SSHPrivateKeyPath were added in version 256.
- LeaderPIDFDId was added in version 258.
+ LeaderPIDFDId, Supervisor,
+ SupervisorPIDFDId, Subgroup, and UID were added
+ in version 258.
diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml
index d9d0de918b..d7d7d17f66 100644
--- a/man/systemd-nspawn.xml
+++ b/man/systemd-nspawn.xml
@@ -651,13 +651,15 @@
Configures support for notifications from the container's init process.
- takes a boolean ( and ).
- With option systemd-nspawn notifies systemd
- with a READY=1 message when the init process is created.
- With option systemd-nspawn waits for the
- READY=1 message from the init process in the container
- before sending its own to systemd. For more details about notifications
- see sd_notify3.
+ takes a boolean. If false systemd-vmpawn
+ notifies the calling service manager with a READY=1 message when the init process is
+ created. If true it waits for a READY=1 message from the init process in the VM
+ before sending its own to the service manager. For more details about notifications see
+ sd_notify3.
+
+ Defaults to false. (Note that this is unlike the option of the same name to
+ systemd-vmspawn1
+ that defaults to true.)
diff --git a/man/systemd-vmspawn.xml b/man/systemd-vmspawn.xml
index cbf7d20e64..03a2da649b 100644
--- a/man/systemd-vmspawn.xml
+++ b/man/systemd-vmspawn.xml
@@ -303,6 +303,25 @@
+
+
+
+
+ Configures support for notifications from the VM's init process to
+ systemd-vmspawn. If true, systemd-vmspawn will consider the
+ machine as ready only when it has received a READY=1 message from the init
+ process in the VM. If false, systemd-vmspawn will consider the machine as ready
+ immediately after creation. In either case, systemd-vmspawn sends its own
+ readiness notification to its manager after the spawned VM is ready. For more details about
+ notifications see
+ sd_notify3.
+
+ Defaults to true. (Note that this is unlike the option of the same name to
+ systemd-vmspawn1
+ that defaults to false.)
+
+
+
@@ -343,6 +362,30 @@
Property Options
+
+
+
+
+ Make the VM part of the specified slice, instead of the default
+ machine.slice. This applies only if the machine is run in its own scope unit,
+ i.e. if is not used.
+
+
+
+
+
+
+
+
+ Set a unit property on the scope unit to register for the machine. This applies
+ only if the machine is run in its own scope unit, i.e. if is not
+ used. Takes unit property assignments in the same format as systemctl
+ set-property. This is useful to set memory limits and similar for the VM.
+
+
+
+
+
@@ -532,6 +575,7 @@
+
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index f3270ff278..0184cbffbe 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -973,15 +973,14 @@ static const char* skip_slices(const char *p) {
}
}
-int cg_path_get_unit(const char *path, char **ret) {
- _cleanup_free_ char *unit = NULL;
- const char *e;
+int cg_path_get_unit_full(const char *path, char **ret_unit, char **ret_subgroup) {
int r;
assert(path);
- e = skip_slices(path);
+ const char *e = skip_slices(path);
+ _cleanup_free_ char *unit = NULL;
r = cg_path_decode_unit(e, &unit);
if (r < 0)
return r;
@@ -990,8 +989,27 @@ int cg_path_get_unit(const char *path, char **ret) {
if (endswith(unit, ".slice"))
return -ENXIO;
- if (ret)
- *ret = TAKE_PTR(unit);
+ if (ret_subgroup) {
+ _cleanup_free_ char *subgroup = NULL;
+ e += strcspn(e, "/");
+ e += strspn(e, "/");
+
+ if (isempty(e))
+ subgroup = NULL;
+ else {
+ subgroup = strdup(e);
+ if (!subgroup)
+ return -ENOMEM;
+ }
+
+ path_simplify(subgroup);
+
+ *ret_subgroup = TAKE_PTR(subgroup);
+ }
+
+ if (ret_unit)
+ *ret_unit = TAKE_PTR(unit);
+
return 0;
}
@@ -1017,31 +1035,27 @@ int cg_path_get_unit_path(const char *path, char **ret) {
return 0;
}
-int cg_pid_get_unit(pid_t pid, char **ret_unit) {
- _cleanup_free_ char *cgroup = NULL;
+int cg_pid_get_unit_full(pid_t pid, char **ret_unit, char **ret_subgroup) {
int r;
- assert(ret_unit);
-
+ _cleanup_free_ char *cgroup = NULL;
r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
if (r < 0)
return r;
- return cg_path_get_unit(cgroup, ret_unit);
+ return cg_path_get_unit_full(cgroup, ret_unit, ret_subgroup);
}
-int cg_pidref_get_unit(const PidRef *pidref, char **ret) {
- _cleanup_free_ char *unit = NULL;
+int cg_pidref_get_unit_full(const PidRef *pidref, char **ret_unit, char **ret_subgroup) {
int r;
- assert(ret);
-
if (!pidref_is_set(pidref))
return -ESRCH;
if (pidref_is_remote(pidref))
return -EREMOTE;
- r = cg_pid_get_unit(pidref->pid, &unit);
+ _cleanup_free_ char *unit = NULL, *subgroup = NULL;
+ r = cg_pid_get_unit_full(pidref->pid, &unit, &subgroup);
if (r < 0)
return r;
@@ -1049,7 +1063,10 @@ int cg_pidref_get_unit(const PidRef *pidref, char **ret) {
if (r < 0)
return r;
- *ret = TAKE_PTR(unit);
+ if (ret_unit)
+ *ret_unit = TAKE_PTR(unit);
+ if (ret_subgroup)
+ *ret_subgroup = TAKE_PTR(subgroup);
return 0;
}
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index f49fc48832..81424418e0 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -203,7 +203,10 @@ int cg_get_root_path(char **path);
int cg_path_get_session(const char *path, char **ret_session);
int cg_path_get_owner_uid(const char *path, uid_t *ret_uid);
-int cg_path_get_unit(const char *path, char **ret_unit);
+int cg_path_get_unit_full(const char *path, char **ret_unit, char **ret_subgroup);
+static inline int cg_path_get_unit(const char *path, char **ret_unit) {
+ return cg_path_get_unit_full(path, ret_unit, NULL);
+}
int cg_path_get_unit_path(const char *path, char **ret_unit);
int cg_path_get_user_unit(const char *path, char **ret_unit);
int cg_path_get_machine_name(const char *path, char **ret_machine);
@@ -217,8 +220,14 @@ int cg_pid_get_session(pid_t pid, char **ret_session);
int cg_pidref_get_session(const PidRef *pidref, char **ret);
int cg_pid_get_owner_uid(pid_t pid, uid_t *ret_uid);
int cg_pidref_get_owner_uid(const PidRef *pidref, uid_t *ret);
-int cg_pid_get_unit(pid_t pid, char **ret_unit);
-int cg_pidref_get_unit(const PidRef *pidref, char **ret);
+int cg_pid_get_unit_full(pid_t pid, char **ret_unit, char **ret_subgroup);
+static inline int cg_pid_get_unit(pid_t pid, char **ret_unit) {
+ return cg_pid_get_unit_full(pid, ret_unit, NULL);
+}
+int cg_pidref_get_unit_full(const PidRef *pidref, char **ret_unit, char **ret_subgroup);
+static inline int cg_pidref_get_unit(const PidRef *pidref, char **ret_unit) {
+ return cg_pidref_get_unit_full(pidref, ret_unit, NULL);
+}
int cg_pid_get_user_unit(pid_t pid, char **ret_unit);
int cg_pid_get_machine_name(pid_t pid, char **ret_machine);
int cg_pid_get_slice(pid_t pid, char **ret_slice);
diff --git a/src/basic/pidref.c b/src/basic/pidref.c
index e4c51dd952..7f66496dd3 100644
--- a/src/basic/pidref.c
+++ b/src/basic/pidref.c
@@ -118,6 +118,31 @@ int pidref_set_pid(PidRef *pidref, pid_t pid) {
return 0;
}
+int pidref_set_pid_and_pidfd_id(
+ PidRef *pidref,
+ pid_t pid,
+ uint64_t pidfd_id) {
+
+ int r;
+
+ assert(pidref);
+
+ _cleanup_(pidref_done) PidRef n = PIDREF_NULL;
+ r = pidref_set_pid(&n, pid);
+ if (r < 0)
+ return r;
+
+ if (pidfd_id > 0) {
+ pidref_acquire_pidfd_id(&n);
+
+ if (n.fd_id != pidfd_id)
+ return -ESRCH;
+ }
+
+ *pidref = TAKE_PIDREF(n);
+ return 0;
+}
+
int pidref_set_pidstr(PidRef *pidref, const char *pid) {
pid_t nr;
int r;
diff --git a/src/basic/pidref.h b/src/basic/pidref.h
index 724871842b..4d8a084c4b 100644
--- a/src/basic/pidref.h
+++ b/src/basic/pidref.h
@@ -68,6 +68,7 @@ bool pidref_equal(PidRef *a, PidRef *b);
* PIDREF_MAKE_FROM_PID() above, which does not acquire a pidfd.) */
int pidref_set_pid(PidRef *pidref, pid_t pid);
int pidref_set_pidstr(PidRef *pidref, const char *pid);
+int pidref_set_pid_and_pidfd_id(PidRef *pidref, pid_t pid, uint64_t pidfd_id);
int pidref_set_pidfd(PidRef *pidref, int fd);
int pidref_set_pidfd_take(PidRef *pidref, int fd); /* takes ownership of the passed pidfd on success */
int pidref_set_pidfd_consume(PidRef *pidref, int fd); /* takes ownership of the passed pidfd in both success and failure */
diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c
index f4676106ac..f903a1371e 100644
--- a/src/machine/machine-dbus.c
+++ b/src/machine/machine-dbus.c
@@ -60,10 +60,12 @@ int bus_machine_method_unregister(sd_bus_message *message, void *userdata, sd_bu
NULL
};
- r = bus_verify_polkit_async(
+ r = bus_verify_polkit_async_full(
message,
"org.freedesktop.machine1.manage-machines",
details,
+ m->uid,
+ /* flags= */ 0,
&m->manager->polkit_registry,
error);
if (r < 0)
@@ -90,10 +92,12 @@ int bus_machine_method_terminate(sd_bus_message *message, void *userdata, sd_bus
NULL
};
- r = bus_verify_polkit_async(
+ r = bus_verify_polkit_async_full(
message,
"org.freedesktop.machine1.manage-machines",
details,
+ m->uid,
+ /* flags= */ 0,
&m->manager->polkit_registry,
error);
if (r < 0)
@@ -138,10 +142,12 @@ int bus_machine_method_kill(sd_bus_message *message, void *userdata, sd_bus_erro
NULL
};
- r = bus_verify_polkit_async(
+ r = bus_verify_polkit_async_full(
message,
"org.freedesktop.machine1.manage-machines",
details,
+ m->uid,
+ /* flags= */ 0,
&m->manager->polkit_registry,
error);
if (r < 0)
@@ -258,10 +264,12 @@ int bus_machine_method_open_pty(sd_bus_message *message, void *userdata, sd_bus_
NULL
};
- r = bus_verify_polkit_async(
+ r = bus_verify_polkit_async_full(
message,
m->class == MACHINE_HOST ? "org.freedesktop.machine1.host-open-pty" : "org.freedesktop.machine1.open-pty",
details,
+ m->uid,
+ /* flags= */ 0,
&m->manager->polkit_registry,
error);
if (r < 0)
@@ -299,10 +307,12 @@ int bus_machine_method_open_login(sd_bus_message *message, void *userdata, sd_bu
NULL
};
- r = bus_verify_polkit_async(
+ r = bus_verify_polkit_async_full(
message,
m->class == MACHINE_HOST ? "org.freedesktop.machine1.host-login" : "org.freedesktop.machine1.login",
details,
+ m->uid,
+ /* flags= */ 0,
&m->manager->polkit_registry,
error);
if (r < 0)
@@ -383,10 +393,12 @@ int bus_machine_method_open_shell(sd_bus_message *message, void *userdata, sd_bu
NULL
};
- r = bus_verify_polkit_async(
+ r = bus_verify_polkit_async_full(
message,
m->class == MACHINE_HOST ? "org.freedesktop.machine1.host-shell" : "org.freedesktop.machine1.shell",
details,
+ m->uid,
+ /* flags= */ 0,
&m->manager->polkit_registry,
error);
if (r < 0)
@@ -446,6 +458,7 @@ int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bu
NULL
};
+ /* NB: For now not opened up to owner of machine without auth */
r = bus_verify_polkit_async(
message,
"org.freedesktop.machine1.manage-machines",
@@ -531,6 +544,7 @@ int bus_machine_method_copy(sd_bus_message *message, void *userdata, sd_bus_erro
NULL
};
+ /* NB: For now not opened up to owner of machine without auth */
r = bus_verify_polkit_async(
message,
"org.freedesktop.machine1.manage-machines",
@@ -574,6 +588,7 @@ int bus_machine_method_open_root_directory(sd_bus_message *message, void *userda
NULL
};
+ /* NB: For now not opened up to owner of machine without auth */
r = bus_verify_polkit_async(
message,
"org.freedesktop.machine1.manage-machines",
@@ -717,8 +732,11 @@ static const sd_bus_vtable machine_vtable[] = {
SD_BUS_PROPERTY("Service", "s", NULL, offsetof(Machine, service), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Unit", "s", NULL, offsetof(Machine, unit), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Scope", "s", NULL, offsetof(Machine, unit), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("Subgroup", "s", NULL, offsetof(Machine, subgroup), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Leader", "u", bus_property_get_pid, offsetof(Machine, leader.pid), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("LeaderPIDFDId", "t", bus_property_get_pidfdid, offsetof(Machine, leader), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Supervisor", "u", bus_property_get_pid, offsetof(Machine, supervisor.pid), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SupervisorPIDFDId", "t", bus_property_get_pidfdid, offsetof(Machine, supervisor), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Class", "s", property_get_class, offsetof(Machine, class), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(Machine, root_directory), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NetworkInterfaces", "ai", property_get_netif, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@@ -726,6 +744,7 @@ static const sd_bus_vtable machine_vtable[] = {
SD_BUS_PROPERTY("SSHAddress", "s", NULL, offsetof(Machine, ssh_address), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SSHPrivateKeyPath", "s", NULL, offsetof(Machine, ssh_private_key_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("State", "s", property_get_state, 0, 0),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Machine, uid), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_METHOD("Terminate",
NULL,
diff --git a/src/machine/machine-varlink.c b/src/machine/machine-varlink.c
index 1c2013de51..776654b51d 100644
--- a/src/machine/machine-varlink.c
+++ b/src/machine/machine-varlink.c
@@ -46,8 +46,8 @@ static int machine_name(const char *name, sd_json_variant *variant, sd_json_disp
return 0;
}
-static int machine_leader(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
- PidRef *leader = ASSERT_PTR(userdata);
+static int machine_pidref(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
+ PidRef *pidref = ASSERT_PTR(userdata);
_cleanup_(pidref_done) PidRef temp = PIDREF_NULL;
int r;
@@ -56,14 +56,14 @@ static int machine_leader(const char *name, sd_json_variant *variant, sd_json_di
return r;
if (temp.pid == 1) /* refuse PID 1 */
- return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid leader PID.", strna(name));
+ return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid PID.", strna(name));
/* When both leader and leaderProcessId are specified, they must be consistent with each other. */
- if (pidref_is_set(leader) && !pidref_equal(leader, &temp))
- return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' conflicts with already dispatched leader PID.", strna(name));
+ if (pidref_is_set(pidref) && !pidref_equal(pidref, &temp))
+ return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' conflicts with already dispatched PID.", strna(name));
- pidref_done(leader);
- *leader = TAKE_PIDREF(temp);
+ pidref_done(pidref);
+ *pidref = TAKE_PIDREF(temp);
return 0;
}
@@ -128,18 +128,20 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink
int r;
static const sd_json_dispatch_field dispatch_table[] = {
- { "name", SD_JSON_VARIANT_STRING, machine_name, offsetof(Machine, name), SD_JSON_MANDATORY },
- { "id", SD_JSON_VARIANT_STRING, sd_json_dispatch_id128, offsetof(Machine, id), 0 },
- { "service", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Machine, service), 0 },
- { "class", SD_JSON_VARIANT_STRING, dispatch_machine_class, offsetof(Machine, class), SD_JSON_MANDATORY },
- { "leader", _SD_JSON_VARIANT_TYPE_INVALID, machine_leader, offsetof(Machine, leader), SD_JSON_STRICT },
- { "leaderProcessId", SD_JSON_VARIANT_OBJECT, machine_leader, offsetof(Machine, leader), SD_JSON_STRICT },
- { "rootDirectory", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Machine, root_directory), 0 },
- { "ifIndices", SD_JSON_VARIANT_ARRAY, machine_ifindices, 0, 0 },
- { "vSockCid", _SD_JSON_VARIANT_TYPE_INVALID, machine_cid, offsetof(Machine, vsock_cid), 0 },
- { "sshAddress", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Machine, ssh_address), SD_JSON_STRICT },
- { "sshPrivateKeyPath", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Machine, ssh_private_key_path), 0 },
- { "allocateUnit", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(Machine, allocate_unit), 0 },
+ { "name", SD_JSON_VARIANT_STRING, machine_name, offsetof(Machine, name), SD_JSON_MANDATORY },
+ { "id", SD_JSON_VARIANT_STRING, sd_json_dispatch_id128, offsetof(Machine, id), 0 },
+ { "service", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Machine, service), 0 },
+ { "class", SD_JSON_VARIANT_STRING, dispatch_machine_class, offsetof(Machine, class), SD_JSON_MANDATORY },
+ { "leader", _SD_JSON_VARIANT_TYPE_INVALID, machine_pidref, offsetof(Machine, leader), SD_JSON_STRICT },
+ { "leaderProcessId", SD_JSON_VARIANT_OBJECT, machine_pidref, offsetof(Machine, leader), SD_JSON_STRICT },
+ { "supervisor", _SD_JSON_VARIANT_TYPE_INVALID, machine_pidref, offsetof(Machine, supervisor), SD_JSON_STRICT },
+ { "supervisorProcessId", SD_JSON_VARIANT_OBJECT, machine_pidref, offsetof(Machine, supervisor), SD_JSON_STRICT },
+ { "rootDirectory", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Machine, root_directory), 0 },
+ { "ifIndices", SD_JSON_VARIANT_ARRAY, machine_ifindices, 0, 0 },
+ { "vSockCid", _SD_JSON_VARIANT_TYPE_INVALID, machine_cid, offsetof(Machine, vsock_cid), 0 },
+ { "sshAddress", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Machine, ssh_address), SD_JSON_STRICT },
+ { "sshPrivateKeyPath", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Machine, ssh_private_key_path), 0 },
+ { "allocateUnit", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(Machine, allocate_unit), 0 },
VARLINK_DISPATCH_POLKIT_FIELD,
{}
};
@@ -155,7 +157,7 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink
r = varlink_verify_polkit_async(
link,
manager->bus,
- "org.freedesktop.machine1.create-machine",
+ machine->allocate_unit ? "org.freedesktop.machine1.create-machine" : "org.freedesktop.machine1.register-machine",
(const char**) STRV_MAKE("name", machine->name,
"class", machine_class_to_string(machine->class)),
&manager->polkit_registry);
@@ -168,6 +170,22 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink
return r;
}
+ if (!pidref_is_set(&machine->supervisor)) {
+ _cleanup_(pidref_done) PidRef client_pidref = PIDREF_NULL;
+
+ r = varlink_get_peer_pidref(link, &client_pidref);
+ if (r < 0)
+ return r;
+
+ /* If the client process is not the leader, then make it the supervisor */
+ if (!pidref_equal(&client_pidref, &machine->leader))
+ machine->supervisor = TAKE_PIDREF(client_pidref);
+ }
+
+ r = sd_varlink_get_peer_uid(link, &machine->uid);
+ if (r < 0)
+ return r;
+
r = machine_link(manager, machine);
if (r == -EEXIST)
return sd_varlink_error(link, VARLINK_ERROR_MACHINE_EXISTS, NULL);
@@ -175,7 +193,7 @@ int vl_method_register(sd_varlink *link, sd_json_variant *parameters, sd_varlink
return r;
if (!machine->allocate_unit) {
- r = cg_pidref_get_unit(&machine->leader, &machine->unit);
+ r = cg_pidref_get_unit_full(&machine->leader, &machine->unit, &machine->subgroup);
if (r < 0)
return r;
}
@@ -278,12 +296,14 @@ int vl_method_unregister_internal(sd_varlink *link, sd_json_variant *parameters,
Manager *manager = ASSERT_PTR(machine->manager);
int r;
- r = varlink_verify_polkit_async(
+ r = varlink_verify_polkit_async_full(
link,
manager->bus,
"org.freedesktop.machine1.manage-machines",
(const char**) STRV_MAKE("name", machine->name,
"verb", "unregister"),
+ machine->uid,
+ /* flags= */ 0,
&manager->polkit_registry);
if (r <= 0)
return r;
@@ -300,12 +320,14 @@ int vl_method_terminate_internal(sd_varlink *link, sd_json_variant *parameters,
Manager *manager = ASSERT_PTR(machine->manager);
int r;
- r = varlink_verify_polkit_async(
+ r = varlink_verify_polkit_async_full(
link,
manager->bus,
"org.freedesktop.machine1.manage-machines",
(const char**) STRV_MAKE("name", machine->name,
"verb", "terminate"),
+ machine->uid,
+ /* flags= */ 0,
&manager->polkit_registry);
if (r <= 0)
return r;
@@ -368,12 +390,14 @@ int vl_method_kill(sd_varlink *link, sd_json_variant *parameters, sd_varlink_met
return sd_varlink_error_invalid_parameter_name(link, "whom");
}
- r = varlink_verify_polkit_async(
+ r = varlink_verify_polkit_async_full(
link,
manager->bus,
"org.freedesktop.machine1.manage-machines",
(const char**) STRV_MAKE("name", machine->name,
"verb", "kill"),
+ machine->uid,
+ /* flags= */ 0,
&manager->polkit_registry);
if (r <= 0)
return r;
@@ -510,11 +534,13 @@ int vl_method_open(sd_varlink *link, sd_json_variant *parameters, sd_varlink_met
return r;
polkit_details = machine_open_polkit_details(p.mode, machine->name, user, path, command_line);
- r = varlink_verify_polkit_async(
+ r = varlink_verify_polkit_async_full(
link,
manager->bus,
machine_open_polkit_action(p.mode, machine->class),
(const char**) polkit_details,
+ machine->uid,
+ /* flags= */ 0,
&manager->polkit_registry);
if (r <= 0)
return r;
@@ -788,6 +814,7 @@ int vl_method_bind_mount(sd_varlink *link, sd_json_variant *parameters, sd_varli
if (machine->class != MACHINE_CONTAINER)
return sd_varlink_error(link, VARLINK_ERROR_MACHINE_NOT_SUPPORTED, NULL);
+ /* NB: For now not opened up to owner of machine without auth */
r = varlink_verify_polkit_async(
link,
manager->bus,
@@ -899,6 +926,7 @@ int vl_method_copy_internal(sd_varlink *link, sd_json_variant *parameters, sd_va
if (machine->class != MACHINE_CONTAINER)
return sd_varlink_error(link, VARLINK_ERROR_MACHINE_NOT_SUPPORTED, NULL);
+ /* NB: For now not opened up to owner of machine without auth */
r = varlink_verify_polkit_async(
link,
manager->bus,
@@ -928,6 +956,7 @@ int vl_method_open_root_directory_internal(sd_varlink *link, sd_json_variant *pa
Manager *manager = ASSERT_PTR(machine->manager);
int r;
+ /* NB: For now not opened up to owner of machine without auth */
r = varlink_verify_polkit_async(
link,
manager->bus,
diff --git a/src/machine/machine.c b/src/machine/machine.c
index 3975bc1754..e127ddcb0e 100644
--- a/src/machine/machine.c
+++ b/src/machine/machine.c
@@ -33,6 +33,7 @@
#include "path-util.h"
#include "process-util.h"
#include "serialize.h"
+#include "signal-util.h"
#include "socket-util.h"
#include "special.h"
#include "stdio-util.h"
@@ -62,6 +63,7 @@ int machine_new(MachineClass class, const char *name, Machine **ret) {
*m = (Machine) {
.class = class,
.leader = PIDREF_NULL,
+ .supervisor = PIDREF_NULL,
.vsock_cid = VMADDR_CID_ANY,
};
@@ -131,16 +133,28 @@ Machine* machine_free(Machine *m) {
pidref_done(&m->leader);
}
+ m->supervisor_pidfd_event_source = sd_event_source_disable_unref(m->supervisor_pidfd_event_source);
+ pidref_done(&m->supervisor);
+
sd_bus_message_unref(m->create_message);
+ m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
+
free(m->name);
- free(m->scope_job);
+
free(m->state_file);
free(m->service);
free(m->root_directory);
+
+ free(m->unit);
+ free(m->subgroup);
+ free(m->scope_job);
+ free(m->cgroup);
+
free(m->netif);
free(m->ssh_address);
free(m->ssh_private_key_path);
+
return mfree(m);
}
@@ -156,7 +170,7 @@ int machine_save(Machine *m) {
return 0;
_cleanup_(unlink_and_freep) char *sl = NULL; /* auto-unlink! */
- if (m->unit) {
+ if (m->unit && !m->subgroup) {
sl = strjoin("/run/systemd/machines/unit:", m->unit);
if (!sl)
return log_oom();
@@ -177,8 +191,10 @@ int machine_save(Machine *m) {
fprintf(f,
"# This is private data. Do not parse.\n"
- "NAME=%s\n",
- m->name);
+ "NAME=%s\n"
+ "UID=" UID_FMT "\n",
+ m->name,
+ m->uid);
/* We continue to call this "SCOPE=" because it is internal only, and we want to stay compatible with old files */
env_file_fputs_assignment(f, "SCOPE=", m->unit);
@@ -197,6 +213,13 @@ int machine_save(Machine *m) {
fprintf(f, "LEADER_PIDFDID=%" PRIu64 "\n", m->leader.fd_id);
}
+ if (pidref_is_set(&m->supervisor)) {
+ fprintf(f, "SUPERVISOR=" PID_FMT "\n", m->supervisor.pid);
+ (void) pidref_acquire_pidfd_id(&m->supervisor);
+ if (m->supervisor.fd_id != 0)
+ fprintf(f, "SUPERVISOR_PIDFDID=%" PRIu64 "\n", m->supervisor.fd_id);
+ }
+
if (m->class != _MACHINE_CLASS_INVALID)
fprintf(f, "CLASS=%s\n", machine_class_to_string(m->class));
@@ -244,7 +267,7 @@ int machine_save(Machine *m) {
static void machine_unlink(Machine *m) {
assert(m);
- if (m->unit) {
+ if (m->unit && !m->subgroup) {
const char *sl = strjoina("/run/systemd/machines/unit:", m->unit);
(void) unlink(sl);
}
@@ -253,9 +276,42 @@ static void machine_unlink(Machine *m) {
(void) unlink(m->state_file);
}
+static void parse_pid_and_pidfdid(
+ PidRef *pidref,
+ const char *pid,
+ const char *pidfdid,
+ const char *name) {
+
+ int r;
+
+ assert(pidref);
+ assert(name);
+
+ pidref_done(pidref);
+
+ if (!pid)
+ return;
+ r = pidref_set_pidstr(pidref, pid);
+ if (r < 0)
+ return (void) log_debug_errno(r, "Failed to set %s PID to '%s', ignoring: %m", name, pid);
+
+ if (!pidfdid)
+ return;
+ uint64_t fd_id;
+ r = safe_atou64(pidfdid, &fd_id);
+ if (r < 0)
+ return (void) log_warning_errno(r, "Failed to parse %s pidfd ID, ignoring: %s", name, pidfdid);
+ (void) pidref_acquire_pidfd_id(pidref);
+ if (fd_id != pidref->fd_id) {
+ log_debug("PID of %s got recycled, ignoring.", name);
+ pidref_done(pidref);
+ }
+}
+
int machine_load(Machine *m) {
- _cleanup_free_ char *name = NULL, *realtime = NULL, *monotonic = NULL, *id = NULL, *leader = NULL, *leader_pidfdid = NULL,
- *class = NULL, *netif = NULL, *vsock_cid = NULL;
+ _cleanup_free_ char *name = NULL, *realtime = NULL, *monotonic = NULL, *id = NULL,
+ *leader = NULL, *leader_pidfdid = NULL, *supervisor = NULL, *supervisor_pidfdid = NULL,
+ *class = NULL, *netif = NULL, *vsock_cid = NULL, *uid = NULL;
int r;
assert(m);
@@ -266,19 +322,23 @@ int machine_load(Machine *m) {
r = parse_env_file(NULL, m->state_file,
"NAME", &name,
"SCOPE", &m->unit,
+ "SUBGROUP", &m->subgroup,
"SCOPE_JOB", &m->scope_job,
"SERVICE", &m->service,
"ROOT", &m->root_directory,
"ID", &id,
"LEADER", &leader,
"LEADER_PIDFDID", &leader_pidfdid,
+ "SUPERVISOR", &supervisor,
+ "SUPERVISOR_PIDFDID", &supervisor_pidfdid,
"CLASS", &class,
"REALTIME", &realtime,
"MONOTONIC", &monotonic,
"NETIF", &netif,
"VSOCK_CID", &vsock_cid,
"SSH_ADDRESS", &m->ssh_address,
- "SSH_PRIVATE_KEY_PATH", &m->ssh_private_key_path);
+ "SSH_PRIVATE_KEY_PATH", &m->ssh_private_key_path,
+ "UID", &uid);
if (r == -ENOENT)
return 0;
if (r < 0)
@@ -290,26 +350,8 @@ int machine_load(Machine *m) {
if (id)
(void) sd_id128_from_string(id, &m->id);
- pidref_done(&m->leader);
- if (leader) {
- r = pidref_set_pidstr(&m->leader, leader);
- if (r < 0)
- log_debug_errno(r, "Failed to set leader PID to '%s', ignoring: %m", leader);
- else if (leader_pidfdid) {
- uint64_t fd_id;
- r = safe_atou64(leader_pidfdid, &fd_id);
- if (r < 0)
- log_warning_errno(r, "Failed to parse leader pidfd ID, ignoring: %s", leader_pidfdid);
- else {
- (void) pidref_acquire_pidfd_id(&m->leader);
-
- if (fd_id != m->leader.fd_id) {
- log_debug("Leader PID got recycled, ignoring.");
- pidref_done(&m->leader);
- }
- }
- }
- }
+ parse_pid_and_pidfdid(&m->leader, leader, leader_pidfdid, "leader");
+ parse_pid_and_pidfdid(&m->supervisor, supervisor, supervisor_pidfdid, "supervisor");
if (class) {
MachineClass c = machine_class_from_string(class);
@@ -362,6 +404,10 @@ int machine_load(Machine *m) {
log_warning_errno(r, "Failed to parse AF_VSOCK CID, ignoring: %s", vsock_cid);
}
+ r = parse_uid(uid, &m->uid);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse owning UID, ignoring: %s", uid);
+
return r;
}
@@ -380,6 +426,7 @@ static int machine_start_scope(
assert(machine);
assert(pidref_is_set(&machine->leader));
assert(!machine->unit);
+ assert(!machine->subgroup);
escaped = unit_name_escape(machine->name);
if (!escaped)
@@ -418,14 +465,28 @@ static int machine_start_scope(
if (r < 0)
return r;
- r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)",
- "Delegate", "b", 1,
- "CollectMode", "s", "inactive-or-failed",
- "AddRef", "b", 1,
- "TasksMax", "t", UINT64_C(16384));
+ r = sd_bus_message_append(
+ m, "(sv)(sv)(sv)(sv)",
+ "Delegate", "b", 1,
+ "CollectMode", "s", "inactive-or-failed",
+ "AddRef", "b", 1,
+ "TasksMax", "t", UINT64_C(16384));
if (r < 0)
return r;
+ if (machine->uid != 0) {
+ _cleanup_free_ char *u = NULL;
+
+ if (asprintf(&u, UID_FMT, machine->uid) < 0)
+ return -ENOMEM;
+
+ r = sd_bus_message_append(
+ m, "(sv)",
+ "User", "s", u);
+ if (r < 0)
+ return r;
+ }
+
if (more_properties) {
r = sd_bus_message_copy(m, more_properties, true);
if (r < 0)
@@ -476,9 +537,11 @@ static int machine_ensure_scope(Machine *m, sd_bus_message *properties, sd_bus_e
assert(m->unit);
- r = hashmap_ensure_put(&m->manager->machines_by_unit, &string_hash_ops, m->unit, m);
- if (r < 0)
- return r;
+ if (!m->subgroup) {
+ r = hashmap_ensure_put(&m->manager->machines_by_unit, &string_hash_ops, m->unit, m);
+ if (r < 0)
+ return r;
+ }
return 0;
}
@@ -492,25 +555,72 @@ static int machine_dispatch_leader_pidfd(sd_event_source *s, int fd, unsigned re
return 0;
}
-static int machine_watch_pidfd(Machine *m) {
+static int machine_dispatch_supervisor_pidfd(sd_event_source *s, int fd, unsigned revents, void *userdata) {
+ Machine *m = ASSERT_PTR(userdata);
+
+ m->supervisor_pidfd_event_source = sd_event_source_disable_unref(m->supervisor_pidfd_event_source);
+ machine_add_to_gc_queue(m);
+
+ return 0;
+}
+
+static int machine_watch_pidfd(Machine *m, PidRef *pidref, sd_event_source **source, sd_event_io_handler_t cb) {
int r;
assert(m);
assert(m->manager);
- assert(pidref_is_set(&m->leader));
- assert(!m->leader_pidfd_event_source);
+ assert(source);
+ assert(!*source);
+ assert(cb);
- if (m->leader.fd < 0)
+ if (!pidref_is_set(pidref) || pidref->fd < 0)
return 0;
- /* If we have a pidfd for the leader, let's also track it for POLLIN, and GC the machine
+ /* If we have a pidfd for the leader or supervisor, let's also track it for POLLIN, and GC the machine
* automatically if it dies */
- r = sd_event_add_io(m->manager->event, &m->leader_pidfd_event_source, m->leader.fd, EPOLLIN, machine_dispatch_leader_pidfd, m);
+ r = sd_event_add_io(m->manager->event, source, pidref->fd, EPOLLIN, cb, m);
if (r < 0)
return r;
- (void) sd_event_source_set_description(m->leader_pidfd_event_source, "machine-pidfd");
+ (void) sd_event_source_set_description(*source, "machine-pidfd");
+
+ return 0;
+}
+
+static int machine_dispatch_cgroup_empty(sd_event_source *s, const struct inotify_event *event, void *userdata) {
+ Machine *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(m->cgroup);
+
+ r = cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, m->cgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if cgroup '%s' is empty: %m", m->cgroup);
+
+ if (r > 0)
+ machine_add_to_gc_queue(m);
+
+ return 0;
+}
+
+static int machine_watch_cgroup(Machine *m) {
+ int r;
+
+ assert(m);
+ assert(!m->cgroup_empty_event_source);
+
+ if (!m->cgroup)
+ return 0;
+
+ _cleanup_free_ char *p = NULL;
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup, "cgroup.events", &p);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get cgroup path for cgroup '%s': %m", m->cgroup);
+
+ r = sd_event_add_inotify(m->manager->event, &m->cgroup_empty_event_source, p, IN_MODIFY, machine_dispatch_cgroup_empty, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to watch %s events: %m", p);
return 0;
}
@@ -530,7 +640,15 @@ int machine_start(Machine *m, sd_bus_message *properties, sd_bus_error *error) {
if (r < 0)
return r;
- r = machine_watch_pidfd(m);
+ r = machine_watch_pidfd(m, &m->leader, &m->leader_pidfd_event_source, machine_dispatch_leader_pidfd);
+ if (r < 0)
+ return r;
+
+ r = machine_watch_pidfd(m, &m->supervisor, &m->supervisor_pidfd_event_source, machine_dispatch_supervisor_pidfd);
+ if (r < 0)
+ return r;
+
+ r = machine_watch_cgroup(m);
if (r < 0)
return r;
@@ -563,10 +681,13 @@ int machine_stop(Machine *m) {
assert(m);
+ log_debug("Stopping machine '%s'.", m->name);
+
if (!IN_SET(m->class, MACHINE_CONTAINER, MACHINE_VM))
return -EOPNOTSUPP;
- if (m->unit) {
+ if (m->unit && !m->subgroup) {
+ /* If the machine runs as its own unit, then we'll terminate that */
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
char *job = NULL;
@@ -575,7 +696,14 @@ int machine_stop(Machine *m) {
return log_error_errno(r, "Failed to stop machine unit: %s", bus_error_message(&error, r));
free_and_replace(m->scope_job, job);
- }
+
+ } else if (pidref_is_set(&m->supervisor)) {
+ /* Otherwise, send a friendly SIGTERM to the supervisor */
+ r = pidref_kill(&m->supervisor, SIGTERM);
+ if (r < 0)
+ return log_error_errno(r, "Failed to kill supervisor process " PID_FMT " of machine '%s': %m", m->supervisor.pid, m->name);
+ } else
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Don't know how to terminate machine '%s'.", m->name);
m->stopping = true;
@@ -637,7 +765,7 @@ bool machine_may_gc(Machine *m, bool drop_not_started) {
return false;
}
- if (m->unit) {
+ if (m->unit && !m->subgroup) {
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
r = manager_unit_is_active(m->manager, m->unit, &error);
@@ -647,6 +775,14 @@ bool machine_may_gc(Machine *m, bool drop_not_started) {
return false;
}
+ if (m->cgroup) {
+ r = cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, m->cgroup);
+ if (IN_SET(r, 0, -ENOENT))
+ return true;
+ if (r < 0)
+ log_debug_errno(r, "Failed to determine if cgroup '%s' is empty, ignoring: %m", m->cgroup);
+ }
+
return true;
}
@@ -680,17 +816,28 @@ MachineState machine_get_state(Machine *s) {
int machine_kill(Machine *m, KillWhom whom, int signo) {
assert(m);
+ log_debug("Killing machine '%s' (%s) with signal %s.", m->name, kill_whom_to_string(whom), signal_to_string(signo));
+
if (!IN_SET(m->class, MACHINE_VM, MACHINE_CONTAINER))
return -EOPNOTSUPP;
- if (!m->unit)
- return -ESRCH;
+ switch (whom) {
- if (whom == KILL_LEADER) /* If we shall simply kill the leader, do so directly */
+ case KILL_LEADER:
return pidref_kill(&m->leader, signo);
- /* Otherwise, make PID 1 do it for us, for the entire cgroup */
- return manager_kill_unit(m->manager, m->unit, signo, NULL);
+ case KILL_SUPERVISOR:
+ return pidref_kill(&m->supervisor, signo);
+
+ case KILL_ALL:
+ if (!m->unit)
+ return -ESRCH;
+
+ return manager_kill_unit(m->manager, m->unit, m->subgroup, signo, /* error= */ NULL);
+
+ default:
+ assert_not_reached();
+ }
}
int machine_openpt(Machine *m, int flags, char **ret_peer) {
@@ -1124,8 +1271,13 @@ void machine_release_unit(Machine *m) {
m->referenced = false;
}
- (void) hashmap_remove_value(m->manager->machines_by_unit, m->unit, m);
+ if (!m->subgroup)
+ (void) hashmap_remove_value(m->manager->machines_by_unit, m->unit, m);
+
m->unit = mfree(m->unit);
+
+ /* Also free the subgroup, because it only makes sense in the context of the unit */
+ m->subgroup = mfree(m->subgroup);
}
int machine_get_uid_shift(Machine *m, uid_t *ret) {
@@ -1468,8 +1620,9 @@ static const char* const machine_state_table[_MACHINE_STATE_MAX] = {
DEFINE_STRING_TABLE_LOOKUP(machine_state, MachineState);
static const char* const kill_whom_table[_KILL_WHOM_MAX] = {
- [KILL_LEADER] = "leader",
- [KILL_ALL] = "all"
+ [KILL_LEADER] = "leader",
+ [KILL_SUPERVISOR] = "supervisor",
+ [KILL_ALL] = "all",
};
DEFINE_STRING_TABLE_LOOKUP(kill_whom, KillWhom);
diff --git a/src/machine/machine.h b/src/machine/machine.h
index 870146fc22..8f1a044f7a 100644
--- a/src/machine/machine.h
+++ b/src/machine/machine.h
@@ -27,6 +27,7 @@ typedef enum MachineClass {
typedef enum KillWhom {
KILL_LEADER,
+ KILL_SUPERVISOR,
KILL_ALL,
_KILL_WHOM_MAX,
_KILL_WHOM_INVALID = -EINVAL,
@@ -38,6 +39,8 @@ typedef struct Machine {
char *name;
sd_id128_t id;
+ uid_t uid;
+
MachineClass class;
char *state_file;
@@ -45,13 +48,25 @@ typedef struct Machine {
char *root_directory;
char *unit;
+ char *subgroup;
char *scope_job;
+ char *cgroup;
- PidRef leader;
- sd_event_source *leader_pidfd_event_source;
+ /* Leader: the top-level process that encapsulates the machine itself. For containers that's PID 1,
+ * for VMs that's qemu or whatever process wraps the actual VM code. This process defines the runtime
+ * lifecycle of the machine. In case of containers we can use this reference to enter namespaces,
+ * send signals and so on.
+ *
+ * Supervisor: the process that supervises the machine, if there is any and if that process is
+ * responsible for a single machine. Sending SIGTERM to this process should (non-cooperatively)
+ * terminate the machine. */
+ PidRef leader, supervisor;
+ sd_event_source *leader_pidfd_event_source, *supervisor_pidfd_event_source;
dual_timestamp timestamp;
+ sd_event_source *cgroup_empty_event_source;
+
bool in_gc_queue:1;
bool started:1;
bool stopping:1;
diff --git a/src/machine/machinectl.c b/src/machine/machinectl.c
index 116d5ef74d..9eddc53990 100644
--- a/src/machine/machinectl.c
+++ b/src/machine/machinectl.c
@@ -28,9 +28,9 @@
#include "cgroup-util.h"
#include "edit-util.h"
#include "env-util.h"
-#include "format-util.h"
#include "format-ifname.h"
#include "format-table.h"
+#include "format-util.h"
#include "hostname-util.h"
#include "import-util.h"
#include "in-addr-util.h"
@@ -46,6 +46,7 @@
#include "parse-argument.h"
#include "parse-util.h"
#include "path-util.h"
+#include "pidref.h"
#include "polkit-agent.h"
#include "pretty-print.h"
#include "process-util.h"
@@ -412,9 +413,15 @@ static int list_images(int argc, char *argv[], void *userdata) {
return show_table(table, "images");
}
-static int show_unit_cgroup(sd_bus *bus, const char *unit, pid_t leader) {
+static int show_unit_cgroup(
+ sd_bus *bus,
+ const char *unit,
+ const char *subgroup,
+ pid_t leader) {
+
_cleanup_free_ char *cgroup = NULL;
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ OutputFlags extra_flags = 0;
int r;
assert(bus);
@@ -427,8 +434,16 @@ static int show_unit_cgroup(sd_bus *bus, const char *unit, pid_t leader) {
if (isempty(cgroup))
return 0;
+ if (!empty_or_root(subgroup)) {
+ if (!path_extend(&cgroup, subgroup))
+ return log_oom();
+
+ /* If we have a subcgroup, then hide all processes outside of it */
+ extra_flags |= OUTPUT_HIDE_EXTRA;
+ }
+
unsigned c = MAX(LESS_BY(columns(), 18U), 10U);
- r = unit_show_processes(bus, unit, cgroup, "\t\t ", c, get_output_flags(), &error);
+ r = unit_show_processes(bus, unit, cgroup, "\t\t ", c, get_output_flags() | extra_flags, &error);
if (r == -EBADR) {
if (arg_transport == BUS_TRANSPORT_REMOTE)
@@ -494,18 +509,50 @@ typedef struct MachineStatusInfo {
const char *class;
const char *service;
const char *unit;
+ const char *subgroup;
const char *root_directory;
pid_t leader;
+ uint64_t leader_pidfdid;
+ pid_t supervisor;
+ uint64_t supervisor_pidfdid;
struct dual_timestamp timestamp;
int *netif;
size_t n_netif;
+ uid_t uid;
} MachineStatusInfo;
-static void machine_status_info_clear(MachineStatusInfo *info) {
- if (info) {
- free(info->netif);
- zero(*info);
+static void machine_status_info_done(MachineStatusInfo *info) {
+ if (!info)
+ return;
+
+ free(info->netif);
+ zero(*info);
+}
+
+static void print_process_info(const char *field, pid_t pid, uint64_t pidfdid) {
+ int r;
+
+ assert(field);
+
+ if (pid <= 0)
+ return;
+
+ printf("%s: " PID_FMT, field, pid);
+
+ _cleanup_(pidref_done) PidRef pr = PIDREF_NULL;
+ r = pidref_set_pid_and_pidfd_id(&pr, pid, pidfdid);
+ if (r < 0)
+ log_debug_errno(r, "Failed to acquire reference to process, ignoring: %m");
+ else {
+ _cleanup_free_ char *t = NULL;
+ r = pidref_get_comm(&pr, &t);
+ if (r < 0)
+ log_debug_errno(r, "Failed to acquire name of process, ignoring: %m");
+ else
+ printf(" (%s)", t);
}
+
+ putchar('\n');
}
static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) {
@@ -530,17 +577,8 @@ static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) {
else if (!isempty(s2))
printf("\t Since: %s\n", s2);
- if (i->leader > 0) {
- _cleanup_free_ char *t = NULL;
-
- printf("\t Leader: %u", (unsigned) i->leader);
-
- (void) pid_get_comm(i->leader, &t);
- if (t)
- printf(" (%s)", t);
-
- putchar('\n');
- }
+ print_process_info("\t Leader", i->leader, i->leader_pidfdid);
+ print_process_info("\t Superv.", i->supervisor, i->supervisor_pidfdid);
if (i->service) {
printf("\t Service: %s", i->service);
@@ -552,6 +590,9 @@ static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) {
} else if (i->class)
printf("\t Class: %s\n", i->class);
+ if (i->uid != 0)
+ printf("\t UID: " UID_FMT "\n", i->uid);
+
if (i->root_directory)
printf("\t Root: %s\n", i->root_directory);
@@ -589,7 +630,11 @@ static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) {
if (i->unit) {
printf("\t Unit: %s\n", i->unit);
- show_unit_cgroup(bus, i->unit, i->leader);
+
+ if (!empty_or_root(i->subgroup))
+ printf("\tSubgroup: %s\n", i->subgroup);
+
+ show_unit_cgroup(bus, i->unit, i->subgroup, i->leader);
if (arg_transport == BUS_TRANSPORT_LOCAL)
@@ -636,18 +681,23 @@ static int show_machine_info(const char *verb, sd_bus *bus, const char *path, bo
{ "Class", "s", NULL, offsetof(MachineStatusInfo, class) },
{ "Service", "s", NULL, offsetof(MachineStatusInfo, service) },
{ "Unit", "s", NULL, offsetof(MachineStatusInfo, unit) },
+ { "Subgroup", "s", NULL, offsetof(MachineStatusInfo, subgroup) },
{ "RootDirectory", "s", NULL, offsetof(MachineStatusInfo, root_directory) },
{ "Leader", "u", NULL, offsetof(MachineStatusInfo, leader) },
+ { "LeaderPIDFDId", "t", NULL, offsetof(MachineStatusInfo, leader_pidfdid) },
+ { "Supervisor", "u", NULL, offsetof(MachineStatusInfo, supervisor) },
+ { "SupervisorPIDFDId", "t", NULL, offsetof(MachineStatusInfo, supervisor_pidfdid) },
{ "Timestamp", "t", NULL, offsetof(MachineStatusInfo, timestamp.realtime) },
{ "TimestampMonotonic", "t", NULL, offsetof(MachineStatusInfo, timestamp.monotonic) },
{ "Id", "ay", bus_map_id128, offsetof(MachineStatusInfo, id) },
{ "NetworkInterfaces", "ai", map_netif, 0 },
+ { "UID", "u", NULL, offsetof(MachineStatusInfo, uid) },
{}
};
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
_cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
- _cleanup_(machine_status_info_clear) MachineStatusInfo info = {};
+ _cleanup_(machine_status_info_done) MachineStatusInfo info = {};
int r;
assert(verb);
diff --git a/src/machine/machined-core.c b/src/machine/machined-core.c
index 93f1cc5ee8..61078d51ec 100644
--- a/src/machine/machined-core.c
+++ b/src/machine/machined-core.c
@@ -22,28 +22,41 @@
#include "user-util.h"
int manager_get_machine_by_pidref(Manager *m, const PidRef *pidref, Machine **ret) {
- Machine *mm;
- int r;
+ _cleanup_(pidref_done) PidRef current = PIDREF_NULL;
+ Machine *mm = NULL;
assert(m);
assert(pidref_is_set(pidref));
assert(ret);
- mm = hashmap_get(m->machines_by_leader, pidref);
- if (!mm) {
- _cleanup_free_ char *unit = NULL;
+ for (;;) {
+ /* First, compare by leader */
+ mm = hashmap_get(m->machines_by_leader, pidref);
+ if (mm)
+ break;
- r = cg_pidref_get_unit(pidref, &unit);
- if (r >= 0)
+ /* Then look for the unit */
+ _cleanup_free_ char *unit = NULL;
+ if (cg_pidref_get_unit(pidref, &unit) >= 0) {
mm = hashmap_get(m->machines_by_unit, unit);
- }
- if (!mm) {
- *ret = NULL;
- return 0;
+ if (mm)
+ break;
+ }
+
+ /* Maybe this process is in per-user unit? If so, let's go up the process tree, and check
+ * that, we should eventually hit PID 1 of the container tree, which we should be able to
+ * recognize. */
+ _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
+ if (pidref_get_ppid_as_pidref(pidref, &parent) < 0)
+ break;
+
+ pidref_done(¤t);
+ current = TAKE_PIDREF(parent);
+ pidref = ¤t;
}
*ret = mm;
- return 1;
+ return !!mm;
}
int manager_add_machine(Manager *m, const char *name, Machine **ret) {
@@ -149,13 +162,16 @@ void manager_gc(Manager *m, bool drop_not_started) {
/* First, if we are not closing yet, initiate stopping */
if (machine_may_gc(machine, drop_not_started) &&
- machine_get_state(machine) != MACHINE_CLOSING)
+ machine_get_state(machine) != MACHINE_CLOSING) {
+ log_debug("Stopping machine '%s' due to GC.", machine->name);
machine_stop(machine);
+ }
/* Now, the stop probably made this referenced
* again, but if it didn't, then it's time to let it
* go entirely. */
if (machine_may_gc(machine, drop_not_started)) {
+ log_debug("Finalizing machine '%s' due to GC.", machine->name);
machine_finalize(machine);
machine_free(machine);
}
diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c
index 6a31e6aa3b..883c8c1338 100644
--- a/src/machine/machined-dbus.c
+++ b/src/machine/machined-dbus.c
@@ -228,11 +228,12 @@ static int method_list_machines(sd_bus_message *message, void *userdata, sd_bus_
static int method_create_or_register_machine(
Manager *manager,
sd_bus_message *message,
+ const char *polkit_action,
bool read_network,
Machine **ret,
sd_bus_error *error) {
- _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ _cleanup_(pidref_done) PidRef leader_pidref = PIDREF_NULL, supervisor_pidref = PIDREF_NULL;
const char *name, *service, *class, *root_directory;
const int32_t *netif = NULL;
MachineClass c;
@@ -288,18 +289,38 @@ static int method_create_or_register_machine(
return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Root directory must be empty or an absolute path");
if (leader == 0) {
- r = bus_query_sender_pidref(message, &pidref);
+ /* If no PID is specified, the client is the leader */
+ r = bus_query_sender_pidref(message, &leader_pidref);
if (r < 0)
return sd_bus_error_set_errnof(error, r, "Failed to pin client process: %m");
} else {
- r = pidref_set_pid(&pidref, leader);
+ /* If a PID is specified that's the leader, but if the client process is different from it, than that's the supervisor */
+ r = pidref_set_pid(&leader_pidref, leader);
if (r < 0)
return sd_bus_error_set_errnof(error, r, "Failed to pin process " PID_FMT ": %m", (pid_t) leader);
+
+ _cleanup_(pidref_done) PidRef client_pidref = PIDREF_NULL;
+ r = bus_query_sender_pidref(message, &client_pidref);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to pin client process: %m");
+
+ if (!pidref_equal(&client_pidref, &leader_pidref))
+ supervisor_pidref = TAKE_PIDREF(client_pidref);
}
if (hashmap_get(manager->machines, name))
return sd_bus_error_setf(error, BUS_ERROR_MACHINE_EXISTS, "Machine '%s' already exists", name);
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds);
+ if (r < 0)
+ return r;
+
+ uid_t uid;
+ r = sd_bus_creds_get_euid(creds, &uid);
+ if (r < 0)
+ return r;
+
const char *details[] = {
"name", name,
"class", machine_class_to_string(c),
@@ -308,7 +329,7 @@ static int method_create_or_register_machine(
r = bus_verify_polkit_async(
message,
- "org.freedesktop.machine1.create-machine",
+ polkit_action,
details,
&manager->polkit_registry,
error);
@@ -321,9 +342,11 @@ static int method_create_or_register_machine(
if (r < 0)
return r;
- m->leader = TAKE_PIDREF(pidref);
+ m->leader = TAKE_PIDREF(leader_pidref);
+ m->supervisor = TAKE_PIDREF(supervisor_pidref);
m->class = c;
m->id = id;
+ m->uid = uid;
if (!isempty(service)) {
m->service = strdup(service);
@@ -367,7 +390,7 @@ static int method_create_machine_internal(sd_bus_message *message, bool read_net
assert(message);
- r = method_create_or_register_machine(manager, message, read_network, &m, error);
+ r = method_create_or_register_machine(manager, message, "org.freedesktop.machine1.create-machine", read_network, &m, error);
if (r < 0)
return r;
if (r == 0)
@@ -405,13 +428,13 @@ static int method_register_machine_internal(sd_bus_message *message, bool read_n
assert(message);
- r = method_create_or_register_machine(manager, message, read_network, &m, error);
+ r = method_create_or_register_machine(manager, message, "org.freedesktop.machine1.register-machine", read_network, &m, error);
if (r < 0)
return r;
if (r == 0)
return 1; /* Will call us back */
- r = cg_pidref_get_unit(&m->leader, &m->unit);
+ r = cg_pidref_get_unit_full(&m->leader, &m->unit, &m->subgroup);
if (r < 0) {
r = sd_bus_error_set_errnof(error, r,
"Failed to determine unit of process "PID_FMT" : %m",
@@ -419,6 +442,19 @@ static int method_register_machine_internal(sd_bus_message *message, bool read_n
goto fail;
}
+ if (!empty_or_root(m->subgroup)) {
+ /* If this is not a top-level cgroup, then we need the cgroup path to be able to watch when
+ * it empties */
+
+ r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, &m->leader, &m->cgroup);
+ if (r < 0) {
+ r = sd_bus_error_set_errnof(error, r,
+ "Failed to determine cgroup of process "PID_FMT" : %m",
+ m->leader.pid);
+ goto fail;
+ }
+ }
+
r = machine_start(m, NULL, error);
if (r < 0)
goto fail;
@@ -1276,11 +1312,14 @@ int manager_stop_unit(Manager *manager, const char *unit, sd_bus_error *error, c
return 1;
}
-int manager_kill_unit(Manager *manager, const char *unit, int signo, sd_bus_error *error) {
+int manager_kill_unit(Manager *manager, const char *unit, const char *subgroup, int signo, sd_bus_error *reterr_error) {
assert(manager);
assert(unit);
- return bus_call_method(manager->bus, bus_systemd_mgr, "KillUnit", error, NULL, "ssi", unit, "all", signo);
+ if (empty_or_root(subgroup))
+ return bus_call_method(manager->bus, bus_systemd_mgr, "KillUnit", reterr_error, NULL, "ssi", unit, "all", signo);
+
+ return bus_call_method(manager->bus, bus_systemd_mgr, "KillUnitSubgroup", reterr_error, NULL, "sssi", unit, "cgroup", subgroup, signo);
}
int manager_unit_is_active(Manager *manager, const char *unit, sd_bus_error *reterr_error) {
diff --git a/src/machine/machined-varlink.c b/src/machine/machined-varlink.c
index f3351724b4..52b1fc12d2 100644
--- a/src/machine/machined-varlink.c
+++ b/src/machine/machined-varlink.c
@@ -477,14 +477,17 @@ static int list_machine_one_and_maybe_read_metadata(sd_varlink *link, Machine *m
JSON_BUILD_PAIR_STRING_NON_EMPTY("service", m->service),
JSON_BUILD_PAIR_STRING_NON_EMPTY("rootDirectory", m->root_directory),
JSON_BUILD_PAIR_STRING_NON_EMPTY("unit", m->unit),
+ JSON_BUILD_PAIR_STRING_NON_EMPTY("subgroup", m->subgroup),
SD_JSON_BUILD_PAIR_CONDITION(pidref_is_set(&m->leader), "leader", JSON_BUILD_PIDREF(&m->leader)),
+ SD_JSON_BUILD_PAIR_CONDITION(pidref_is_set(&m->supervisor), "supervisor", JSON_BUILD_PIDREF(&m->supervisor)),
SD_JSON_BUILD_PAIR_CONDITION(dual_timestamp_is_set(&m->timestamp), "timestamp", JSON_BUILD_DUAL_TIMESTAMP(&m->timestamp)),
JSON_BUILD_PAIR_UNSIGNED_NOT_EQUAL("vSockCid", m->vsock_cid, VMADDR_CID_ANY),
JSON_BUILD_PAIR_STRING_NON_EMPTY("sshAddress", m->ssh_address),
JSON_BUILD_PAIR_STRING_NON_EMPTY("sshPrivateKeyPath", m->ssh_private_key_path),
JSON_BUILD_PAIR_VARIANT_NON_NULL("addresses", addr_array),
JSON_BUILD_PAIR_STRV_ENV_PAIR_NON_EMPTY("OSRelease", os_release),
- JSON_BUILD_PAIR_UNSIGNED_NOT_EQUAL("UIDShift", shift, UID_INVALID));
+ JSON_BUILD_PAIR_UNSIGNED_NOT_EQUAL("UIDShift", shift, UID_INVALID),
+ SD_JSON_BUILD_PAIR_UNSIGNED("UID", m->uid));
if (r < 0)
return r;
diff --git a/src/machine/machined.h b/src/machine/machined.h
index c4753b8bfa..d03577be3c 100644
--- a/src/machine/machined.h
+++ b/src/machine/machined.h
@@ -10,7 +10,9 @@ typedef struct Manager {
sd_bus *bus;
Hashmap *machines;
- Hashmap *machines_by_unit;
+ Hashmap *machines_by_unit; /* This hashmap only tracks machines where a system-level encapsulates
+ * the machine fully, and exclusively. It's not used if a machine is
+ * run in a cgroup further down the tree. */
Hashmap *machines_by_leader;
sd_event_source *deferred_gc_event_source;
@@ -44,7 +46,7 @@ int match_properties_changed(sd_bus_message *message, void *userdata, sd_bus_err
int match_job_removed(sd_bus_message *message, void *userdata, sd_bus_error *error);
int manager_stop_unit(Manager *manager, const char *unit, sd_bus_error *error, char **job);
-int manager_kill_unit(Manager *manager, const char *unit, int signo, sd_bus_error *error);
+int manager_kill_unit(Manager *manager, const char *unit, const char *subgroup, int signo, sd_bus_error *error);
int manager_unref_unit(Manager *m, const char *unit, sd_bus_error *error);
int manager_unit_is_active(Manager *manager, const char *unit, sd_bus_error *reterr_errno);
int manager_job_is_active(Manager *manager, const char *path, sd_bus_error *reterr_errno);
diff --git a/src/machine/org.freedesktop.machine1.policy b/src/machine/org.freedesktop.machine1.policy
index fe125ed0db..d5b8d83d2a 100644
--- a/src/machine/org.freedesktop.machine1.policy
+++ b/src/machine/org.freedesktop.machine1.policy
@@ -99,7 +99,17 @@
auth_adminauth_admin_keep
- org.freedesktop.login1.shell org.freedesktop.login1.login
+ org.freedesktop.login1.shell org.freedesktop.login1.login org.freedesktop.machine1.register-machine
+
+
+
+ Register a local virtual machine or container
+ Authentication is required to register a local virtual machine or container.
+
+ auth_admin
+ auth_admin
+ yes
+
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 5e886e5391..c021fd675e 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -12,7 +12,6 @@
#include
#include
#include
-#include "constants.h"
#if HAVE_SELINUX
#include
@@ -38,8 +37,10 @@
#include "cgroup-util.h"
#include "chase.h"
#include "common-signal.h"
+#include "constants.h"
#include "copy.h"
#include "cpu-set-util.h"
+#include "daemon-util.h"
#include "dev-setup.h"
#include "devnum-util.h"
#include "discover-image.h"
@@ -1667,11 +1668,6 @@ static int verify_arguments(void) {
if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
arg_read_only = true;
- if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
- /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
- * The latter is not technically a user session, but we don't need to labour the point. */
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
-
if (arg_directory && arg_image)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
@@ -2960,13 +2956,19 @@ static int wait_for_container(PidRef *pid, ContainerStatus *container) {
static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
PidRef *pid = ASSERT_PTR(userdata);
+ assert(si);
+
if (pidref_is_set(pid))
if (pidref_kill(pid, arg_kill_signal) >= 0) {
- log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
+ log_info("Trying to halt container by sending %s to container PID 1. Send SIGTERM again to trigger immediate termination.",
+ signal_to_string(si->ssi_signo));
sd_event_source_set_userdata(s, NULL);
+ sd_notify(/* unset_environment= */ false, NOTIFY_STOPPING_MESSAGE);
return 0;
}
+ log_debug("Got %s, exiting.", signal_to_string(si->ssi_signo));
+
sd_event_exit(sd_event_source_get_event(s), 0);
return 0;
}
@@ -4617,15 +4619,18 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r
log_debug("Got sd_notify() message: %s", strnull(joined));
}
+ char *status = strv_find_startswith(tags, "STATUS=");
+ if (status)
+ (void) sd_notifyf(/* unset_environment= */ false, "STATUS=Container running: %s", status);
+
if (strv_contains(tags, "READY=1")) {
- r = sd_notify(false, "READY=1\n");
+ r = sd_notify(/* unset_environment= */ false, "READY=1\n");
if (r < 0)
log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
- }
- char *p = strv_find_startswith(tags, "STATUS=");
- if (p)
- (void) sd_notifyf(false, "STATUS=Container running: %s", p);
+ if (!status)
+ (void) sd_notifyf(/* unset_environment= */ false, "STATUS=Container running.");
+ }
return 0;
}
@@ -5136,7 +5141,6 @@ static int run_container(
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
_cleanup_(pty_forward_freep) PTYForward *forward = NULL;
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
- _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
_cleanup_free_ uid_t *bind_user_uid = NULL;
size_t n_bind_user_uid = 0;
ContainerStatus container_status = 0;
@@ -5459,19 +5463,35 @@ static int run_container(
return r;
}
- if (arg_register || !arg_keep_unit) {
- if (arg_privileged || arg_register)
- r = sd_bus_default_system(&bus);
- else
- r = sd_bus_default_user(&bus);
+ /* Registration always happens on the system bus */
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *system_bus = NULL;
+ if (arg_register || arg_privileged) {
+ r = sd_bus_default_system(&system_bus);
if (r < 0)
- return log_error_errno(r, "Failed to open bus: %m");
+ return log_error_errno(r, "Failed to open system bus: %m");
- r = sd_bus_set_close_on_exit(bus, false);
+ r = sd_bus_set_close_on_exit(system_bus, false);
if (r < 0)
return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
- (void) sd_bus_set_allow_interactive_authorization(bus, arg_ask_password);
+ (void) sd_bus_set_allow_interactive_authorization(system_bus, arg_ask_password);
+ }
+
+ /* Scope allocation happens on the user bus if we are unpriv, otherwise system bus. */
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *user_bus = NULL;
+ _cleanup_(sd_bus_unrefp) sd_bus *runtime_bus = NULL;
+ if (arg_privileged)
+ runtime_bus = sd_bus_ref(system_bus);
+ else {
+ r = sd_bus_default_user(&user_bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open user bus: %m");
+
+ r = sd_bus_set_close_on_exit(user_bus, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
+
+ runtime_bus = sd_bus_ref(user_bus);
}
if (!arg_keep_unit) {
@@ -5480,7 +5500,7 @@ static int run_container(
* scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
r = sd_bus_match_signal_async(
- bus,
+ runtime_bus,
/* ret= */ NULL,
"org.freedesktop.systemd1",
/* path= */ NULL,
@@ -5493,11 +5513,46 @@ static int run_container(
return log_error_errno(r, "Failed to request RequestStop match: %m");
}
+ if (arg_keep_unit) {
+ /* If we are not supposed to allocate a unit, then let's move the process now, so that we can
+ * register things while being in the right cgroup location already. Otherwise, let's move
+ * the process later, once we have unit and hence cgroup. */
+ r = create_subcgroup(
+ pid,
+ arg_keep_unit,
+ arg_uid_shift,
+ userns_fd,
+ arg_userns_mode);
+ if (r < 0)
+ return r;
+ }
+
+ bool scope_allocated = false;
+ if (!arg_keep_unit && (!arg_register || !arg_privileged)) {
+ AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
+ r = allocate_scope(
+ runtime_bus,
+ arg_machine,
+ pid,
+ arg_slice,
+ arg_custom_mounts, arg_n_custom_mounts,
+ arg_kill_signal,
+ arg_property,
+ arg_property_message,
+ arg_start_mode,
+ flags);
+ if (r < 0)
+ return r;
+
+ scope_allocated = true;
+ }
+
+ bool registered = false;
if (arg_register) {
RegisterMachineFlags flags = 0;
- SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit);
+ SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit || !arg_privileged);
r = register_machine(
- bus,
+ system_bus,
arg_machine,
pid,
arg_directory,
@@ -5514,33 +5569,22 @@ static int run_container(
if (r < 0)
return r;
- } else if (!arg_keep_unit) {
- AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
- r = allocate_scope(
- bus,
- arg_machine,
- pid,
- arg_slice,
- arg_custom_mounts, arg_n_custom_mounts,
- arg_kill_signal,
- arg_property,
- arg_property_message,
- arg_start_mode,
- flags);
- if (r < 0)
- return r;
+ registered = true;
+ }
- } else if (arg_slice || arg_property)
+ if (arg_keep_unit && (arg_slice || arg_property))
log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
- r = create_subcgroup(
- pid,
- arg_keep_unit,
- arg_uid_shift,
- userns_fd,
- arg_userns_mode);
- if (r < 0)
- return r;
+ if (!arg_keep_unit) {
+ r = create_subcgroup(
+ pid,
+ arg_keep_unit,
+ arg_uid_shift,
+ userns_fd,
+ arg_userns_mode);
+ if (r < 0)
+ return r;
+ }
/* Notify the child that the parent is ready with all its setup (including cgroup-ification), and
* that the child can now hand over control to the code to run inside the container. */
@@ -5561,10 +5605,16 @@ static int run_container(
(void) sd_event_set_watchdog(event, true);
- if (bus) {
- r = sd_bus_attach_event(bus, event, 0);
+ if (system_bus) {
+ r = sd_bus_attach_event(system_bus, event, 0);
if (r < 0)
- return log_error_errno(r, "Failed to attach bus to event loop: %m");
+ return log_error_errno(r, "Failed to attach system bus to event loop: %m");
+ }
+
+ if (user_bus) {
+ r = sd_bus_attach_event(user_bus, event, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach user bus to event loop: %m");
}
r = setup_notify_parent(event, notify_socket, pid, ¬ify_event_source);
@@ -5592,11 +5642,11 @@ static int run_container(
* will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
etc_passwd_lock = safe_close(etc_passwd_lock);
- (void) sd_notifyf(false,
- "STATUS=Container running.\n"
+ (void) sd_notifyf(/* unset_environment= */ false,
+ "STATUS=Container started.\n"
"X_NSPAWN_LEADER_PID=" PID_FMT, pid->pid);
if (!arg_notify_ready) {
- r = sd_notify(false, "READY=1\n");
+ r = sd_notify(/* unset_environment= */ false, "READY=1\n");
if (r < 0)
log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
}
@@ -5710,8 +5760,8 @@ static int run_container(
return log_error_errno(r, "Failed to run event loop: %m");
/* Kill if it is not dead yet anyway */
- if (!arg_register && !arg_keep_unit && bus)
- terminate_scope(bus, arg_machine);
+ if (scope_allocated)
+ terminate_scope(runtime_bus, arg_machine);
/* Normally redundant, but better safe than sorry */
(void) pidref_kill(pid, SIGKILL);
@@ -5731,8 +5781,8 @@ static int run_container(
r = wait_for_container(pid, &container_status);
/* Tell machined that we are gone. */
- if (arg_register && bus)
- (void) unregister_machine(bus, arg_machine);
+ if (registered)
+ (void) unregister_machine(system_bus, arg_machine);
if (r < 0)
/* We failed to wait for the container, or the container exited abnormally. */
diff --git a/src/shared/bus-unit-procs.c b/src/shared/bus-unit-procs.c
index 9aea7db785..bd510efebd 100644
--- a/src/shared/bus-unit-procs.c
+++ b/src/shared/bus-unit-procs.c
@@ -394,7 +394,13 @@ int unit_show_processes(
if (r < 0)
goto finish;
- r = dump_extra_processes(cgroups, prefix, n_columns, flags);
+ if (!FLAGS_SET(flags, OUTPUT_HIDE_EXTRA)) {
+ r = dump_extra_processes(cgroups, prefix, n_columns, flags);
+ if (r < 0)
+ goto finish;
+ }
+
+ r = 0;
finish:
while ((cg = hashmap_first(cgroups)))
diff --git a/src/shared/output-mode.h b/src/shared/output-mode.h
index ffd2e4fe80..7ee17df938 100644
--- a/src/shared/output-mode.h
+++ b/src/shared/output-mode.h
@@ -48,6 +48,7 @@ typedef enum OutputFlags {
OUTPUT_KERNEL_THREADS = 1 << 9,
OUTPUT_CGROUP_XATTRS = 1 << 10,
OUTPUT_CGROUP_ID = 1 << 11,
+ OUTPUT_HIDE_EXTRA = 1 << 12,
} OutputFlags;
sd_json_format_flags_t output_mode_to_json_format_flags(OutputMode m);
diff --git a/src/shared/varlink-io.systemd.Machine.c b/src/shared/varlink-io.systemd.Machine.c
index b80cbc65d0..d37f672d05 100644
--- a/src/shared/varlink-io.systemd.Machine.c
+++ b/src/shared/varlink-io.systemd.Machine.c
@@ -38,6 +38,10 @@ static SD_VARLINK_DEFINE_METHOD(
SD_VARLINK_DEFINE_INPUT(leader, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("The leader PID as ProcessId structure. If both the leader and leaderProcessId parameters are specified they must reference the same process. Typically one would only specify one or the other however. It's generally recommended to specify leaderProcessId as it references a process in a robust way without risk of identifier recycling."),
SD_VARLINK_DEFINE_INPUT_BY_TYPE(leaderProcessId, ProcessId, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("The supervisor PID as simple positive integer."),
+ SD_VARLINK_DEFINE_INPUT(supervisor, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("The supervisor PID as ProcessId structure. If both the supervisor and supervisorProcessId parameters are specified they must reference the same process. Typically only one or the other would be specified. It's generally recommended to specify supervisorProcessId as it references a process in a robust way without risk of identifier recycling."),
+ SD_VARLINK_DEFINE_INPUT_BY_TYPE(supervisorProcessId, ProcessId, SD_VARLINK_NULLABLE),
SD_VARLINK_DEFINE_INPUT(rootDirectory, SD_VARLINK_STRING, SD_VARLINK_NULLABLE),
SD_VARLINK_DEFINE_INPUT(ifIndices, SD_VARLINK_INT, SD_VARLINK_ARRAY|SD_VARLINK_NULLABLE),
SD_VARLINK_DEFINE_INPUT(vSockCid, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
@@ -58,7 +62,7 @@ static SD_VARLINK_DEFINE_METHOD(
static SD_VARLINK_DEFINE_METHOD(
Kill,
VARLINK_DEFINE_MACHINE_LOOKUP_AND_POLKIT_INPUT_FIELDS,
- SD_VARLINK_FIELD_COMMENT("Identifier that specifies what precisely to send the signal to (either 'leader' or 'all')."),
+ SD_VARLINK_FIELD_COMMENT("Identifier that specifies what precisely to send the signal to (either 'leader', 'supervisor', or 'all')."),
SD_VARLINK_DEFINE_INPUT(whom, SD_VARLINK_STRING, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("Numeric UNIX signal integer."),
SD_VARLINK_DEFINE_INPUT(signal, SD_VARLINK_INT, 0));
@@ -79,6 +83,8 @@ static SD_VARLINK_DEFINE_METHOD_FULL(
SD_VARLINK_DEFINE_OUTPUT(class, SD_VARLINK_STRING, 0),
SD_VARLINK_FIELD_COMMENT("Leader process PID of this machine"),
SD_VARLINK_DEFINE_OUTPUT_BY_TYPE(leader, ProcessId, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("Supervisor process PID of this machine"),
+ SD_VARLINK_DEFINE_OUTPUT_BY_TYPE(supervisor, ProcessId, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("Root directory of this machine, if known, relative to host file system"),
SD_VARLINK_DEFINE_OUTPUT(rootDirectory, SD_VARLINK_STRING, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("The service manager unit this machine resides in"),
@@ -96,7 +102,11 @@ static SD_VARLINK_DEFINE_METHOD_FULL(
SD_VARLINK_FIELD_COMMENT("OS release information of the machine. It contains an array of key value pairs read from the os-release(5) file in the image."),
SD_VARLINK_DEFINE_OUTPUT(OSRelease, SD_VARLINK_STRING, SD_VARLINK_NULLABLE|SD_VARLINK_ARRAY),
SD_VARLINK_FIELD_COMMENT("Return the base UID/GID of the machine"),
- SD_VARLINK_DEFINE_OUTPUT(UIDShift, SD_VARLINK_INT, SD_VARLINK_NULLABLE));
+ SD_VARLINK_DEFINE_OUTPUT(UIDShift, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("Subcgroup path of the machine, relative to the unit's cgroup path"),
+ SD_VARLINK_DEFINE_OUTPUT(Subgroup, SD_VARLINK_STRING, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("Numeric UNIX UID of the user who registered the machine"),
+ SD_VARLINK_DEFINE_OUTPUT(UID, SD_VARLINK_INT, SD_VARLINK_NULLABLE));
static SD_VARLINK_DEFINE_ENUM_TYPE(
MachineOpenMode,
diff --git a/src/test/test-cgroup-util.c b/src/test/test-cgroup-util.c
index 5fa6fe2da3..7c8cfaaa11 100644
--- a/src/test/test-cgroup-util.c
+++ b/src/test/test-cgroup-util.c
@@ -60,6 +60,36 @@ TEST(path_get_unit) {
check_p_g_u("/user.slice/user-1000.slice/user@.service/server.service", -ENXIO, NULL);
}
+static void check_p_g_u_f(const char *path, int expected_code, const char *expected_unit, const char *expected_subgroup) {
+ _cleanup_free_ char *unit = NULL, *subgroup = NULL;
+ int r;
+
+ r = cg_path_get_unit_full(path, &unit, &subgroup);
+ printf("%s: %s → %s %s %d expected %s %s %d\n", __func__, path, unit, subgroup, r, strnull(expected_unit), strnull(expected_subgroup), expected_code);
+ ASSERT_EQ(r, expected_code);
+ ASSERT_STREQ(unit, expected_unit);
+ ASSERT_STREQ(subgroup, expected_subgroup);
+}
+
+TEST(path_get_unit_full) {
+ check_p_g_u_f("/system.slice/foobar.service/sdfdsaf", 0, "foobar.service", "sdfdsaf");
+ check_p_g_u_f("/system.slice/foobar.service//sdfdsaf", 0, "foobar.service", "sdfdsaf");
+ check_p_g_u_f("/system.slice/foobar.service/sdfdsaf/", 0, "foobar.service", "sdfdsaf");
+ check_p_g_u_f("/system.slice/foobar.service//sdfdsaf/", 0, "foobar.service", "sdfdsaf");
+ check_p_g_u_f("/system.slice/foobar.service//sdfdsaf//", 0, "foobar.service", "sdfdsaf");
+ check_p_g_u_f("/system.slice/foobar.service/sdfdsaf/urks", 0, "foobar.service", "sdfdsaf/urks");
+ check_p_g_u_f("/system.slice/foobar.service//sdfdsaf//urks", 0, "foobar.service", "sdfdsaf/urks");
+ check_p_g_u_f("/system.slice/foobar.service/sdfdsaf/urks/", 0, "foobar.service", "sdfdsaf/urks");
+ check_p_g_u_f("/system.slice/foobar.service//sdfdsaf//urks//", 0, "foobar.service", "sdfdsaf/urks");
+ check_p_g_u_f("/system.slice/foobar.service", 0, "foobar.service", NULL);
+ check_p_g_u_f("/system.slice/foobar.service/", 0, "foobar.service", NULL);
+ check_p_g_u_f("/system.slice/foobar.service//", 0, "foobar.service", NULL);
+ check_p_g_u_f("/system.slice/", -ENXIO, NULL, NULL);
+ check_p_g_u_f("/system.slice/piff", -ENXIO, NULL, NULL);
+ check_p_g_u_f("/system.service/piff", 0, "system.service", "piff");
+ check_p_g_u_f("//system.service//piff", 0, "system.service", "piff");
+}
+
static void check_p_g_u_p(const char *path, int code, const char *result) {
_cleanup_free_ char *unit_path = NULL;
int r;
diff --git a/src/vmspawn/vmspawn-register.c b/src/vmspawn/vmspawn-register.c
index ae7edd0fdb..d9925f171d 100644
--- a/src/vmspawn/vmspawn-register.c
+++ b/src/vmspawn/vmspawn-register.c
@@ -10,7 +10,9 @@
#include "bus-error.h"
#include "bus-locator.h"
#include "errno-util.h"
+#include "json-util.h"
#include "log.h"
+#include "pidref.h"
#include "process-util.h"
#include "socket-util.h"
#include "string-util.h"
@@ -23,6 +25,7 @@ int register_machine(
const char *machine_name,
sd_id128_t uuid,
const char *service,
+ const PidRef *pidref,
const char *directory,
unsigned cid,
const char *address,
@@ -54,7 +57,7 @@ int register_machine(
SD_BUS_MESSAGE_APPEND_ID128(uuid),
service,
"vm",
- (uint32_t) getpid_cached(),
+ (uint32_t) (pidref_is_set(pidref) ? pidref->pid : 0),
strempty(directory));
if (r < 0)
return log_error_errno(r, "Failed to register machine: %s", bus_error_message(&error, r));
@@ -77,7 +80,8 @@ int register_machine(
SD_JSON_BUILD_PAIR_CONDITION(!!address, "sshAddress", SD_JSON_BUILD_STRING(address)),
SD_JSON_BUILD_PAIR_CONDITION(!!key_path, "sshPrivateKeyPath", SD_JSON_BUILD_STRING(key_path)),
SD_JSON_BUILD_PAIR_CONDITION(isatty_safe(STDIN_FILENO), "allowInteractiveAuthentication", SD_JSON_BUILD_BOOLEAN(true)),
- SD_JSON_BUILD_PAIR_CONDITION(!keep_unit, "allocateUnit", SD_JSON_BUILD_BOOLEAN(true)));
+ SD_JSON_BUILD_PAIR_CONDITION(!keep_unit, "allocateUnit", SD_JSON_BUILD_BOOLEAN(true)),
+ SD_JSON_BUILD_PAIR_CONDITION(pidref_is_set(pidref), "leaderProcessId", JSON_BUILD_PIDREF(pidref)));
}
int unregister_machine(sd_bus *bus, const char *machine_name) {
diff --git a/src/vmspawn/vmspawn-register.h b/src/vmspawn/vmspawn-register.h
index 58cae542ce..2adfabdc03 100644
--- a/src/vmspawn/vmspawn-register.h
+++ b/src/vmspawn/vmspawn-register.h
@@ -7,6 +7,7 @@ int register_machine(
const char *machine_name,
sd_id128_t uuid,
const char *service,
+ const PidRef *pidref,
const char *directory,
unsigned cid,
const char *address,
diff --git a/src/vmspawn/vmspawn-scope.c b/src/vmspawn/vmspawn-scope.c
index 624bd7730d..2aa2aaa232 100644
--- a/src/vmspawn/vmspawn-scope.c
+++ b/src/vmspawn/vmspawn-scope.c
@@ -12,11 +12,40 @@
#include "pidref.h"
#include "random-util.h"
#include "socket-util.h"
+#include "special.h"
#include "string-util.h"
#include "strv.h"
+#include "unit-def.h"
+#include "unit-name.h"
#include "vmspawn-scope.h"
-int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidfd, char **ret_scope) {
+static int append_controller_property(sd_bus *bus, sd_bus_message *m) {
+ const char *unique;
+ int r;
+
+ assert(bus);
+ assert(m);
+
+ r = sd_bus_get_unique_name(bus, &unique);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get unique name: %m");
+
+ r = sd_bus_message_append(m, "(sv)", "Controller", "s", unique);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 0;
+}
+
+int allocate_scope(
+ sd_bus *bus,
+ const char *machine_name,
+ const PidRef *pid,
+ const char *slice,
+ char **properties,
+ bool allow_pidfd,
+ char **ret_scope) {
+
_cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL;
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
_cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL;
@@ -33,8 +62,9 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
if (r < 0)
return log_error_errno(r, "Could not watch job: %m");
- if (asprintf(&scope, "machine-%"PRIu64"-%s.scope", random_u64(), machine_name) < 0)
- return log_oom();
+ r = unit_name_mangle_with_suffix(machine_name, "as machine name", /* flags= */ 0, ".scope", &scope);
+ if (r < 0)
+ return log_error_errno(r, "Failed to mangle scope name: %m");
description = strjoin("Virtual Machine ", machine_name);
if (!description)
@@ -53,22 +83,26 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
if (r < 0)
return bus_log_create_error(r);
- r = sd_bus_message_append(m, "(sv)(sv)(sv)",
- "Description", "s", description,
- "AddRef", "b", 1,
- "CollectMode", "s", "inactive-or-failed");
+ r = bus_append_scope_pidref(m, pid, allow_pidfd);
if (r < 0)
return bus_log_create_error(r);
- _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
- r = pidref_set_self(&pidref);
- if (r < 0)
- return log_error_errno(r, "Failed to allocate PID reference: %m");
-
- r = bus_append_scope_pidref(m, &pidref, allow_pidfd);
+ r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)",
+ "Description", "s", description,
+ "CollectMode", "s", "inactive-or-failed",
+ "AddRef", "b", 1,
+ "Slice", "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice);
if (r < 0)
return bus_log_create_error(r);
+ r = append_controller_property(bus, m);
+ if (r < 0)
+ return r;
+
+ r = bus_append_unit_property_assignment_many(m, UNIT_SCOPE, properties);
+ if (r < 0)
+ return r;
+
r = sd_bus_message_close_container(m);
if (r < 0)
return bus_log_create_error(r);
@@ -87,7 +121,14 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
* doesn't support PIDFDs yet, let's try without. */
if (allow_pidfd &&
sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY, SD_BUS_ERROR_PROPERTY_READ_ONLY))
- return start_transient_scope(bus, machine_name, false, ret_scope);
+ return allocate_scope(
+ bus,
+ machine_name,
+ pid,
+ slice,
+ properties,
+ /* allow_pidfd= */ false,
+ ret_scope);
return log_error_errno(r, "Failed to start transient scope unit: %s", bus_error_message(&error, r));
}
@@ -96,7 +137,11 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
if (r < 0)
return bus_log_parse_error(r);
- r = bus_wait_for_jobs_one(w, object, /* quiet */ false, NULL);
+ r = bus_wait_for_jobs_one(
+ w,
+ object,
+ BUS_WAIT_JOBS_LOG_ERROR,
+ /* extra_args= */ NULL);
if (r < 0)
return r;
@@ -106,6 +151,46 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
return 0;
}
+int terminate_scope(
+ sd_bus *bus,
+ const char *machine_name) {
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_free_ char *scope = NULL;
+ int r;
+
+ r = unit_name_mangle_with_suffix(machine_name, "to terminate", /* flags= */ 0, ".scope", &scope);
+ if (r < 0)
+ return log_error_errno(r, "Failed to mangle scope name: %m");
+
+ r = bus_call_method(bus, bus_systemd_mgr, "AbandonScope", &error, /* ret_reply= */ NULL, "s", scope);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to abandon scope '%s', ignoring: %s", scope, bus_error_message(&error, r));
+ sd_bus_error_free(&error);
+ }
+
+ r = bus_call_method(
+ bus,
+ bus_systemd_mgr,
+ "KillUnit",
+ &error,
+ NULL,
+ "ssi",
+ scope,
+ "all",
+ (int32_t) SIGKILL);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to SIGKILL scope '%s', ignoring: %s", scope, bus_error_message(&error, r));
+ sd_bus_error_free(&error);
+ }
+
+ r = bus_call_method(bus, bus_systemd_mgr, "UnrefUnit", &error, /* ret_reply= */ NULL, "s", scope);
+ if (r < 0)
+ log_debug_errno(r, "Failed to drop reference to scope '%s', ignoring: %s", scope, bus_error_message(&error, r));
+
+ return 0;
+}
+
static int message_add_commands(sd_bus_message *m, const char *exec_type, char ***commands, size_t n_commands) {
int r;
diff --git a/src/vmspawn/vmspawn-scope.h b/src/vmspawn/vmspawn-scope.h
index c29325a314..e456498f2e 100644
--- a/src/vmspawn/vmspawn-scope.h
+++ b/src/vmspawn/vmspawn-scope.h
@@ -14,5 +14,8 @@ typedef struct SocketServicePair {
void socket_service_pair_done(SocketServicePair *p);
-int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidfd, char **ret_scope);
+int allocate_scope(sd_bus *bus, const char *machine_name, const PidRef *pid, const char *slice, char **properties, bool allow_pidfd, char **ret_scope);
+
+int terminate_scope(sd_bus *bus, const char *machine_name);
+
int start_socket_service_pair(sd_bus *bus, const char *scope, SocketServicePair *p);
diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c
index 0ddc1cdb5a..c5fe0d54d7 100644
--- a/src/vmspawn/vmspawn.c
+++ b/src/vmspawn/vmspawn.c
@@ -4,6 +4,7 @@
#include
#include
#include
+#include
#include
#include
@@ -38,6 +39,8 @@
#include "hostname-setup.h"
#include "hostname-util.h"
#include "id128-util.h"
+#include "io-util.h"
+#include "iovec-util.h"
#include "log.h"
#include "machine-credential.h"
#include "main-func.h"
@@ -45,6 +48,7 @@
#include "namespace-util.h"
#include "netif-util.h"
#include "nsresource.h"
+#include "nulstr-util.h"
#include "osc-context.h"
#include "pager.h"
#include "parse-argument.h"
@@ -52,6 +56,7 @@
#include "path-lookup.h"
#include "path-util.h"
#include "pidref.h"
+#include "polkit-agent.h"
#include "pretty-print.h"
#include "process-util.h"
#include "ptyfwd.h"
@@ -99,6 +104,8 @@ static PagerFlags arg_pager_flags = 0;
static char *arg_directory = NULL;
static char *arg_image = NULL;
static char *arg_machine = NULL;
+static char *arg_slice = NULL;
+static char **arg_property = NULL;
static char *arg_cpus = NULL;
static uint64_t arg_ram = UINT64_C(2) * U64_GB;
static int arg_kvm = -1;
@@ -117,7 +124,7 @@ static SettingsMask arg_settings_mask = 0;
static char *arg_firmware = NULL;
static char *arg_forward_journal = NULL;
static bool arg_privileged = false;
-static bool arg_register = false;
+static bool arg_register = true;
static bool arg_keep_unit = false;
static sd_id128_t arg_uuid = {};
static char **arg_kernel_cmdline_extra = NULL;
@@ -131,10 +138,13 @@ static char **arg_smbios11 = NULL;
static uint64_t arg_grow_image = 0;
static char *arg_tpm_state_path = NULL;
static TpmStateMode arg_tpm_state_mode = TPM_STATE_AUTO;
+static bool arg_ask_password = true;
+static bool arg_notify_ready = true;
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_cpus, freep);
STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done);
STATIC_DESTRUCTOR_REGISTER(arg_firmware, freep);
@@ -148,6 +158,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_background, freep);
STATIC_DESTRUCTOR_REGISTER(arg_ssh_key_type, freep);
STATIC_DESTRUCTOR_REGISTER(arg_smbios11, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_tpm_state_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
static int help(void) {
_cleanup_free_ char *link = NULL;
@@ -165,6 +176,7 @@ static int help(void) {
" --version Print version string\n"
" -q --quiet Do not show status information\n"
" --no-pager Do not pipe output into a pager\n"
+ " --no-ask-password Do not prompt for password\n"
"\n%3$sImage:%4$s\n"
" -D --directory=PATH Root directory for the VM\n"
" -i --image=FILE|DEVICE Root file system disk image or device for the VM\n"
@@ -185,13 +197,18 @@ static int help(void) {
" --firmware=PATH|list Select firmware definition file (or list available)\n"
" --discard-disk=BOOL Control processing of discard requests\n"
" -G --grow-image=BYTES Grow image file to specified size in bytes\n"
+ "\n%3$sExecution:%4$s\n"
" -s --smbios11=STRING Pass an arbitrary SMBIOS Type #11 string to the VM\n"
+ " --notify-ready=BOOL Wait for ready notification from the VM\n"
"\n%3$sSystem Identity:%4$s\n"
" -M --machine=NAME Set the machine name for the VM\n"
" --uuid=UUID Set a specific machine UUID for the VM\n"
"\n%3$sProperties:%4$s\n"
- " --register=BOOLEAN Register VM with systemd-machined\n"
- " --keep-unit Don't let systemd-machined allocate scope unit for us\n"
+ " -S --slice=SLICE Place the VM in the specified slice\n"
+ " --property=NAME=VALUE Set scope unit property\n"
+ " --register=BOOLEAN Register VM as machine\n"
+ " --keep-unit Do not register a scope for the machine, reuse\n"
+ " the service unit vmspawn is running in\n"
"\n%3$sUser Namespacing:%4$s\n"
" --private-users=UIDBASE[:NUIDS]\n"
" Configure the UID/GID range to map into the\n"
@@ -273,6 +290,9 @@ static int parse_argv(int argc, char *argv[]) {
ARG_CONSOLE,
ARG_BACKGROUND,
ARG_TPM_STATE,
+ ARG_NO_ASK_PASSWORD,
+ ARG_PROPERTY,
+ ARG_NOTIFY_READY,
};
static const struct option options[] = {
@@ -283,6 +303,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "image", required_argument, NULL, 'i' },
{ "directory", required_argument, NULL, 'D' },
{ "machine", required_argument, NULL, 'M' },
+ { "slice", required_argument, NULL, 'S' },
{ "cpus", required_argument, NULL, ARG_CPUS },
{ "qemu-smp", required_argument, NULL, ARG_CPUS }, /* Compat alias */
{ "ram", required_argument, NULL, ARG_RAM },
@@ -318,6 +339,9 @@ static int parse_argv(int argc, char *argv[]) {
{ "smbios11", required_argument, NULL, 's' },
{ "grow-image", required_argument, NULL, 'G' },
{ "tpm-state", required_argument, NULL, ARG_TPM_STATE },
+ { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD },
+ { "property", required_argument, NULL, ARG_PROPERTY },
+ { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{}
};
@@ -327,7 +351,7 @@ static int parse_argv(int argc, char *argv[]) {
assert(argv);
optind = 0;
- while ((c = getopt_long(argc, argv, "+hD:i:M:nqs:G:", options, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "+hD:i:M:nqs:G:S:", options, NULL)) >= 0)
switch (c) {
case 'h':
return help();
@@ -633,6 +657,34 @@ static int parse_argv(int argc, char *argv[]) {
arg_tpm_state_path = mfree(arg_tpm_state_path);
break;
+ case ARG_NO_ASK_PASSWORD:
+ arg_ask_password = false;
+ break;
+
+ case 'S': {
+ _cleanup_free_ char *mangled = NULL;
+
+ r = unit_name_mangle_with_suffix(optarg, /* operation= */ NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
+ if (r < 0)
+ return log_error_errno(r, "Failed to turn '%s' into unit name: %m", optarg);
+
+ free_and_replace(arg_slice, mangled);
+ break;
+ }
+
+ case ARG_PROPERTY:
+ if (strv_extend(&arg_property, optarg) < 0)
+ return log_oom();
+
+ break;
+
+ case ARG_NOTIFY_READY:
+ r = parse_boolean_argument("--notify-ready=", optarg, &arg_notify_ready);
+ if (r < 0)
+ return r;
+
+ break;
+
case '?':
return -EINVAL;
@@ -715,17 +767,20 @@ static int read_vsock_notify(NotifyConnectionData *d, int fd) {
log_debug("Received notification message with tags: %s", strnull(j));
}
+ const char *status = strv_find_startswith(tags, "STATUS=");
+ if (status)
+ (void) sd_notifyf(/* unset_environment= */ false, "STATUS=VM running: %s", status);
+
if (strv_contains(tags, "READY=1")) {
- r = sd_notify(false, "READY=1");
+ r = sd_notify(/* unset_environment= */ false, "READY=1");
if (r < 0)
log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
+
+ if (!status)
+ (void) sd_notifyf(/* unset_environment= */ false, "STATUS=VM running.");
}
- const char *p = strv_find_startswith(tags, "STATUS=");
- if (p)
- (void) sd_notifyf(false, "STATUS=VM running: %s", p);
-
- p = strv_find_startswith(tags, "EXIT_STATUS=");
+ const char *p = strv_find_startswith(tags, "EXIT_STATUS=");
if (p) {
uint8_t k = 0;
r = safe_atou8(p, &k);
@@ -1534,28 +1589,256 @@ static int grow_image(const char *path, uint64_t size) {
return 1;
}
+static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+ assert(m);
+
+ log_info("VM termination requested. Exiting.");
+ sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
+
+ return 0;
+}
+
+static int datagram_read_cmdline_and_exec(int _fd /* always taking possession, even on error */) {
+ _cleanup_close_ int fd = TAKE_FD(_fd);
+ int r;
+
+ assert(fd >= 0);
+
+ /* The first datagram contains the cmdline */
+ r = fd_wait_for_event(fd, POLLIN, USEC_INFINITY);
+ if (r < 0)
+ return log_error_errno(r, "Failed to wait for command line: %m");
+
+ ssize_t n = next_datagram_size_fd(fd);
+ if (n < 0)
+ return log_error_errno(n, "Failed to determine datagram size: %m");
+ n += 1; /* extra byte to validate that the size we determined here was correct */
+
+ _cleanup_free_ char *p = malloc(n);
+ if (!p)
+ return log_oom();
+
+ ssize_t m = recv(fd, p, n, /* flags= */ 0);
+ if (m < 0)
+ return log_error_errno(errno, "Failed to read datagram: %m");
+ if (m >= n)
+ return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected message size.");
+
+ _cleanup_strv_free_ char **a = strv_parse_nulstr(p, m);
+ if (!a)
+ return log_oom();
+ if (strv_isempty(a))
+ return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Invalid command line.");
+
+ /* The second datagram contains an integer array of the intended fd numbers, and the an SCM_RIGHTS fd
+ * list along with it, matching that. */
+ r = fd_wait_for_event(fd, POLLIN, USEC_INFINITY);
+ if (r < 0)
+ return log_error_errno(r, "Failed to wait for command line: %m");
+
+ n = next_datagram_size_fd(fd);
+ if (n < 0)
+ return log_error_errno(n, "Failed to determine datagram size: %m");
+ n += 1; /* extra byte to validate that the size we determined here was correct */
+
+ _cleanup_free_ int *f = malloc(n);
+ if (!p)
+ return log_oom();
+
+ struct iovec iov = {
+ .iov_base = f,
+ .iov_len = n,
+ };
+
+ int *fds = NULL;
+ size_t n_fds = 0;
+ CLEANUP_ARRAY(fds, n_fds, close_many_and_free);
+
+ m = receive_many_fds_iov(
+ fd,
+ &iov, /* iovlen= */ 1,
+ &fds,
+ &n_fds,
+ /* flags= */ MSG_TRUNC);
+ if (m < 0)
+ return log_error_errno(m, "Failed to read datagram: %m");
+ if (m >= n || (size_t) m != n_fds * sizeof(int))
+ return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected message size.");
+
+ fd = safe_close(fd);
+
+ /* At this point the fds[] contains the file descriptors we got, and f[] contains the numbers we want
+ * for them. Let's rearrange things. */
+
+ /* 1. Determine largest number we want */
+ int max_fd = 2;
+ for (size_t k = 0; k < n_fds; k++)
+ max_fd = MAX(max_fd, f[k]);
+
+ /* 2. Move all fds we got above that */
+ for (size_t k = 0; k < n_fds; k++) {
+ if (fds[k] > max_fd)
+ continue;
+
+ _cleanup_close_ int copy = fcntl(fds[k], F_DUPFD_CLOEXEC, max_fd+1);
+ if (copy < 0)
+ return log_error_errno(errno, "Failed to duplicate file descriptor: %m");
+
+ safe_close(fds[k]);
+ fds[k] = TAKE_FD(copy);
+
+ assert(fds[k] > max_fd);
+ }
+
+ log_close();
+
+ r = close_all_fds(fds, n_fds);
+ if (r < 0)
+ return log_error_errno(r, "Failed to close remaining file descriptors: %m");
+
+ /* 3. Move into place (this also disables O_CLOEXEC) */
+ for (size_t k = 0; k < n_fds; k++) {
+ if (dup2(fds[k], f[k]) < 0)
+ return log_error_errno(errno, "Failed to move file descriptor: %m");
+
+ safe_close(fds[k]);
+ fds[k] = f[k];
+ }
+
+ execv(a[0], a);
+ return log_error_errno(errno, "Failed to execve %s: %m", a[0]);
+}
+
+_noreturn_ static void child(int cmdline_fd) {
+ assert(cmdline_fd >= 0);
+
+ /* set LANG if they are missing */
+ if (setenv("LANG", "C.UTF-8", /* override= */ 0) < 0) {
+ log_oom();
+ goto fail;
+ }
+
+ /* Now wait for the command line from the parent, and then execute it */
+
+ (void) datagram_read_cmdline_and_exec(TAKE_FD(cmdline_fd));
+
+fail:
+ _exit(EXIT_FAILURE);
+}
+
static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
_cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL;
- _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
- _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL, *kernel = NULL;
+ _cleanup_free_ char *qemu_binary = NULL, *mem = NULL, *kernel = NULL;
_cleanup_(rm_rf_physical_and_freep) char *ssh_private_key_path = NULL, *ssh_public_key_path = NULL;
_cleanup_close_ int notify_sock_fd = -EBADF;
_cleanup_strv_free_ char **cmdline = NULL;
_cleanup_free_ int *pass_fds = NULL;
size_t n_pass_fds = 0;
- const char *accel, *shm;
+ const char *accel;
int r;
- if (arg_privileged)
- r = sd_bus_default_system(&bus);
- else
- r = sd_bus_default_user(&bus);
- if (r < 0)
- return log_error_errno(r, "Failed to connect to systemd bus: %m");
+ polkit_agent_open();
- r = start_transient_scope(bus, arg_machine, /* allow_pidfd= */ true, &trans_scope);
+ /* Registration always happens on the system bus */
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *system_bus = NULL;
+ if (arg_register || arg_privileged) {
+ r = sd_bus_default_system(&system_bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open system bus: %m");
+
+ r = sd_bus_set_close_on_exit(system_bus, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
+
+ (void) sd_bus_set_allow_interactive_authorization(system_bus, arg_ask_password);
+ }
+
+ /* Scope allocation happens on the user bus if we are unpriv, otherwise system bus. */
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *user_bus = NULL;
+ _cleanup_(sd_bus_unrefp) sd_bus *runtime_bus = NULL;
+ if (arg_privileged)
+ runtime_bus = sd_bus_ref(system_bus);
+ else {
+ r = sd_bus_default_user(&user_bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open system bus: %m");
+
+ r = sd_bus_set_close_on_exit(user_bus, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
+
+ runtime_bus = sd_bus_ref(user_bus);
+ }
+
+ assert_se(sigprocmask_many(SIG_BLOCK, /* ret_old_mask=*/ NULL, SIGCHLD) >= 0);
+
+ _cleanup_close_pair_ int cmdline_socket[2] = EBADF_PAIR;
+ if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, cmdline_socket) < 0)
+ return log_error_errno(errno, "Failed to allocate command line socket pair: %m");
+
+ /* Fork off child early on, as we need to assign it to a scope unit, which we can generate
+ * dependencies towards for swtpm, virtiofsd and so on. It's just going to hang until we fully
+ * prepared a command line */
+ _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL;
+ r = pidref_safe_fork_full(
+ "(qemu)",
+ /* stdio_fds= */ NULL,
+ cmdline_socket + 0, 1,
+ FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE,
+ &child_pidref);
if (r < 0)
return r;
+ if (r == 0) {
+ cmdline_socket[1] = -EBADF; /* closed due to FORK_CLOEXEC_ALL_FDS */
+
+ child(cmdline_socket[0]);
+ assert_not_reached();
+ }
+
+ cmdline_socket[0] = safe_close(cmdline_socket[0]);
+
+ if (!arg_keep_unit) {
+ /* When a new scope is created for this container, then we'll be registered as its controller, in which
+ * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
+ * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
+
+ r = sd_bus_match_signal_async(
+ runtime_bus,
+ /* ret= */ NULL,
+ "org.freedesktop.systemd1",
+ /* path= */ NULL,
+ "org.freedesktop.systemd1.Scope",
+ "RequestStop",
+ on_request_stop,
+ /* install_callback= */ NULL,
+ /* userdata= */ NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to request RequestStop match: %m");
+ }
+
+ _cleanup_free_ char *unit = NULL;
+ bool scope_allocated = false;
+ if (!arg_keep_unit && (!arg_register || !arg_privileged)) {
+ r = allocate_scope(
+ runtime_bus,
+ arg_machine,
+ &child_pidref,
+ arg_slice,
+ arg_property,
+ /* allow_pidfd= */ true,
+ &unit);
+ if (r < 0)
+ return r;
+
+ scope_allocated = true;
+ } else {
+ if (arg_privileged)
+ r = cg_pid_get_unit(0, &unit);
+ else
+ r = cg_pid_get_user_unit(0, &unit);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get our own unit: %m");
+ }
bool use_kvm = arg_kvm > 0;
if (arg_kvm < 0) {
@@ -1577,7 +1860,8 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
log_warning("Couldn't find OVMF firmware blob with Secure Boot support, "
"falling back to OVMF firmware blobs without Secure Boot support.");
- shm = arg_directory || arg_runtime_mounts.n_mounts != 0 ? ",memory-backend=mem" : "";
+ _cleanup_free_ char *machine = NULL;
+ const char *shm = arg_directory || arg_runtime_mounts.n_mounts != 0 ? ",memory-backend=mem" : "";
const char *hpet = ARCHITECTURE_SUPPORTS_HPET ? ",hpet=off" : "";
if (ARCHITECTURE_SUPPORTS_SMM)
machine = strjoin("type=" QEMU_MACHINE_TYPE ",smm=", on_off(ovmf_config->supports_sb), shm, hpet);
@@ -1990,7 +2274,13 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
if (arg_directory) {
_cleanup_free_ char *listen_address = NULL;
- r = start_virtiofsd(bus, trans_scope, arg_directory, /* uidmap= */ true, runtime_dir, &listen_address);
+ r = start_virtiofsd(
+ runtime_bus,
+ unit,
+ arg_directory,
+ /* uidmap= */ true,
+ runtime_dir,
+ &listen_address);
if (r < 0)
return r;
@@ -2057,7 +2347,13 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
FOREACH_ARRAY(mount, arg_runtime_mounts.mounts, arg_runtime_mounts.n_mounts) {
_cleanup_free_ char *listen_address = NULL;
- r = start_virtiofsd(bus, trans_scope, mount->source, /* uidmap= */ false, runtime_dir, &listen_address);
+ r = start_virtiofsd(
+ runtime_bus,
+ unit,
+ mount->source,
+ /* uidmap= */ false,
+ runtime_dir,
+ &listen_address);
if (r < 0)
return r;
@@ -2148,7 +2444,11 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
_cleanup_free_ char *tpm_socket_address = NULL;
if (swtpm) {
- r = start_tpm(bus, trans_scope, swtpm, runtime_dir, &tpm_socket_address);
+ r = start_tpm(runtime_bus,
+ unit,
+ swtpm,
+ runtime_dir,
+ &tpm_socket_address);
if (r < 0) {
/* only bail if the user asked for a tpm */
if (arg_tpm > 0)
@@ -2214,7 +2514,12 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
if (r < 0)
return log_error_errno(r, "Failed to find systemd-journal-remote binary: %m");
- r = start_systemd_journal_remote(bus, trans_scope, child_cid, sd_journal_remote, &listen_address);
+ r = start_systemd_journal_remote(
+ runtime_bus,
+ unit,
+ child_cid,
+ sd_journal_remote,
+ &listen_address);
if (r < 0)
return r;
@@ -2231,7 +2536,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
_cleanup_free_ char *scope_prefix = NULL, *privkey_path = NULL, *pubkey_path = NULL;
const char *key_type = arg_ssh_key_type ?: "ed25519";
- r = unit_name_to_prefix(trans_scope, &scope_prefix);
+ r = unit_name_to_prefix(unit, &scope_prefix);
if (r < 0)
return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
@@ -2262,7 +2567,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
if (r < 0)
return log_error_errno(r, "Failed to load credential %s: %m", cred_path);
- r = unit_name_to_prefix(trans_scope, &scope_prefix);
+ r = unit_name_to_prefix(unit, &scope_prefix);
if (r < 0)
return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
@@ -2326,25 +2631,68 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
log_debug("Executing: %s", joined);
}
+ bool registered = false;
if (arg_register) {
char vm_address[STRLEN("vsock/") + DECIMAL_STR_MAX(unsigned)];
-
xsprintf(vm_address, "vsock/%u", child_cid);
r = register_machine(
- bus,
+ system_bus,
arg_machine,
arg_uuid,
"systemd-vmspawn",
+ &child_pidref,
arg_directory,
child_cid,
child_cid != VMADDR_CID_ANY ? vm_address : NULL,
ssh_private_key_path,
- arg_keep_unit);
+ arg_keep_unit || !arg_privileged);
if (r < 0)
return r;
+
+ registered = true;
}
- assert_se(sigprocmask_many(SIG_BLOCK, /* ret_old_mask=*/ NULL, SIGCHLD) >= 0);
+ _cleanup_free_ char *nulstr = NULL;
+ size_t nulstr_size = 0;
+ if (strv_make_nulstr(cmdline, &nulstr, &nulstr_size) < 0)
+ return log_oom();
+
+ /* First datagram: the command line to execute */
+ ssize_t n = send(cmdline_socket[1], nulstr, nulstr_size, /* flags= */ 0);
+ if (n < 0)
+ return log_error_errno(errno, "Failed to send command line: %m");
+
+ /* Second datagram: the file descriptor array and the fds inside it */
+ n = send_many_fds_iov(
+ cmdline_socket[1],
+ pass_fds, n_pass_fds, /* both as payload … */
+ &IOVEC_MAKE(pass_fds, n_pass_fds * sizeof(int)), /* … and as auxiliary fds */
+ /* iovlen= */ 1,
+ /* flags= */ 0);
+ if (n < 0)
+ return log_error_errno(n, "Failed to send file descriptors to child: %m");
+
+ /* We submitted the command line now, qemu is running now */
+ cmdline_socket[1] = safe_close(cmdline_socket[1]);
+
+ /* Close relevant fds we passed to qemu in the parent. We don't need them anymore. */
+ child_vsock_fd = safe_close(child_vsock_fd);
+ tap_fd = safe_close(tap_fd);
+
+ /* Report that the VM is now set up */
+ (void) sd_notifyf(/* unset_environment= */ false,
+ "STATUS=VM started.\n"
+ "X_VMSPAWN_LEADER_PID=" PID_FMT, child_pidref.pid);
+ if (!arg_notify_ready) {
+ r = sd_notify(/* unset_environment= */ false, "READY=1\n");
+ if (r < 0)
+ log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
+ }
+
+ /* All operations that might need Polkit authorizations (i.e. machine registration, netif
+ * acquisition, …) are complete now, get rid of the agent again, so that we retain exclusive control
+ * of the TTY from now on. */
+ polkit_agent_close();
_cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
@@ -2354,37 +2702,17 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
(void) sd_event_set_watchdog(event, true);
- _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL;
-
- r = pidref_safe_fork_full(
- qemu_binary,
- /* stdio_fds= */ NULL,
- pass_fds, n_pass_fds,
- FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE,
- &child_pidref);
- if (r < 0)
- return r;
- if (r == 0) {
- /* set TERM and LANG if they are missing */
- if (setenv("TERM", "vt220", 0) < 0) {
- log_oom();
- goto fail;
- }
-
- if (setenv("LANG", "C.UTF-8", 0) < 0) {
- log_oom();
- goto fail;
- }
-
- execv(qemu_binary, cmdline);
- log_error_errno(errno, "Failed to execve %s: %m", qemu_binary);
- fail:
- _exit(EXIT_FAILURE);
+ if (system_bus) {
+ r = sd_bus_attach_event(system_bus, event, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach system bus to event loop: %m");
}
- /* Close relevant fds we passed to qemu in the parent. We don't need them anymore. */
- child_vsock_fd = safe_close(child_vsock_fd);
- tap_fd = safe_close(tap_fd);
+ if (user_bus) {
+ r = sd_bus_attach_event(user_bus, event, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach user bus to event loop: %m");
+ }
int exit_status = INT_MAX;
if (use_vsock) {
@@ -2451,8 +2779,12 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
if (r < 0)
return log_error_errno(r, "Failed to run event loop: %m");
- if (arg_register)
- (void) unregister_machine(bus, arg_machine);
+ /* Kill if it is not dead yet anyway */
+ if (scope_allocated)
+ terminate_scope(runtime_bus, arg_machine);
+
+ if (registered)
+ (void) unregister_machine(system_bus, arg_machine);
if (use_vsock) {
if (exit_status == INT_MAX) {
@@ -2530,11 +2862,6 @@ static int verify_arguments(void) {
if (!strv_isempty(arg_initrds) && !arg_linux)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --initrd= cannot be used without --linux=.");
- if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
- /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
- * The latter is not technically a user session, but we don't need to labour the point. */
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
-
return 0;
}
@@ -2546,9 +2873,6 @@ static int run(int argc, char *argv[]) {
arg_privileged = getuid() == 0;
- /* don't attempt to register as a machine when running as a user */
- arg_register = arg_privileged;
-
r = parse_environment();
if (r < 0)
return r;
diff --git a/test/units/TEST-13-NSPAWN.unpriv.sh b/test/units/TEST-13-NSPAWN.unpriv.sh
new file mode 100755
index 0000000000..db58b09291
--- /dev/null
+++ b/test/units/TEST-13-NSPAWN.unpriv.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2016
+set -eux
+set -o pipefail
+
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
+if [[ ! -f /usr/lib/systemd/system/systemd-mountfsd.socket ]] ||
+ [[ ! -f /usr/lib/systemd/system/systemd-nsresourced.socket ]] ||
+ ! grep -q bpf /sys/kernel/security/lsm ||
+ ! find /usr/lib* -name libbpf.so.1 2>/dev/null | grep . ||
+ systemd-analyze compare-versions "$(uname -r)" lt 6.5 ||
+ systemd-analyze compare-versions "$(pkcheck --version | awk '{print $3}')" lt 124; then
+ echo "Skipping unpriv nspawn test"
+ exit 0
+fi
+
+at_exit() {
+ rm -rf /home/testuser/.local/state/machines/zurps ||:
+ machinectl terminate zurps ||:
+ rm -f /usr/share/polkit-1/rules.d/registermachinetest.rules
+}
+
+trap at_exit EXIT
+
+systemctl start systemd-mountfsd.socket systemd-nsresourced.socket
+
+run0 -u testuser mkdir -p .local/state/machines
+
+create_dummy_container /home/testuser/.local/state/machines/zurps
+cat >/home/testuser/.local/state/machines/zurps/sbin/init </usr/share/polkit-1/rules.d/registermachinetest.rules <<'EOF'
+polkit.addRule(function(action, subject) {
+ if (action.id == "org.freedesktop.machine1.register-machine" &&
+ subject.user == "testuser") {
+ return polkit.Result.YES;
+ }
+});
+EOF
+
+loginctl enable-linger testuser
+
+run0 -u testuser systemctl start --user systemd-nspawn@zurps.service
+
+machinectl status zurps
+machinectl status zurps | grep "UID:" | grep "$(id -u testuser)"
+machinectl status zurps | grep "Unit: user@" | grep "$(id -u testuser)"
+machinectl status zurps | grep "Subgroup: machine.slice/systemd-nspawn@zurps.service/payload"
+machinectl terminate zurps
+
+(! run0 -u testuser systemctl is-active --user systemd-nspawn@zurps.service)
+
+loginctl disable-linger testuser
diff --git a/units/machines.target b/units/machines.target
index 165839aeb1..b7a84f4814 100644
--- a/units/machines.target
+++ b/units/machines.target
@@ -8,7 +8,7 @@
# (at your option) any later version.
[Unit]
-Description=Containers
+Description=Virtual Machine and Containers
Documentation=man:systemd.special(7)
Before=multi-user.target
diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in
index c1426e673b..d540a071a4 100644
--- a/units/systemd-nspawn@.service.in
+++ b/units/systemd-nspawn@.service.in
@@ -29,6 +29,7 @@ Delegate=yes
DelegateSubgroup=supervisor
CoredumpReceive=yes
TasksMax=16384
+TimeoutSec=2min
{{SERVICE_WATCHDOG}}
{# Enforce a strict device policy, similar to the one nspawn configures (in
diff --git a/units/systemd-vmspawn@.service.in b/units/systemd-vmspawn@.service.in
index 454a03e98b..6ae0088048 100644
--- a/units/systemd-vmspawn@.service.in
+++ b/units/systemd-vmspawn@.service.in
@@ -21,6 +21,7 @@ ExecStart=systemd-vmspawn --quiet --register=yes --keep-unit --network-tap --mac
KillMode=mixed
Type=notify
Slice=machine.slice
+TimeoutSec=2min
{{SERVICE_WATCHDOG}}
{# Enforce a strict device policy. Make sure to keep these policies in sync if you change them! #}
diff --git a/units/user/machine.slice b/units/user/machine.slice
new file mode 100644
index 0000000000..fe729d0eb9
--- /dev/null
+++ b/units/user/machine.slice
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Virtual Machine and Container Slice
+Documentation=man:systemd.special(7)
diff --git a/units/user/machines.target b/units/user/machines.target
new file mode 100644
index 0000000000..35a99666bb
--- /dev/null
+++ b/units/user/machines.target
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Virtual Machines and Containers
+Documentation=man:systemd.special(7)
+Before=default.target
+
+[Install]
+WantedBy=default.target
diff --git a/units/user/meson.build b/units/user/meson.build
index c669e4bb1c..d8d3d9fdac 100644
--- a/units/user/meson.build
+++ b/units/user/meson.build
@@ -10,6 +10,14 @@ units = [
{ 'file' : 'exit.target' },
{ 'file' : 'graphical-session-pre.target' },
{ 'file' : 'graphical-session.target' },
+ {
+ 'file' : 'machine.slice',
+ 'conditions' : ['ENABLE_MACHINED'],
+ },
+ {
+ 'file' : 'machines.target',
+ 'conditions' : ['ENABLE_MACHINED'],
+ },
{ 'file' : 'paths.target' },
{ 'file' : 'printer.target' },
{ 'file' : 'session.slice' },
@@ -26,6 +34,14 @@ units = [
{ 'file' : 'systemd-tmpfiles-clean.service' },
{ 'file' : 'systemd-tmpfiles-clean.timer' },
{ 'file' : 'systemd-tmpfiles-setup.service' },
+ {
+ 'file' : 'systemd-nspawn@.service.in',
+ 'conditions' : ['ENABLE_NSPAWN'],
+ },
+ {
+ 'file' : 'systemd-vmspawn@.service.in',
+ 'conditions' : ['ENABLE_VMSPAWN'],
+ },
{ 'file' : 'timers.target' },
{
'file' : 'xdg-desktop-autostart.target',
@@ -34,7 +50,17 @@ units = [
]
foreach unit : units
- file = unit.get('file')
+ source = unit.get('file')
+
+ if source.endswith('.in')
+ needs_jinja = true
+ name = source.substring(0, -3)
+ assert(name + '.in' == source)
+ else
+ needs_jinja = false
+ name = source
+ endif
+ source = files(source)
install = true
foreach cond : unit.get('conditions', [])
@@ -44,20 +70,30 @@ foreach unit : units
endif
endforeach
- if install
- install_data(file,
+ if needs_jinja
+ t = custom_target(
+ name,
+ input : source,
+ output : name,
+ command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'],
+ install : install,
+ install_dir : userunitdir)
+ elif install
+ install_data(source,
install_dir : userunitdir)
+ endif
+ if install
foreach target : unit.get('symlinks', [])
if target.endswith('/')
# '/' is only allowed at the end of the target
assert(target.replace('/', '') + '/' == target)
- install_symlink(file,
- pointing_to : '..' / file,
+ install_symlink(name,
+ pointing_to : '..' / name,
install_dir : userunitdir / target)
else
install_symlink(target,
- pointing_to : file,
+ pointing_to : name,
install_dir : userunitdir)
endif
endforeach
diff --git a/units/user/systemd-nspawn@.service.in b/units/user/systemd-nspawn@.service.in
new file mode 100644
index 0000000000..4f5cf412df
--- /dev/null
+++ b/units/user/systemd-nspawn@.service.in
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Container %i
+Documentation=man:systemd-nspawn(1)
+PartOf=machines.target
+Before=machines.target
+
+[Service]
+ExecStart=systemd-nspawn --quiet --keep-unit --register=yes --boot --network-veth -U --settings=override --machine=%i
+ExecStopPost=systemd-nspawn --cleanup --machine=%i
+KillMode=mixed
+Type=notify
+RestartForceExitStatus=133
+SuccessExitStatus=133
+Slice=machine.slice
+Delegate=yes
+DelegateSubgroup=supervisor
+CoredumpReceive=yes
+TasksMax=16384
+TimeoutSec=2min
+{{SERVICE_WATCHDOG}}
+
+[Install]
+WantedBy=machines.target
diff --git a/units/user/systemd-vmspawn@.service.in b/units/user/systemd-vmspawn@.service.in
new file mode 100644
index 0000000000..bfb321f2bf
--- /dev/null
+++ b/units/user/systemd-vmspawn@.service.in
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Virtual Machine %i
+Documentation=man:systemd-vmspawn(1)
+PartOf=machines.target
+Before=machines.target
+
+[Service]
+ExecStart=systemd-vmspawn --quiet --keep-unit --register=yes --network-tap --machine=%i
+KillMode=mixed
+Type=notify
+Slice=machine.slice
+TimeoutSec=2min
+{{SERVICE_WATCHDOG}}
+
+[Install]
+WantedBy=machines.target