From 747b5d963ef8078032e1f6f7ee98f8725d8fb454 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 3 Jan 2023 18:01:28 +0100 Subject: [PATCH 1/4] src/shared/: split AF_UNIX/AF_VSOCK address parsing into src/basic/ We'll use it from libsystemd0 later, but AF_INET/6 requires some netlink calls and thus the additional library dependency --- src/basic/socket-util.c | 68 +++++++++++++++ src/basic/socket-util.h | 6 ++ src/shared/socket-netlink.c | 160 +++++++++++++----------------------- 3 files changed, 131 insertions(+), 103 deletions(-) diff --git a/src/basic/socket-util.c b/src/basic/socket-util.c index 54f5f1cc5b..d7946a3641 100644 --- a/src/basic/socket-util.c +++ b/src/basic/socket-util.c @@ -1472,3 +1472,71 @@ int connect_unix_path(int fd, int dir_fd, const char *path) { return RET_NERRNO(connect(fd, &sa.sa, salen)); } + +int socket_address_parse_unix(SocketAddress *ret_address, const char *s) { + struct sockaddr_un un; + int r; + + assert(ret_address); + assert(s); + + if (!IN_SET(*s, '/', '@')) + return -EPROTO; + + r = sockaddr_un_set_path(&un, s); + if (r < 0) + return r; + + *ret_address = (SocketAddress) { + .sockaddr.un = un, + .size = r, + }; + + return 0; +} + +int socket_address_parse_vsock(SocketAddress *ret_address, const char *s) { + /* AF_VSOCK socket in vsock:cid:port notation */ + _cleanup_free_ char *n = NULL; + char *e, *cid_start; + unsigned port, cid; + int r; + + assert(ret_address); + assert(s); + + cid_start = startswith(s, "vsock:"); + if (!cid_start) + return -EPROTO; + + e = strchr(cid_start, ':'); + if (!e) + return -EINVAL; + + r = safe_atou(e+1, &port); + if (r < 0) + return r; + + n = strndup(cid_start, e - cid_start); + if (!n) + return -ENOMEM; + + if (isempty(n)) + cid = VMADDR_CID_ANY; + else { + r = safe_atou(n, &cid); + if (r < 0) + return r; + } + + *ret_address = (SocketAddress) { + .sockaddr.vm = { + .svm_cid = cid, + .svm_family = AF_VSOCK, + .svm_port = port, + }, + .size = sizeof(struct sockaddr_vm), + }; + + return 0; 
+} diff --git a/src/basic/socket-util.h b/src/basic/socket-util.h index 0b8d53e895..5cb35f65fb 100644 --- a/src/basic/socket-util.h +++ b/src/basic/socket-util.h @@ -336,3 +336,9 @@ int socket_get_mtu(int fd, int af, size_t *ret); #define UCRED_INVALID { .pid = 0, .uid = UID_INVALID, .gid = GID_INVALID } int connect_unix_path(int fd, int dir_fd, const char *path); + +/* Parses AF_UNIX and AF_VSOCK addresses. AF_INET[6] parsing requires some netlink calls, so it cannot be in + * src/basic/ and is done by socket_address_parse() in src/shared/ instead. Return -EPROTO in case of + * protocol mismatch. */ +int socket_address_parse_unix(SocketAddress *ret_address, const char *s); +int socket_address_parse_vsock(SocketAddress *ret_address, const char *s); diff --git a/src/shared/socket-netlink.c b/src/shared/socket-netlink.c index 494047a5d1..e115dff506 100644 --- a/src/shared/socket-netlink.c +++ b/src/shared/socket-netlink.c @@ -17,120 +17,74 @@ #include "string-util.h" int socket_address_parse(SocketAddress *a, const char *s) { - _cleanup_free_ char *n = NULL; - char *e; + uint16_t port; int r; assert(a); assert(s); - if (IN_SET(*s, '/', '@')) { - /* AF_UNIX socket */ - struct sockaddr_un un; + r = socket_address_parse_unix(a, s); + if (r == -EPROTO) + r = socket_address_parse_vsock(a, s); + if (r != -EPROTO) + return r; - r = sockaddr_un_set_path(&un, s); - if (r < 0) - return r; - - *a = (SocketAddress) { - .sockaddr.un = un, - .size = r, - }; - - } else if (startswith(s, "vsock:")) { - /* AF_VSOCK socket in vsock:cid:port notation */ - const char *cid_start = s + STRLEN("vsock:"); - unsigned port, cid; - - e = strchr(cid_start, ':'); - if (!e) - return -EINVAL; - - r = safe_atou(e+1, &port); - if (r < 0) - return r; - - n = strndup(cid_start, e - cid_start); - if (!n) - return -ENOMEM; - - if (isempty(n)) - cid = VMADDR_CID_ANY; - else { - r = safe_atou(n, &cid); - if (r < 0) - return r; - } - - *a = (SocketAddress) { - .sockaddr.vm = { - .svm_cid = cid, - .svm_family = AF_VSOCK, 
- .svm_port = port, - }, - .size = sizeof(struct sockaddr_vm), - }; + r = parse_ip_port(s, &port); + if (r == -ERANGE) + return r; /* Valid port syntax, but the numerical value is wrong for a port. */ + if (r >= 0) { + /* Just a port */ + if (socket_ipv6_is_supported()) + *a = (SocketAddress) { + .sockaddr.in6 = { + .sin6_family = AF_INET6, + .sin6_port = htobe16(port), + .sin6_addr = in6addr_any, + }, + .size = sizeof(struct sockaddr_in6), + }; + else + *a = (SocketAddress) { + .sockaddr.in = { + .sin_family = AF_INET, + .sin_port = htobe16(port), + .sin_addr.s_addr = INADDR_ANY, + }, + .size = sizeof(struct sockaddr_in), + }; } else { - uint16_t port; + union in_addr_union address; + int family, ifindex; - r = parse_ip_port(s, &port); - if (r == -ERANGE) - return r; /* Valid port syntax, but the numerical value is wrong for a port. */ - if (r >= 0) { - /* Just a port */ - if (socket_ipv6_is_supported()) - *a = (SocketAddress) { - .sockaddr.in6 = { - .sin6_family = AF_INET6, - .sin6_port = htobe16(port), - .sin6_addr = in6addr_any, - }, - .size = sizeof(struct sockaddr_in6), - }; - else - *a = (SocketAddress) { - .sockaddr.in = { - .sin_family = AF_INET, - .sin_port = htobe16(port), - .sin_addr.s_addr = INADDR_ANY, - }, - .size = sizeof(struct sockaddr_in), - }; + r = in_addr_port_ifindex_name_from_string_auto(s, &family, &address, &port, &ifindex, NULL); + if (r < 0) + return r; - } else { - union in_addr_union address; - int family, ifindex; + if (port == 0) /* No port, no go. */ + return -EINVAL; - r = in_addr_port_ifindex_name_from_string_auto(s, &family, &address, &port, &ifindex, NULL); - if (r < 0) - return r; - - if (port == 0) /* No port, no go. 
*/ - return -EINVAL; - - if (family == AF_INET) - *a = (SocketAddress) { - .sockaddr.in = { - .sin_family = AF_INET, - .sin_addr = address.in, - .sin_port = htobe16(port), - }, - .size = sizeof(struct sockaddr_in), - }; - else if (family == AF_INET6) - *a = (SocketAddress) { - .sockaddr.in6 = { - .sin6_family = AF_INET6, - .sin6_addr = address.in6, - .sin6_port = htobe16(port), - .sin6_scope_id = ifindex, - }, - .size = sizeof(struct sockaddr_in6), - }; - else - assert_not_reached(); - } + if (family == AF_INET) + *a = (SocketAddress) { + .sockaddr.in = { + .sin_family = AF_INET, + .sin_addr = address.in, + .sin_port = htobe16(port), + }, + .size = sizeof(struct sockaddr_in), + }; + else if (family == AF_INET6) + *a = (SocketAddress) { + .sockaddr.in6 = { + .sin6_family = AF_INET6, + .sin6_addr = address.in6, + .sin6_port = htobe16(port), + .sin6_scope_id = ifindex, + }, + .size = sizeof(struct sockaddr_in6), + }; + else + assert_not_reached(); } return 0; From 6c94cfcda5387bcec36867c7639c777179d6e7c2 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 3 Jan 2023 18:08:09 +0100 Subject: [PATCH 2/4] sd_notify: support AF_VSOCK Allow sending notifications via AF_VSOCK, so that VMs can communicate to the hypervisor/VMM that they are finished booting. Note that if the hypervisor does not support SOCK_DGRAM over AF_VSOCK (ie: qemu at the time of writing), SOCK_SEQPACKET will be used instead. --- man/sd_notify.xml | 27 ++++++++--- src/libsystemd/sd-daemon/sd-daemon.c | 67 +++++++++++++++++++++++++--- 2 files changed, 80 insertions(+), 14 deletions(-) diff --git a/man/sd_notify.xml b/man/sd_notify.xml index de402950bb..021cd0384f 100644 --- a/man/sd_notify.xml +++ b/man/sd_notify.xml @@ -368,13 +368,26 @@ These functions send a single datagram with the - state string as payload to the AF_UNIX socket - referenced in the $NOTIFY_SOCKET environment - variable. If the first character of - $NOTIFY_SOCKET is @, the - string is understood as Linux abstract namespace socket. 
The - datagram is accompanied by the process credentials of the sending - service, using SCM_CREDENTIALS. + state string as payload to the socket referenced in the + $NOTIFY_SOCKET environment variable. If the + first character of $NOTIFY_SOCKET is + / or @, the string is understood + as an AF_UNIX or Linux abstract namespace socket + (respectively), and in both cases the datagram is accompanied by the + process credentials of the sending service, using SCM_CREDENTIALS. If + the string starts with vsock: then the string is + understood as an AF_VSOCK address, which is useful + for hypervisors/VMMs or other processes on the host to receive a + notification when a virtual machine has finished booting. Note that in + case the hypervisor does not support SOCK_DGRAM + over AF_VSOCK, SOCK_SEQPACKET + will be used instead. The address should be in the form: + vsock:CID:PORT. Note that unlike other uses of vsock, + the CID is mandatory and cannot be VMADDR_CID_ANY. + Note that PID1 will send the VSOCK packets from a privileged port + (i.e.: lower than 1024), as an attempt to address concerns that unprivileged + processes in the guest might try to send malicious notifications to the + host, driving it to make destructive decisions based on them. 
diff --git a/src/libsystemd/sd-daemon/sd-daemon.c b/src/libsystemd/sd-daemon/sd-daemon.c index 6da351dd9b..8dc11aeb30 100644 --- a/src/libsystemd/sd-daemon/sd-daemon.c +++ b/src/libsystemd/sd-daemon/sd-daemon.c @@ -433,6 +433,23 @@ _public_ int sd_is_mq(int fd, const char *path) { return 1; } +static int vsock_bind_privileged_port(int fd) { + union sockaddr_union sa = { + .vm.svm_family = AF_VSOCK, + .vm.svm_cid = VMADDR_CID_ANY, + .vm.svm_port = 1023, + }; + int r; + + assert(fd >= 0); + + do + r = RET_NERRNO(bind(fd, &sa.sa, sizeof(sa.vm))); + while (r == -EADDRINUSE && --sa.vm.svm_port > 0); + + return r; +} + _public_ int sd_pid_notify_with_fds( pid_t pid, int unset_environment, @@ -440,12 +457,12 @@ _public_ int sd_pid_notify_with_fds( const int *fds, unsigned n_fds) { - union sockaddr_union sockaddr; + SocketAddress address; struct iovec iovec; struct msghdr msghdr = { .msg_iov = &iovec, .msg_iovlen = 1, - .msg_name = &sockaddr, + .msg_name = &address.sockaddr, }; _cleanup_close_ int fd = -EBADF; struct cmsghdr *cmsg = NULL; @@ -467,17 +484,53 @@ _public_ int sd_pid_notify_with_fds( if (!e) return 0; - r = sockaddr_un_set_path(&sockaddr.un, e); + /* Allow AF_UNIX and AF_VSOCK, reject the rest. */ + r = socket_address_parse_unix(&address, e); + if (r == -EPROTO) + r = socket_address_parse_vsock(&address, e); if (r < 0) goto finish; - msghdr.msg_namelen = r; + msghdr.msg_namelen = address.size; - fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); - if (fd < 0) { - r = -errno; + /* If we didn't get an address (which is a normal pattern when specifying VSOCK tuples) error out, + * we always require a specific CID. */ + if (address.sockaddr.vm.svm_family == AF_VSOCK && address.sockaddr.vm.svm_cid == VMADDR_CID_ANY) { + r = -EINVAL; goto finish; } + /* At the time of writing QEMU does not yet support AF_VSOCK + SOCK_DGRAM and returns + * ENODEV. Fallback to SOCK_SEQPACKET in that case. 
*/ + fd = socket(address.sockaddr.sa.sa_family, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (fd < 0) { + if (!(ERRNO_IS_NOT_SUPPORTED(errno) || errno == ENODEV) || address.sockaddr.sa.sa_family != AF_VSOCK) { + r = -errno; + goto finish; + } + + fd = socket(address.sockaddr.sa.sa_family, SOCK_SEQPACKET|SOCK_CLOEXEC, 0); + if (fd < 0) { + r = -errno; + goto finish; + } + + r = vsock_bind_privileged_port(fd); + if (r < 0 && !ERRNO_IS_PRIVILEGE(r)) + goto finish; + + if (connect(fd, &address.sockaddr.sa, address.size) < 0) { + r = -errno; + goto finish; + } + + msghdr.msg_name = NULL; + msghdr.msg_namelen = 0; + } else if (address.sockaddr.sa.sa_family == AF_VSOCK) { + r = vsock_bind_privileged_port(fd); + if (r < 0 && !ERRNO_IS_PRIVILEGE(r)) + goto finish; + } + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); iovec = IOVEC_MAKE_STRING(state); From 4a91ace5bc737d552fa20444d99d1100d9c1f9f7 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 3 Jan 2023 18:11:04 +0100 Subject: [PATCH 3/4] creds: import 'vmm.notify_socket' and use it to set $NOTIFY_SOCKET This is intended to be used with VSOCK, to notify the hypervisor/VMM, e.g. on the host: qemu <...> -smbios type=11,value=io.systemd.credential:vmm.notify_socket=vsock:2:1234 -device vhost-vsock-pci,id=vhost-vsock-pci0,guest-cid=42 (vsock:2:1234 -> send to the host, CID 2, on vsock port 1234; the CID is mandatory, there is no default) Also on the host: $ socat - VSOCK-LISTEN:1234,socktype=5 READY=1 STATUS=Ready. 
--- docs/CREDENTIALS.md | 25 ++++++++++++++++++++++++- man/systemd.system-credentials.xml | 18 ++++++++++++++++++ src/core/import-creds.c | 13 +++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/docs/CREDENTIALS.md b/docs/CREDENTIALS.md index 9e16dd3ba4..debe0a714f 100644 --- a/docs/CREDENTIALS.md +++ b/docs/CREDENTIALS.md @@ -330,6 +330,18 @@ systemd-run -p LoadCredential=mycred -P --wait systemd-creds cat mycred Various services shipped with `systemd` consume credentials for tweaking behaviour: +* [`systemd(1)`](https://www.freedesktop.org/software/systemd/man/systemd.html) + (i.e. PID1, the system manager) will look for the credential `vmm.notify_socket` + and will use it to send a `READY=1` datagram when the system has finished + booting. This is useful for hypervisors/VMMs or other processes on the host + to receive a notification via VSOCK when a virtual machine has finished booting. + Note that in case the hypervisor does not support `SOCK_DGRAM` over `AF_VSOCK`, + `SOCK_SEQPACKET` will be tried instead. The credential payload should be in the + form: `vsock:CID:PORT`, where `CID` is mandatory and must be a specific CID + (`VMADDR_CID_ANY` is rejected). Also note that this requires + support for VHOST to be built into both the guest and the host kernels, and the + kernel modules to be loaded. 
+ * [`systemd-sysusers(8)`](https://www.freedesktop.org/software/systemd/man/systemd-sysusers.html) will look for the credentials `passwd.hashed-password.`, `passwd.plaintext-password.` and `passwd.shell.` to @@ -382,7 +394,8 @@ qemu-system-x86_64 \ ``` This boots the specified disk image via qemu, provisioning public key SSH access -for the root user from the caller's key: +for the root user from the caller's key, and sends a notification when booting +has finished to a process on the host: ``` qemu-system-x86_64 \ @@ -396,8 +409,18 @@ qemu-system-x86_64 \ -drive if=none,id=hd,file=test.raw,format=raw \ -device virtio-scsi-pci,id=scsi \ -device scsi-hd,drive=hd,bootindex=1 \ + -device vhost-vsock-pci,id=vhost-vsock-pci0,guest-cid=42 \ + -smbios type=11,value=io.systemd.credential:vmm.notify_socket=vsock:2:1234 \ -smbios type=11,value=io.systemd.credential.binary:tmpfiles.extra=$(echo "f~ /root/.ssh/authorized_keys 700 root root - $(ssh-add -L | base64 -w 0)" | base64 -w 0) ``` + +A process on the host can listen for the notification, for example: + +``` +$ socat - VSOCK-LISTEN:1234,socktype=5 +READY=1 +``` + ## Relevant Paths From *service* perspective the runtime path to find loaded credentials in is diff --git a/man/systemd.system-credentials.xml b/man/systemd.system-credentials.xml index 3eadf9b985..2a87139a11 100644 --- a/man/systemd.system-credentials.xml +++ b/man/systemd.system-credentials.xml @@ -190,6 +190,24 @@ + + vmm.notify_socket + + This credential is parsed looking for an AF_VSOCK or + AF_UNIX address where to send a READY=1 + notification datagram when the system has finished booting. See: + sd_notify3 + This is useful for hypervisors/VMMs or other processes on the host + to receive a notification via VSOCK when a virtual machine has finished booting. + Note that in case the hypervisor does not support SOCK_DGRAM + over AF_VSOCK, SOCK_SEQPACKET will be + tried instead. 
The credential payload for AF_VSOCK should be + in the form: vsock:CID:PORT, where CID is + mandatory and must be a specific CID; VMADDR_CID_ANY is + rejected. + + + diff --git a/src/core/import-creds.c b/src/core/import-creds.c index 1f5a15f73b..ade509be34 100644 --- a/src/core/import-creds.c +++ b/src/core/import-creds.c @@ -713,5 +713,18 @@ int import_credentials(void) { r = q; } + if (r >= 0) { + _cleanup_free_ char *address = NULL; + + r = read_credential("vmm.notify_socket", (void **)&address, /* ret_size= */ NULL); + if (r < 0 && !IN_SET(r, -ENOENT, -ENXIO)) + log_warning_errno(r, "Failed to read 'vmm.notify_socket' credential, ignoring: %m"); + else if (r >= 0 && !isempty(address)) { + r = setenv("NOTIFY_SOCKET", address, /* replace= */ 1); + if (r < 0) + log_warning_errno(errno, "Failed to set $NOTIFY_SOCKET environment variable, ignoring: %m"); + } + } + return r; } From 03ede612beb6f401cf433e90988b21aa57788bb3 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Mon, 2 Jan 2023 23:43:33 +0100 Subject: [PATCH 4/4] Update TODO --- TODO | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/TODO b/TODO index 8d4b46106f..c4df14ea06 100644 --- a/TODO +++ b/TODO @@ -552,10 +552,6 @@ Features: * sd-boot should look for information what to boot in SMBIOS, too, so that VM managers can tell sd-boot what to boot into and suchlike -* PID 1 should look for an SMBIOS variable that encodes an AF_VSOCK address it - should send sd_notify() ready notifications to. That way a VMM can boot up a - system, and generically know when it finished booting. - * add "systemd-sysext identify" verb, that you can point on any file in /usr/ and that determines from which overlayfs layer it originates, which image, and with what it was signed. @@ -778,13 +774,7 @@ Features: don't query this unnecessarily in entirely uninitialized containers. (i.e. containers with empty /etc). 
-* beef up sd_notify() to support AV_VSOCK in $NOTIFY_SOCKET, so that VM - managers can get ready notifications from VMs, just like container managers - from their payload. Also pick up address from qemu/fw_cfg if set there. - (which has benefits, given SecureBoot and kernel cmdline are not necessarily - friends.) - -* mirroring this: maybe support binding to AV_VSOCK in Type=notify services, +* sd_notify/vsock: maybe support binding to AF_VSOCK in Type=notify services, then passing $NOTIFY_SOCKET and $NOTIFY_GUESTCID with PID1's cid (typically fixed to "2", i.e. the official host cid) and the expected guest cid, for the two sides of the channel. The latter env var could then be used in an