vmspawn: Add --bind-user= and --bind-user-shell= (#38410)

We use virtiofsd ID translation to mimic idmapped mounts and the
transient userdb credentials to provision the mapped user in the VM.
This commit is contained in:
Daan De Meyer
2025-10-22 20:03:47 +02:00
committed by GitHub
7 changed files with 243 additions and 33 deletions

View File

@@ -459,6 +459,73 @@
<xi:include href="version-info.xml" xpointer="v256"/></listitem>
</varlistentry>
<varlistentry>
<term><option>--bind-user=</option></term>
<listitem><para>Binds the home directory of the specified user on the host into the virtual
machine. Takes the name of an existing user on the host as argument. May be used multiple times to
bind multiple users into the virtual machine. This does two things:</para>
<orderedlist>
<listitem><para>The user's home directory is made available from the host into
<filename>/run/vmhost/home/</filename> using virtiofs. virtiofsd ID translation is used to map the
host user's UID/GID to its assigned UID/GID in the virtual machine.</para></listitem>
<listitem><para>JSON user and group records that describe the mapped user are generated and
passed into the virtual machine using <literal>userdb.transient.*</literal> credentials.
They contain a minimized representation of the host's user record, adjusted to the UID/GID and
home directory path assigned to the user in the virtual machine. The
<citerefentry><refentrytitle>nss-systemd</refentrytitle><manvolnum>8</manvolnum></citerefentry>
glibc NSS module will pick up these records from there and make them available in the virtual
machine's user/group databases.</para></listitem>
</orderedlist>
<para>The combination of the two operations above ensures that it is possible to log into the
virtual machine using the same account information as on the host. The user is only mapped
transiently, while the virtual machine is running, and the mapping itself does not result in
persistent changes to the virtual machine (except maybe for log messages generated at login time,
and similar). Note that in particular the UID/GID assignment in the virtual machine is not made
persistently. If the user is mapped transiently, it is best to not allow the user to make
persistent changes to the virtual machine. If the user leaves files or directories owned by the
user, and those UIDs/GIDs are reused during later virtual machine invocations (possibly with a
different <option>--bind-user=</option> mapping), those files and directories will be accessible to
the "new" user.</para>
<para>The user/group record mapping only works if the virtual machine contains systemd 258 or
newer, with <command>nss-systemd</command> properly configured in
<filename>nsswitch.conf</filename>. See
<citerefentry><refentrytitle>nss-systemd</refentrytitle><manvolnum>8</manvolnum></citerefentry> for
details.</para>
<para>Note that the user record propagated from the host into the virtual machine will contain the
UNIX password hash of the user, so that seamless logins in the virtual machine are possible. If the
virtual machine is less trusted than the host it is hence important to use a strong UNIX password
hash function (e.g. yescrypt or similar, with the <literal>$y$</literal> hash prefix).</para>
<xi:include href="version-info.xml" xpointer="v259"/></listitem>
</varlistentry>
<varlistentry>
<term><option>--bind-user-shell=</option></term>
<listitem><para>When used with <option>--bind-user=</option>, includes the specified shell in the
user records of users bound into the virtual machine. Takes either a boolean or an absolute path.</para>
<itemizedlist>
<listitem><para>If false (the default), no shell is passed in the user records for users bound into
the virtual machine. This causes bound users to use the virtual machine's default shell.</para></listitem>
<listitem><para>If true, the shells specified by the host user records are included in the user records of all users bound into the virtual machine.</para></listitem>
<listitem><para>If passed an absolute path, sets that path as the shell for user records of all users bound into the virtual machine.</para></listitem>
</itemizedlist>
<para>Note: This will not check whether the specified shells exist in the virtual machine.</para>
<para>This operation is only supported in combination with <option>--bind-user=</option>.</para>
<xi:include href="version-info.xml" xpointer="v259"/></listitem>
</varlistentry>
</variablelist>
</refsect2>

View File

@@ -427,6 +427,8 @@ static int help(void) {
" --overlay-ro=PATH[:PATH...]:PATH\n"
" Similar, but creates a read-only overlay mount\n"
" --bind-user=NAME Bind user from host to container\n"
" --bind-user-shell=BOOL|PATH\n"
" Configure the shell to use for --bind-user= users\n"
"\n%3$sInput/Output:%4$s\n"
" --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
" set up for the container.\n"
@@ -4017,6 +4019,7 @@ static int outer_child(
arg_bind_user,
arg_bind_user_shell,
arg_bind_user_shell_copy,
"/run/host/home",
&bind_user_context);
if (r < 0)
return r;

View File

@@ -33,14 +33,14 @@ static int check_etc_passwd_collisions(
if (r == -ENOENT)
return 0; /* no user database? then no user, hence no collision */
if (r < 0)
return log_error_errno(r, "Failed to open /etc/passwd of container: %m");
return log_error_errno(r, "Failed to open /etc/passwd of machine: %m");
for (;;) {
struct passwd *pw;
r = fgetpwent_sane(f, &pw);
if (r < 0)
return log_error_errno(r, "Failed to iterate through /etc/passwd of container: %m");
return log_error_errno(r, "Failed to iterate through /etc/passwd of machine: %m");
if (r == 0) /* EOF */
return 0; /* no collision */
@@ -68,14 +68,14 @@ static int check_etc_group_collisions(
if (r == -ENOENT)
return 0; /* no group database? then no group, hence no collision */
if (r < 0)
return log_error_errno(r, "Failed to open /etc/group of container: %m");
return log_error_errno(r, "Failed to open /etc/group of machine: %m");
for (;;) {
struct group *gr;
r = fgetgrent_sane(f, &gr);
if (r < 0)
return log_error_errno(r, "Failed to iterate through /etc/group of container: %m");
return log_error_errno(r, "Failed to iterate through /etc/group of machine: %m");
if (r == 0)
return 0; /* no collision */
@@ -93,6 +93,7 @@ static int convert_user(
uid_t allocate_uid,
const char *shell,
bool shell_copy,
const char *home_mount_directory,
UserRecord **ret_converted_user,
GroupRecord **ret_converted_group) {
@@ -114,16 +115,16 @@ static int convert_user(
return r;
if (r > 0)
return log_error_errno(SYNTHETIC_ERRNO(EBUSY),
"Sorry, the user '%s' already exists in the container.", u->user_name);
"Sorry, the user '%s' already exists in the machine.", u->user_name);
r = check_etc_group_collisions(directory, g->group_name, GID_INVALID);
if (r < 0)
return r;
if (r > 0)
return log_error_errno(SYNTHETIC_ERRNO(EBUSY),
"Sorry, the group '%s' already exists in the container.", g->group_name);
"Sorry, the group '%s' already exists in the machine.", g->group_name);
h = path_join("/run/host/home/", u->user_name);
h = path_join(home_mount_directory, u->user_name);
if (!h)
return log_oom();
@@ -148,7 +149,7 @@ static int convert_user(
SD_JSON_BUILD_PAIR_CONDITION(!strv_isempty(u->hashed_password), "hashedPassword", SD_JSON_BUILD_VARIANT(hp)),
SD_JSON_BUILD_PAIR_CONDITION(!!ssh, "sshAuthorizedKeys", SD_JSON_BUILD_VARIANT(ssh))))));
if (r < 0)
return log_error_errno(r, "Failed to build container user record: %m");
return log_error_errno(r, "Failed to build machine user record: %m");
r = group_record_build(
&converted_group,
@@ -158,7 +159,7 @@ static int convert_user(
SD_JSON_BUILD_PAIR_CONDITION(g->disposition >= 0, "disposition", SD_JSON_BUILD_STRING(user_disposition_to_string(g->disposition))),
SD_JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NSpawn"))));
if (r < 0)
return log_error_errno(r, "Failed to build container group record: %m");
return log_error_errno(r, "Failed to build machine group record: %m");
*ret_converted_user = TAKE_PTR(converted_user);
*ret_converted_group = TAKE_PTR(converted_group);
@@ -175,7 +176,7 @@ static int find_free_uid(const char *directory, uid_t *current_uid) {
if (*current_uid > MAP_UID_MAX)
return log_error_errno(
SYNTHETIC_ERRNO(EBUSY),
"No suitable available UID in range " UID_FMT "" UID_FMT " in container detected, can't map user.",
"No suitable available UID in range " UID_FMT "" UID_FMT " in machine detected, can't map user.",
MAP_UID_MIN, MAP_UID_MAX);
r = check_etc_passwd_collisions(directory, NULL, *current_uid);
@@ -210,6 +211,7 @@ int machine_bind_user_prepare(
char **bind_user,
const char *bind_user_shell,
bool bind_user_shell_copy,
const char *bind_user_home_mount_directory,
MachineBindUserContext **ret) {
_cleanup_(machine_bind_user_context_freep) MachineBindUserContext *c = NULL;
@@ -219,7 +221,7 @@ int machine_bind_user_prepare(
assert(ret);
/* This resolves the users specified in 'bind_user', generates a minimalized JSON user + group record
* for it to stick in the container, allocates a UID/GID for it, and updates the custom mount table,
* for it to stick in the machine, allocates a UID/GID for it, and updates the custom mount table,
* to include an appropriate bind mount mapping.
*
* This extends the passed custom_mounts/n_custom_mounts with the home directories, and allocates a
@@ -264,13 +266,13 @@ int machine_bind_user_prepare(
if (r < 0)
return log_error_errno(r, "Failed to resolve group of user '%s': %m", u->user_name);
/* We want to synthesize exactly one user + group from the host into the container. This only
/* We want to synthesize exactly one user + group from the host into the machine. This only
* makes sense if the user on the host has its own private group. We can't reasonably check
* this, so we just check of the name of user and group match.
*
* One of these days we might want to support users in a shared/common group too, but it's
* not clear to me how this would have to be mapped, precisely given that the common group
* probably already exists in the container. */
* probably already exists in the machine. */
if (!streq(u->user_name, g->group_name))
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
"Sorry, mapping users without private groups is currently not supported.");
@@ -279,7 +281,14 @@ int machine_bind_user_prepare(
if (r < 0)
return r;
r = convert_user(directory, u, g, current_uid, bind_user_shell, bind_user_shell_copy, &cu, &cg);
r = convert_user(
directory,
u, g,
current_uid,
bind_user_shell,
bind_user_shell_copy,
bind_user_home_mount_directory,
&cu, &cg);
if (r < 0)
return r;

View File

@@ -27,4 +27,5 @@ int machine_bind_user_prepare(
char **bind_user,
const char *bind_user_shell,
bool bind_user_shell_copy,
const char *bind_user_home_mount_directory,
MachineBindUserContext **ret);

View File

@@ -7,7 +7,7 @@
#include "string-util.h"
#include "vmspawn-mount.h"
static void runtime_mount_done(RuntimeMount *mount) {
void runtime_mount_done(RuntimeMount *mount) {
assert(mount);
mount->source = mfree(mount->source);
@@ -24,7 +24,11 @@ void runtime_mount_context_done(RuntimeMountContext *ctx) {
}
int runtime_mount_parse(RuntimeMountContext *ctx, const char *s, bool read_only) {
_cleanup_(runtime_mount_done) RuntimeMount mount = { .read_only = read_only };
_cleanup_(runtime_mount_done) RuntimeMount mount = {
.read_only = read_only,
.source_uid = UID_INVALID,
.target_uid = UID_INVALID,
};
_cleanup_free_ char *source_rel = NULL;
int r;

View File

@@ -6,7 +6,9 @@
/* Describes one host directory that is exported to the virtual machine. Entries are created by
 * runtime_mount_parse() and, for --bind-user=, by bind_user_setup(); resources are released with
 * runtime_mount_done(). */
typedef struct RuntimeMount {
/* Initialized from the 'read_only' argument of runtime_mount_parse(). */
bool read_only;
/* Host-side path; handed to start_virtiofsd() as the directory to serve. Heap-allocated and
 * released by runtime_mount_done(). */
char *source;
/* UID on the host side of the virtiofsd ID translation. UID_INVALID (the value set by
 * runtime_mount_parse()) means no translation is configured for this mount. */
uid_t source_uid;
/* Destination of the mount as seen by the virtual machine; for --bind-user= entries this is the
 * payload user's home directory. NOTE(review): presumably a guest-side path for mounts produced by
 * runtime_mount_parse() as well, confirm against that parser. */
char *target;
/* UID on the virtual machine side of the virtiofsd ID translation; UID_INVALID means no
 * translation (both UIDs must be valid for start_virtiofsd() to enable it). */
uid_t target_uid;
} RuntimeMount;
typedef struct RuntimeMountContext {
@@ -14,5 +16,6 @@ typedef struct RuntimeMountContext {
size_t n_mounts;
} RuntimeMountContext;
void runtime_mount_done(RuntimeMount *mount);
void runtime_mount_context_done(RuntimeMountContext *ctx);
int runtime_mount_parse(RuntimeMountContext *ctx, const char *s, bool read_only);

View File

@@ -35,11 +35,13 @@
#include "format-util.h"
#include "fs-util.h"
#include "gpt.h"
#include "group-record.h"
#include "hexdecoct.h"
#include "hostname-setup.h"
#include "hostname-util.h"
#include "id128-util.h"
#include "log.h"
#include "machine-bind-user.h"
#include "machine-credential.h"
#include "main-func.h"
#include "mkdir.h"
@@ -68,6 +70,8 @@
#include "terminal-util.h"
#include "tmpfile-util.h"
#include "unit-name.h"
#include "user-record.h"
#include "user-util.h"
#include "utf8.h"
#include "vmspawn-mount.h"
#include "vmspawn-register.h"
@@ -136,6 +140,9 @@ static char *arg_tpm_state_path = NULL;
static TpmStateMode arg_tpm_state_mode = TPM_STATE_AUTO;
static bool arg_ask_password = true;
static bool arg_notify_ready = true;
static char **arg_bind_user = NULL;
static char *arg_bind_user_shell = NULL;
static bool arg_bind_user_shell_copy = false;
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
@@ -155,6 +162,8 @@ STATIC_DESTRUCTOR_REGISTER(arg_ssh_key_type, freep);
STATIC_DESTRUCTOR_REGISTER(arg_smbios11, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_tpm_state_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user_shell, freep);
static int help(void) {
_cleanup_free_ char *link = NULL;
@@ -215,6 +224,9 @@ static int help(void) {
" --bind-ro=SOURCE[:TARGET]\n"
" Mount a file or directory, but read-only\n"
" --extra-drive=PATH Adds an additional disk to the virtual machine\n"
" --bind-user=NAME Bind user from host to virtual machine\n"
" --bind-user-shell=BOOL|PATH\n"
" Configure the shell to use for --bind-user= users\n"
"\n%3$sIntegration:%4$s\n"
" --forward-journal=FILE|DIR\n"
" Forward the VM's journal to the host\n"
@@ -289,6 +301,8 @@ static int parse_argv(int argc, char *argv[]) {
ARG_NO_ASK_PASSWORD,
ARG_PROPERTY,
ARG_NOTIFY_READY,
ARG_BIND_USER,
ARG_BIND_USER_SHELL,
};
static const struct option options[] = {
@@ -338,6 +352,8 @@ static int parse_argv(int argc, char *argv[]) {
{ "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD },
{ "property", required_argument, NULL, ARG_PROPERTY },
{ "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{ "bind-user", required_argument, NULL, ARG_BIND_USER },
{ "bind-user-shell", required_argument, NULL, ARG_BIND_USER_SHELL },
{}
};
@@ -675,6 +691,30 @@ static int parse_argv(int argc, char *argv[]) {
break;
case ARG_BIND_USER:
if (!valid_user_group_name(optarg, /* flags= */ 0))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
if (strv_extend(&arg_bind_user, optarg) < 0)
return log_oom();
break;
case ARG_BIND_USER_SHELL: {
bool copy = false;
char *sh = NULL;
r = parse_user_shell(optarg, &sh, &copy);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
return log_error_errno(r, "Invalid user shell to bind: %s", optarg);
free_and_replace(arg_bind_user_shell, sh);
arg_bind_user_shell_copy = copy;
break;
}
case '?':
return -EINVAL;
@@ -682,6 +722,12 @@ static int parse_argv(int argc, char *argv[]) {
assert_not_reached();
}
/* Drop duplicate --bind-user= entries */
strv_uniq(arg_bind_user);
if (arg_bind_user_shell && strv_isempty(arg_bind_user))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --bind-user-shell= without --bind-user=");
if (argc > optind) {
arg_kernel_cmdline_extra = strv_copy(argv + optind);
if (!arg_kernel_cmdline_extra)
@@ -1359,7 +1405,9 @@ static int find_virtiofsd(char **ret) {
static int start_virtiofsd(
const char *scope,
const char *directory,
bool uidmap,
uid_t source_uid,
uid_t target_uid,
uid_t uid_range,
const char *runtime_dir,
const char *sd_socket_activate,
char **ret_listen_address,
@@ -1397,20 +1445,20 @@ static int start_virtiofsd(
if (!argv)
return log_oom();
if (uidmap && arg_uid_shift != UID_INVALID) {
r = strv_extend(&argv, "--uid-map");
if (source_uid != UID_INVALID && target_uid != UID_INVALID && uid_range != UID_INVALID) {
r = strv_extend(&argv, "--translate-uid");
if (r < 0)
return log_oom();
r = strv_extendf(&argv, ":0:" UID_FMT ":" UID_FMT ":", arg_uid_shift, arg_uid_range);
r = strv_extendf(&argv, "map:" UID_FMT ":" UID_FMT ":" UID_FMT, target_uid, source_uid, uid_range);
if (r < 0)
return log_oom();
r = strv_extend(&argv, "--gid-map");
r = strv_extend(&argv, "--translate-gid");
if (r < 0)
return log_oom();
r = strv_extendf(&argv, ":0:" GID_FMT ":" GID_FMT ":", arg_uid_shift, arg_uid_range);
r = strv_extendf(&argv, "map:" GID_FMT ":" GID_FMT ":" GID_FMT, target_uid, source_uid, uid_range);
if (r < 0)
return log_oom();
}
@@ -1425,6 +1473,65 @@ static int start_virtiofsd(
return 0;
}
/* Applies a prepared --bind-user= context to the VM configuration.
 *
 * For every bound user this:
 *   1. serializes the payload user record and registers it as credential
 *      "userdb.transient.user.<user name>",
 *   2. serializes the payload group record and registers it as credential
 *      "userdb.transient.group.<group name>",
 *   3. appends a runtime mount that maps the host user's home directory and UID to the payload user's
 *      home directory and UID.
 *
 * 'context' may be NULL, in which case nothing is done. 'credentials' and 'mounts' must be non-NULL.
 * Returns 0 on success and a negative value on failure (callers test for r < 0). */
static int bind_user_setup(
                const MachineBindUserContext *context,
                MachineCredentialContext *credentials,
                RuntimeMountContext *mounts) {

        int r;

        assert(credentials);
        assert(mounts);

        /* No users were bound, hence nothing to set up. */
        if (!context)
                return 0;

        FOREACH_ARRAY(entry, context->data, context->n_data) {
                _cleanup_free_ char *user_json = NULL, *user_cred = NULL;
                _cleanup_free_ char *group_json = NULL, *group_cred = NULL;

                /* Step 1: the user record. */
                r = sd_json_variant_format(entry->payload_user->json, SD_JSON_FORMAT_NEWLINE, &user_json);
                if (r < 0)
                        return log_error_errno(r, "Failed to format JSON user record: %m");

                user_cred = strjoin("userdb.transient.user.", entry->payload_user->user_name);
                if (!user_cred)
                        return log_oom();

                r = machine_credential_add(credentials, user_cred, user_json, SIZE_MAX);
                if (r < 0)
                        return r;

                /* Step 2: the matching group record. */
                r = sd_json_variant_format(entry->payload_group->json, SD_JSON_FORMAT_NEWLINE, &group_json);
                if (r < 0)
                        return log_error_errno(r, "Failed to format JSON group record: %m");

                group_cred = strjoin("userdb.transient.group.", entry->payload_group->group_name);
                if (!group_cred)
                        return log_oom();

                r = machine_credential_add(credentials, group_cred, group_json, SIZE_MAX);
                if (r < 0)
                        return r;

                /* Step 3: the home directory mount. Both paths are duplicated so that the mount table
                 * owns its strings independently of the user records. */
                _cleanup_(runtime_mount_done) RuntimeMount home_mount = {
                        .source = strdup(user_record_home_directory(entry->host_user)),
                        .source_uid = entry->host_user->uid,
                        .target = strdup(user_record_home_directory(entry->payload_user)),
                        .target_uid = entry->payload_user->uid,
                };
                if (!(home_mount.source && home_mount.target))
                        return log_oom();

                if (!GREEDY_REALLOC(mounts->mounts, mounts->n_mounts + 1))
                        return log_oom();

                /* Ownership of the strings moves into the array; the local copy is zeroed so its cleanup
                 * handler has nothing left to free. */
                mounts->mounts[mounts->n_mounts] = TAKE_STRUCT(home_mount);
                mounts->n_mounts++;
        }

        return 0;
}
static int kernel_cmdline_maybe_append_root(void) {
int r;
bool cmdline_contains_root = strv_find_startswith(arg_kernel_cmdline_extra, "root=")
@@ -1726,6 +1833,21 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
if (r < 0)
return log_error_errno(r, "Failed to find OVMF config: %m");
_cleanup_(machine_bind_user_context_freep) MachineBindUserContext *bind_user_context = NULL;
r = machine_bind_user_prepare(
/* directory= */ NULL,
arg_bind_user,
arg_bind_user_shell,
arg_bind_user_shell_copy,
"/run/vmhost/home",
&bind_user_context);
if (r < 0)
return r;
r = bind_user_setup(bind_user_context, &arg_credentials, &arg_runtime_mounts);
if (r < 0)
return r;
/* only warn if the user hasn't disabled secureboot */
if (!ovmf_config->supports_sb && arg_secure_boot)
log_warning("Couldn't find OVMF firmware blob with Secure Boot support, "
@@ -2177,7 +2299,9 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
r = start_virtiofsd(
unit,
arg_directory,
/* uidmap= */ true,
/* source_uid= */ arg_uid_shift,
/* target_uid= */ 0,
/* uid_range= */ arg_uid_range,
runtime_dir,
sd_socket_activate,
&listen_address,
@@ -2267,7 +2391,9 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
r = start_virtiofsd(
unit,
mount->source,
/* uidmap= */ false,
/* source_uid= */ mount->source_uid,
/* target_uid= */ mount->target_uid,
/* uid_range= */ 1U,
runtime_dir,
sd_socket_activate,
&listen_address,
@@ -2441,7 +2567,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
}
if (arg_forward_journal) {
_cleanup_free_ char *listen_address = NULL, *cred = NULL;
_cleanup_free_ char *listen_address = NULL;
if (!GREEDY_REALLOC(children, n_children + 1))
return log_oom();
@@ -2459,11 +2585,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
pidref_done(&child);
children[n_children++] = TAKE_PTR(source);
cred = strjoin("journal.forward_to_socket:", listen_address);
if (!cred)
return log_oom();
r = machine_credential_set(&arg_credentials, cred);
r = machine_credential_add(&arg_credentials, "journal.forward_to_socket", listen_address, SIZE_MAX);
if (r < 0)
return r;
}
@@ -2509,13 +2631,14 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
/* on distros that provide their own sshd@.service file we need to provide a dropin which
* picks up our public key credential */
r = machine_credential_set(
r = machine_credential_add(
&arg_credentials,
"systemd.unit-dropin.sshd-vsock@.service:"
"systemd.unit-dropin.sshd-vsock@.service",
"[Service]\n"
"ExecStart=\n"
"ExecStart=-sshd -i -o 'AuthorizedKeysFile=%d/ssh.ephemeral-authorized_keys-all .ssh/authorized_keys'\n"
"ImportCredential=ssh.ephemeral-authorized_keys-all\n");
"ImportCredential=ssh.ephemeral-authorized_keys-all\n",
SIZE_MAX);
if (r < 0)
return log_error_errno(r, "Failed to set credential systemd.unit-dropin.sshd-vsock@.service: %m");
}