diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 5ab04b121a..b7622b8db4 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3934,6 +3934,7 @@ static DissectImageFlags determine_dissect_image_flags(void) { static int outer_child( Barrier *barrier, const char *directory, + int mount_fd, DissectedImage *dissected_image, int fd_outer_socket, int fd_inner_socket, @@ -3987,7 +3988,23 @@ static int outer_child( if (r < 0) return r; - if (dissected_image) { + /* Put the root dir into the target directory now. One of three mechanisms is provided: either we + * have a single mount fd (typically unprivileged --directory= mode) or we have a fully dissected + * image (--image= mode), or we have a regular path. */ + if (mount_fd >= 0) { + assert(arg_directory); + assert(!arg_image); + + if (move_mount(mount_fd, "", AT_FDCWD, directory, MOVE_MOUNT_F_EMPTY_PATH) < 0) + return log_error_errno(errno, "Failed to attach root directory: %m"); + + mount_fd = safe_close(mount_fd); + log_debug("Successfully attached root directory to '%s'.", directory); + + } else if (dissected_image) { + assert(!arg_directory); + assert(arg_image); + /* If we are operating on a disk image, then mount its root directory now, but leave out the * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest, * but then with the uid shift known. That way we can mount VFAT file systems shifted to the @@ -4004,6 +4021,13 @@ static int outer_child( (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0)); if (r < 0) return r; + } else { + assert(arg_directory); + assert(!arg_image); + + r = mount_nofollow_verbose(LOG_ERR, arg_directory, directory, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL); + if (r < 0) + return r; } r = determine_uid_shift(directory); @@ -4047,27 +4071,6 @@ static int outer_child( "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } - if (path_equal(directory, "/")) { - /* If the directory we shall boot is the host, let's operate on a bind mount at a different - * place, so that we can make changes to its mount structure (for example, to implement - * --volatile=) without this interfering with our ability to access files such as - * /etc/localtime to copy into the container. Note that we use a fixed place for this - * (instead of a temporary directory, since we are living in our own mount namespace here - * already, and thus don't need to be afraid of colliding with anyone else's mounts). */ - (void) mkdir_p("/run/systemd/nspawn-root", 0755); - - r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL); - if (r < 0) - return r; - - directory = "/run/systemd/nspawn-root"; - } - - /* Make sure we always have a mount that we can move to root later on. */ - r = make_mount_point(directory); - if (r < 0) - return r; - /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host * mount namespace. For the directory we are going to run our container let's turn this off, so that * we'll live in our own little world from now on, and propagation from the host may only happen via @@ -5161,6 +5164,8 @@ static int load_oci_bundle(void) { } static int run_container( + const char *directory, + int mount_fd, DissectedImage *dissected_image, int userns_fd, FDSet *fds, @@ -5298,7 +5303,8 @@ static int run_container( (void) reset_signal_mask(); r = outer_child(&barrier, - arg_directory, + directory, + mount_fd, dissected_image, fd_outer_socket_pair[1], fd_inner_socket_pair[1], @@ -5915,14 +5921,14 @@ static int cant_be_in_netns(void) { } static int run(int argc, char *argv[]) { - bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false; - _cleanup_close_ int master = -EBADF, userns_fd = -EBADF; + bool remove_directory = false, remove_image = false, veth_created = false; + _cleanup_close_ int master = -EBADF, userns_fd = -EBADF, mount_fd = -EBADF; _cleanup_fdset_free_ FDSet *fds = NULL; int r, n_fd_passed, ret = EXIT_SUCCESS; char veth_name[IFNAMSIZ] = ""; struct ExposeArgs expose_args = {}; _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT; - char tmprootdir[] = "/tmp/nspawn-root-XXXXXX"; + _cleanup_(rmdir_and_freep) char *rootdir = NULL; _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL; @@ -6023,14 +6029,25 @@ static int run(int argc, char *argv[]) { if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */ arg_quiet = true; - if (arg_directory) { - assert(!arg_image); + if (!arg_privileged) { + /* if we are unprivileged, let's allocate a 64K userns first */ - if (!arg_privileged) { - r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges."); + _cleanup_free_ char *userns_name = NULL; + if (asprintf(&userns_name, "nspawn-" PID_FMT "-%s", getpid_cached(), arg_machine) < 0) { + r = log_oom(); goto finish; } + userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000)); + if (userns_fd < 0) { + r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m"); + goto finish; + } + } + + if (arg_directory) { + assert(!arg_image); + /* Safety precaution: let's not allow running images from the live host OS image, as long as * /var from the host will propagate into container dynamically (because bad things happen if * two systems write to the same /var). Let's allow it for the special cases where /var is @@ -6200,6 +6217,15 @@ static int run(int argc, char *argv[]) { } } + if (!arg_privileged) { + r = mountfsd_mount_directory( + arg_directory, + userns_fd, + determine_dissect_image_flags(), + &mount_fd); + if (r < 0) + goto finish; + } } else { DissectImageFlags dissect_image_flags = determine_dissect_image_flags(); @@ -6274,19 +6300,6 @@ static int run(int argc, char *argv[]) { dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE; } - if (!mkdtemp(tmprootdir)) { - r = log_error_errno(errno, "Failed to create temporary directory: %m"); - goto finish; - } - - remove_tmprootdir = true; - - arg_directory = strdup(tmprootdir); - if (!arg_directory) { - r = log_oom(); - goto finish; - } - if (arg_privileged) { r = loop_device_make_by_path( arg_image, @@ -6339,19 +6352,6 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; } else { - _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine); - if (!userns_name) { - r = log_oom(); - goto finish; - } - - /* if we are unprivileged, let's allocate a 64K userns first */ - userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000)); - if (userns_fd < 0) { - r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m"); - goto finish; - } - r = mountfsd_mount_image( arg_image, userns_fd, @@ -6370,7 +6370,14 @@ static int run(int argc, char *argv[]) { arg_architecture = dissected_image_architecture(dissected_image); } - r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts); + /* Create a temporary place to mount stuff. */ + r = mkdtemp_malloc("/tmp/nspawn-root-XXXXXX", &rootdir); + if (r < 0) { + log_error_errno(r, "Failed to create temporary directory: %m"); + goto finish; + } + + r = custom_mount_prepare_all(rootdir, arg_custom_mounts, arg_n_custom_mounts); if (r < 0) goto finish; @@ -6405,6 +6412,8 @@ static int run(int argc, char *argv[]) { } for (;;) { r = run_container( + rootdir, + mount_fd, dissected_image, userns_fd, fds, @@ -6447,11 +6456,6 @@ finish: log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image); } - if (remove_tmprootdir) { - if (rmdir(tmprootdir) < 0) - log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir); - } - if (arg_machine && arg_privileged) { const char *p;