diff --git a/src/mountfsd/io.systemd.mount-file-system.policy b/src/mountfsd/io.systemd.mount-file-system.policy index 6a151eb437..78613bfdaf 100644 --- a/src/mountfsd/io.systemd.mount-file-system.policy +++ b/src/mountfsd/io.systemd.mount-file-system.policy @@ -67,4 +67,53 @@ io.systemd.mount-file-system.mount-image-privately + + + + + Allow mounting of directory + Authentication is required for an application to mount directory $(directory). + + auth_admin_keep + auth_admin_keep + yes + + + + + + Allow mounting of untrusted directory + Authentication is required for an application to mount directory $(directory) which is not owned by the user. + + auth_admin + auth_admin + auth_admin + + + io.systemd.mount-file-system.mount-directory + + + + + Allow private mounting of directory + Authentication is required for an application to privately mount directory $(directory). + + yes + yes + yes + + + + + Allow private mounting of untrusted directory + Authentication is required for an application to privately mount directory $(directory) which is not owned by the user. + + auth_admin + auth_admin + auth_admin + + + io.systemd.mount-file-system.mount-directory-privately + diff --git a/src/mountfsd/mountwork.c b/src/mountfsd/mountwork.c index 31f2d47546..2fd610f5d6 100644 --- a/src/mountfsd/mountwork.c +++ b/src/mountfsd/mountwork.c @@ -1,5 +1,10 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include +#if WANT_LINUX_FS_H +#include +#endif + #include "sd-daemon.h" #include "sd-varlink.h" @@ -15,12 +20,17 @@ #include "json-util.h" #include "main-func.h" #include "missing_loop.h" +#include "missing_mount.h" +#include "missing_syscall.h" #include "namespace-util.h" #include "nsresource.h" #include "nulstr-util.h" #include "os-util.h" #include "process-util.h" #include "stat-util.h" +#include "string-table.h" +#include "uid-classification.h" +#include "uid-range.h" #include "user-util.h" #include "varlink-io.systemd.MountFileSystem.h" #include "varlink-util.h" @@ -532,6 +542,349 @@ static int vl_method_mount_image( SD_JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(di->image_uuid), "imageUuid", SD_JSON_BUILD_UUID(di->image_uuid))); } +typedef enum MountMapMode { + MOUNT_MAP_AUTO = 0, /* determine automatically from image and caller */ + MOUNT_MAP_ROOT, /* map caller's UID to root in namespace (map 1 UID only) */ + MOUNT_MAP_FOREIGN, /* map foreign UID range to base in namespace (map 64K) */ + MOUNT_MAP_IDENTITY, /* apply identity mapping (map 64K) */ + _MOUNT_MAP_MODE_MAX, + _MOUNT_MAP_MODE_INVALID = -EINVAL, +} MountMapMode; + +static const char *const mount_map_mode_table[_MOUNT_MAP_MODE_MAX] = { + [MOUNT_MAP_AUTO] = "auto", + [MOUNT_MAP_ROOT] = "root", + [MOUNT_MAP_FOREIGN] = "foreign", + [MOUNT_MAP_IDENTITY] = "identity", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(mount_map_mode, MountMapMode); + +typedef struct MountDirectoryParameters { + MountMapMode mode; + unsigned directory_fd_idx; + unsigned userns_fd_idx; + int read_only; +} MountDirectoryParameters; + +typedef enum DirectoryOwnership { + DIRECTORY_IS_ROOT_PEER_OWNED, /* This is returned if the directory is owned by the root user and the peer is root */ + DIRECTORY_IS_ROOT_OWNED, /* This is returned if the directory is owned by the root user (and the peer user is not root) */ + DIRECTORY_IS_PEER_OWNED, /* This is returned if the directory is owned by the peer user (who is not root) */ + DIRECTORY_IS_FOREIGN_OWNED, /* This is returned if the directory is owned by the foreign UID range */ + DIRECTORY_IS_OTHERWISE_OWNED, /* This is returned if the directory is owned by something else */ + _DIRECTORY_OWNERSHIP_MAX, + _DIRECTORY_OWNERSHIP_ERRNO_MAX = -ERRNO_MAX, /* Guarantee the whole negative errno range fits */ +} DirectoryOwnership; + +static MountMapMode default_mount_map_mode(DirectoryOwnership ownership) { + /* Derives a suitable mapping mode from the ownership of the base tree */ + + switch (ownership) { + case DIRECTORY_IS_PEER_OWNED: + return MOUNT_MAP_ROOT; /* Map the peer's UID to root in the container */ + + case DIRECTORY_IS_FOREIGN_OWNED: + return MOUNT_MAP_FOREIGN; /* Map the foreign UID range to the container's UID range */ + + case DIRECTORY_IS_ROOT_PEER_OWNED: + case DIRECTORY_IS_ROOT_OWNED: + case DIRECTORY_IS_OTHERWISE_OWNED: + return MOUNT_MAP_IDENTITY; /* Don't map */ + + default: + return _MOUNT_MAP_MODE_INVALID; + } +} + +static JSON_DISPATCH_ENUM_DEFINE(dispatch_mount_directory_mode, MountMapMode, mount_map_mode_from_string); + +static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) { + int r, fl; + + assert(fd >= 0); + + /* Checks if the specified directory fd looks sane. Returns a DirectoryOwnership that categorizes the + * ownership situation in comparison to the peer's UID. + * + * Note one key difference to image validation (as implemented above): for regular files if the + * client provided us with an open fd it implies the client has access, as well as what kind of + * access (i.e. ro or rw). But for directories this doesn't work the same way, as directories are + * always opened read-only only. Hence we use a different mechanism to validate access to them: we + * check if the directory is owned by the peer UID or by the foreign UID range (in the latter case + * one of the parent directories must be owned by the peer though). */ + + struct stat st; + if (fstat(fd, &st) < 0) + return log_debug_errno(errno, "Failed to stat() directory fd: %m"); + + r = stat_verify_directory(&st); + if (r < 0) + return r; + + fl = fd_verify_safe_flags_full(fd, O_DIRECTORY); + if (fl < 0) + return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m"); + + if (st.st_uid == 0) { + if (peer_uid == 0) { + log_debug("Directory file descriptor points to root owned directory, who is also the peer."); + return DIRECTORY_IS_ROOT_PEER_OWNED; + } + log_debug("Directory file descriptor points to root owned directory."); + return DIRECTORY_IS_ROOT_OWNED; + } + if (st.st_uid == peer_uid) { + log_debug("Directory file descriptor points to peer owned directory."); + return DIRECTORY_IS_PEER_OWNED; + } + + /* For bind mounted directories we check if they are either owned by the client's UID, or by the + * foreign UID set, but in that case the parent directory must be owned by the client's UID, or some + * directory iteratively up the chain */ + + _cleanup_close_ int parent_fd = -EBADF; + unsigned n_level; + for (n_level = 0; n_level < 16; n_level++) { + /* Stop iteration if we find a directory up the tree that is neither owned by the user, nor is from the foreign UID range */ + if (!uid_is_foreign(st.st_uid) || !gid_is_foreign(st.st_gid)) { + log_debug("Directory file descriptor points to directory which itself or its parents is neither owned by foreign UID range nor by the user."); + return DIRECTORY_IS_OTHERWISE_OWNED; + } + + /* If the peer is root, then it doesn't matter if we find a parent owned by root, let's shortcut things. */ + if (peer_uid == 0) { + log_debug("Directory file descriptor is owned by foreign UID range, and peer is root."); + return DIRECTORY_IS_FOREIGN_OWNED; + } + + /* Go one level up */ + _cleanup_close_ int new_parent_fd = openat(fd, "..", O_DIRECTORY|O_PATH|O_CLOEXEC); + if (new_parent_fd < 0) + return log_debug_errno(errno, "Failed to open parent directory of directory file descriptor: %m"); + + struct stat new_st; + if (fstat(new_parent_fd, &new_st) < 0) + return log_debug_errno(errno, "Failed to stat parent directory of directory file descriptor: %m"); + + /* Safety check to see if we hit the root dir */ + if (stat_inode_same(&st, &new_st)) { + log_debug("Directory file descriptor is owned by foreign UID range, but didn't find parent directory that is owned by peer among ancestors."); + return DIRECTORY_IS_OTHERWISE_OWNED; + } + + if (new_st.st_uid == peer_uid) { /* Parent inode is owned by the peer. That's good! Everything's fine. */ + log_debug("Directory file descriptor is owned by foreign UID range, and ancestor is owned by peer."); + return DIRECTORY_IS_FOREIGN_OWNED; + } + + close_and_replace(parent_fd, new_parent_fd); + st = new_st; + } + + log_debug("Failed to find peer owned parent directory after %u levels, refusing.", n_level); + return DIRECTORY_IS_OTHERWISE_OWNED; +} + +static int vl_method_mount_directory( + sd_varlink *link, + sd_json_variant *parameters, + sd_varlink_method_flags_t flags, + void *userdata) { + + static const sd_json_dispatch_field dispatch_table[] = { + { "mode", SD_JSON_VARIANT_STRING, dispatch_mount_directory_mode, offsetof(MountDirectoryParameters, mode), 0 }, + { "directoryFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint, offsetof(MountDirectoryParameters, directory_fd_idx), SD_JSON_MANDATORY }, + { "userNamespaceFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint, offsetof(MountDirectoryParameters, userns_fd_idx), 0 }, + { "readOnly", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_tristate, offsetof(MountDirectoryParameters, read_only), 0 }, + VARLINK_DISPATCH_POLKIT_FIELD, + {} + }; + + MountDirectoryParameters p = { + .mode = MOUNT_MAP_AUTO, + .directory_fd_idx = UINT_MAX, + .userns_fd_idx = UINT_MAX, + .read_only = -1, + }; + _cleanup_close_ int directory_fd = -EBADF, userns_fd = -EBADF; + Hashmap **polkit_registry = ASSERT_PTR(userdata); + int r; + + r = sd_varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (p.directory_fd_idx == UINT_MAX) + return sd_varlink_error_invalid_parameter_name(link, "directoryFileDescriptor"); + + directory_fd = sd_varlink_peek_dup_fd(link, p.directory_fd_idx); + if (directory_fd < 0) + return log_debug_errno(directory_fd, "Failed to peek directory fd from client: %m"); + + if (p.userns_fd_idx != UINT_MAX) { + userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return log_debug_errno(userns_fd, "Failed to peek user namespace fd from client: %m"); + } + + uid_t peer_uid; + r = sd_varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return log_debug_errno(r, "Failed to get client UID: %m"); + + DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid); + if (owned_by < 0) + return owned_by; + + r = validate_userns(link, &userns_fd); + if (r != 0) + return r; + + /* If no mode is specified, pick sensible default */ + if (p.mode <= 0) { + p.mode = default_mount_map_mode(owned_by); + assert(p.mode > 0); + } + + _cleanup_free_ char *directory_path = NULL; + (void) fd_get_path(directory_fd, &directory_path); + + log_debug("Mounting '%s' with mapping mode: %s", strna(directory_path), mount_map_mode_to_string(p.mode)); + + const char *polkit_details[] = { + "read_only", one_zero(p.read_only > 0), + "directory", strna(directory_path), + NULL, + }; + + const char *polkit_action, *polkit_untrusted_action; + PolkitFlags polkit_flags; + if (userns_fd < 0) { + /* Mount into the host user namespace */ + polkit_action = "io.systemd.mount-file-system.mount-directory"; + polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-directory"; + polkit_flags = 0; + } else { + /* Mount into a private user namespace */ + polkit_action = "io.systemd.mount-file-system.mount-directory-privately"; + polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-directory-privately"; + + /* If polkit is not around, let's allow mounting authenticated images by default */ + polkit_flags = POLKIT_DEFAULT_ALLOW; + } + + /* We consider a directory "trusted" if it is owned by the peer or the foreign UID range */ + bool trusted_directory = IN_SET(owned_by, DIRECTORY_IS_ROOT_PEER_OWNED, DIRECTORY_IS_PEER_OWNED, DIRECTORY_IS_FOREIGN_OWNED); + + /* Let's definitely acquire the regular action privilege, for mounting properly signed images */ + r = varlink_verify_polkit_async_full( + link, + /* bus= */ NULL, + trusted_directory ? polkit_action : polkit_untrusted_action, + polkit_details, + /* good_user= */ UID_INVALID, + trusted_directory ? polkit_flags : 0, + polkit_registry); + if (r <= 0) + return r; + + /* Generate the common dissection directory here. We are not going to use it, but the clients might, + * and they likely are unprivileged, hence cannot create it themselves. Hence let's just create it + * here, if it is missing. */ + r = get_common_dissect_directory(NULL); + if (r < 0) + return r; + + _cleanup_close_ int mount_fd = open_tree(directory_fd, "", OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH); + if (mount_fd < 0) + return log_debug_errno(errno, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path)); + + if (p.read_only > 0 && mount_setattr( + mount_fd, "", AT_EMPTY_PATH, + &(struct mount_attr) { + .attr_set = MOUNT_ATTR_RDONLY, + }, MOUNT_ATTR_SIZE_VER0) < 0) + return log_debug_errno(errno, "Failed to enable read-only mode: %m"); + + if (p.mode != MOUNT_MAP_IDENTITY) { + uid_t start; + + if (userns_fd >= 0) { + _cleanup_(uid_range_freep) UIDRange *uid_range_outside = NULL, *uid_range_inside = NULL, *gid_range_outside = NULL, *gid_range_inside = NULL; + r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &uid_range_outside); + if (r < 0) + return log_debug_errno(r, "Failed to load outside UID range of provided userns: %m"); + r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_INSIDE, &uid_range_inside); + if (r < 0) + return log_debug_errno(r, "Failed to load inside UID range of provided userns: %m"); + r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &gid_range_outside); + if (r < 0) + return log_debug_errno(r, "Failed to load outside GID range of provided userns: %m"); + r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_INSIDE, &gid_range_inside); + if (r < 0) + return log_debug_errno(r, "Failed to load inside GID range of provided userns: %m"); + + /* Be very strict for now */ + if (!uid_range_equal(uid_range_outside, gid_range_outside) || + !uid_range_equal(uid_range_inside, gid_range_inside) || + uid_range_outside->n_entries != 1 || + uid_range_outside->entries[0].nr != 0x10000 || + uid_range_inside->n_entries != 1 || + uid_range_inside->entries[0].start != 0 || + uid_range_inside->entries[0].nr != 0x10000) + return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + start = uid_range_outside->entries[0].start; + } else + start = 0; + + _cleanup_free_ char *new_uid_map = NULL; + switch (p.mode) { + case MOUNT_MAP_ROOT: + r = strextendf(&new_uid_map, UID_FMT " " UID_FMT " " UID_FMT, + peer_uid, start, (uid_t) 1); + break; + case MOUNT_MAP_FOREIGN: + r = strextendf(&new_uid_map, UID_FMT " " UID_FMT " " UID_FMT, + (uid_t) FOREIGN_UID_MIN, start, (uid_t) 0x10000); + break; + default: + assert_not_reached(); + } + if (r < 0) + return r; + + _cleanup_close_ int idmap_userns_fd = userns_acquire(new_uid_map, new_uid_map); + if (idmap_userns_fd < 0) + return log_debug_errno(idmap_userns_fd, "Failed to acquire user namespace for id mapping: %m"); + + if (mount_setattr(mount_fd, "", AT_EMPTY_PATH, + &(struct mount_attr) { + .attr_set = MOUNT_ATTR_IDMAP, + .userns_fd = idmap_userns_fd, + .propagation = MS_PRIVATE, + }, MOUNT_ATTR_SIZE_VER0) < 0) + return log_debug_errno(errno, "Failed to enable id mapping: %m"); + } + + if (userns_fd >= 0) { + r = nsresource_add_mount(userns_fd, mount_fd); + if (r < 0) + return r; + } + + int fd_idx = sd_varlink_push_fd(link, mount_fd); + if (fd_idx < 0) + return fd_idx; + + TAKE_FD(mount_fd); + + return sd_varlink_replybo( + link, + SD_JSON_BUILD_PAIR("mountFileDescriptor", SD_JSON_BUILD_INTEGER(fd_idx))); +} + static int process_connection(sd_varlink_server *server, int _fd) { _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */ _cleanup_(sd_varlink_close_unrefp) sd_varlink *vl = NULL; @@ -601,7 +954,8 @@ static int run(int argc, char *argv[]) { r = sd_varlink_server_bind_method_many( server, - "io.systemd.MountFileSystem.MountImage", vl_method_mount_image); + "io.systemd.MountFileSystem.MountImage", vl_method_mount_image, + "io.systemd.MountFileSystem.MountDirectory", vl_method_mount_directory); if (r < 0) return log_error_errno(r, "Failed to bind methods: %m"); diff --git a/src/shared/varlink-io.systemd.MountFileSystem.c b/src/shared/varlink-io.systemd.MountFileSystem.c index 43b812b0d2..78e7ce06ab 100644 --- a/src/shared/varlink-io.systemd.MountFileSystem.c +++ b/src/shared/varlink-io.systemd.MountFileSystem.c @@ -49,6 +49,31 @@ static SD_VARLINK_DEFINE_METHOD( SD_VARLINK_DEFINE_OUTPUT(imageName, SD_VARLINK_STRING, SD_VARLINK_NULLABLE), SD_VARLINK_DEFINE_OUTPUT(imageUuid, SD_VARLINK_STRING, SD_VARLINK_NULLABLE)); +static SD_VARLINK_DEFINE_ENUM_TYPE( + MountMapMode, + SD_VARLINK_FIELD_COMMENT("Map the caller's UID to root in the user namespace, do not map anything else."), + SD_VARLINK_DEFINE_ENUM_VALUE(root), + SD_VARLINK_FIELD_COMMENT("Map the foreign UID range to the base UID range in the user namespace (i.e. UID zero and above), covering 64K users."), + SD_VARLINK_DEFINE_ENUM_VALUE(foreign), + SD_VARLINK_FIELD_COMMENT("Apply an identity (1:1) mapping, but limit it to 64K users."), + SD_VARLINK_DEFINE_ENUM_VALUE(identity), + SD_VARLINK_FIELD_COMMENT("Determine automatically based on provided directory and caller."), + SD_VARLINK_DEFINE_ENUM_VALUE(auto)); + +static SD_VARLINK_DEFINE_METHOD( + MountDirectory, + SD_VARLINK_FIELD_COMMENT("Directory file descriptor of the directory to assign to the user namespace. Must be a regular, i.e. non-O_PATH file descriptor."), + SD_VARLINK_DEFINE_INPUT(directoryFileDescriptor, SD_VARLINK_INT, 0), + SD_VARLINK_FIELD_COMMENT("File descriptor to the user namespace to assign this directory to. If not specified uses the host user namespace."), + SD_VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("Whether to mark the resulting mount file descriptor as read-only. If not specified defaults to false."), + SD_VARLINK_DEFINE_INPUT(readOnly, SD_VARLINK_BOOL, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("Which kinda of UID/GID mapping to apply to the resulting mount file descriptor."), + SD_VARLINK_DEFINE_INPUT_BY_TYPE(mode, MountMapMode, SD_VARLINK_NULLABLE), + VARLINK_DEFINE_POLKIT_INPUT, + SD_VARLINK_FIELD_COMMENT("The freshly allocated mount file descriptor for the mount."), + SD_VARLINK_DEFINE_OUTPUT(mountFileDescriptor, SD_VARLINK_INT, 0)); + static SD_VARLINK_DEFINE_ERROR(IncompatibleImage); static SD_VARLINK_DEFINE_ERROR(MultipleRootPartitionsFound); static SD_VARLINK_DEFINE_ERROR(RootPartitionNotFound); @@ -59,9 +84,17 @@ static SD_VARLINK_DEFINE_ERROR(VerityFailure); SD_VARLINK_DEFINE_INTERFACE( io_systemd_MountFileSystem, "io.systemd.MountFileSystem", + SD_VARLINK_INTERFACE_COMMENT("APIs for unpriviliged mounting."), + SD_VARLINK_SYMBOL_COMMENT("Encodes the designated purpose of a partition."), &vl_type_PartitionDesignator, + SD_VARLINK_SYMBOL_COMMENT("Information about a specific partition."), &vl_type_PartitionInfo, + SD_VARLINK_SYMBOL_COMMENT("Selects the type of UID/GID mapping to apply."), + &vl_type_MountMapMode, + SD_VARLINK_SYMBOL_COMMENT("Takes a disk image file descriptor as input, returns a set of mount file descriptors for it."), &vl_method_MountImage, + SD_VARLINK_SYMBOL_COMMENT("Takes a directory file descriptor as input, returns a mount file descriptor."), + &vl_method_MountDirectory, &vl_error_IncompatibleImage, &vl_error_MultipleRootPartitionsFound, &vl_error_RootPartitionNotFound, diff --git a/units/systemd-mountfsd.service.in b/units/systemd-mountfsd.service.in index ef413684c3..00a0827fbc 100644 --- a/units/systemd-mountfsd.service.in +++ b/units/systemd-mountfsd.service.in @@ -25,13 +25,7 @@ LimitNOFILE={{HIGH_RLIMIT_NOFILE}} LockPersonality=yes MemoryDenyWriteExecute=yes NoNewPrivileges=yes -ProtectProc=invisible -ProtectControlGroups=yes -ProtectHome=yes ProtectHostname=yes -ProtectKernelLogs=yes -ProtectKernelModules=yes -ProtectSystem=strict RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 RestrictRealtime=yes RestrictSUIDSGID=yes