mountfsd: make MountDirectory() work with systemd-homed

systemd-homed already applies an idmap to its mounts, hence we need to
undo it before we can create our own.
This commit is contained in:
Lennart Poettering
2025-08-23 08:15:01 +02:00
committed by Daan De Meyer
parent 0d87de0b8e
commit 9f69ff69f7
5 changed files with 94 additions and 17 deletions

View File

@@ -27,6 +27,7 @@
#include "loop-util.h"
#include "main-func.h"
#include "memory-util.h"
#include "mount-util.h"
#include "namespace-util.h"
#include "nsresource.h"
#include "nulstr-util.h"
@@ -41,6 +42,7 @@
#include "time-util.h"
#include "uid-classification.h"
#include "uid-range.h"
#include "user-util.h"
#include "varlink-io.systemd.MountFileSystem.h"
#include "varlink-util.h"
@@ -637,10 +639,16 @@ static MountMapMode default_mount_map_mode(DirectoryOwnership ownership) {
static JSON_DISPATCH_ENUM_DEFINE(dispatch_mount_directory_mode, MountMapMode, mount_map_mode_from_string);
static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
static DirectoryOwnership validate_directory_fd(
int fd,
uid_t peer_uid,
uid_t *ret_current_owner_uid) {
int r, fl;
assert(fd >= 0);
assert(uid_is_valid(peer_uid));
assert(ret_current_owner_uid);
/* Checks if the specified directory fd looks sane. Returns a DirectoryOwnership that categorizes the
* ownership situation in comparison to the peer's UID.
@@ -665,6 +673,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m");
if (st.st_uid == 0) {
*ret_current_owner_uid = st.st_uid;
if (peer_uid == 0) {
log_debug("Directory file descriptor points to root owned directory, who is also the peer.");
return DIRECTORY_IS_ROOT_PEER_OWNED;
@@ -674,6 +683,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
}
if (st.st_uid == peer_uid) {
log_debug("Directory file descriptor points to peer owned directory.");
*ret_current_owner_uid = st.st_uid;
return DIRECTORY_IS_PEER_OWNED;
}
@@ -687,12 +697,14 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
/* Stop iteration if we find a directory up the tree that is neither owned by the user, nor is from the foreign UID range */
if (!uid_is_foreign(st.st_uid) || !gid_is_foreign(st.st_gid)) {
log_debug("Directory file descriptor points to directory which itself or its parents is neither owned by foreign UID range nor by the user.");
*ret_current_owner_uid = st.st_uid;
return DIRECTORY_IS_OTHERWISE_OWNED;
}
/* If the peer is root, then it doesn't matter if we find a parent owned by root, let's shortcut things. */
if (peer_uid == 0) {
log_debug("Directory file descriptor is owned by foreign UID range, and peer is root.");
*ret_current_owner_uid = st.st_uid;
return DIRECTORY_IS_FOREIGN_OWNED;
}
@@ -708,11 +720,13 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
/* Safety check to see if we hit the root dir */
if (stat_inode_same(&st, &new_st)) {
log_debug("Directory file descriptor is owned by foreign UID range, but didn't find parent directory that is owned by peer among ancestors.");
*ret_current_owner_uid = st.st_uid;
return DIRECTORY_IS_OTHERWISE_OWNED;
}
if (new_st.st_uid == peer_uid) { /* Parent inode is owned by the peer. That's good! Everything's fine. */
log_debug("Directory file descriptor is owned by foreign UID range, and ancestor is owned by peer.");
*ret_current_owner_uid = st.st_uid;
return DIRECTORY_IS_FOREIGN_OWNED;
}
@@ -721,6 +735,7 @@ static DirectoryOwnership validate_directory_fd(int fd, uid_t peer_uid) {
}
log_debug("Failed to find peer owned parent directory after %u levels, refusing.", n_level);
*ret_current_owner_uid = st.st_uid;
return DIRECTORY_IS_OTHERWISE_OWNED;
}
@@ -771,7 +786,8 @@ static int vl_method_mount_directory(
if (r < 0)
return log_debug_errno(r, "Failed to get client UID: %m");
DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid);
uid_t current_owner_uid;
DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid, &current_owner_uid);
if (owned_by == -EREMOTEIO)
return sd_varlink_errorbo(link, "io.systemd.MountFileSystem.BadFileDescriptorFlags", SD_JSON_BUILD_PAIR_STRING("parameter", "directoryFileDescriptor"));
if (owned_by < 0)
@@ -836,10 +852,28 @@ static int vl_method_mount_directory(
if (r < 0)
return r;
_cleanup_close_ int mount_fd = open_tree(directory_fd, "", OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
_cleanup_close_ int mount_fd = open_tree_try_drop_idmap(
directory_fd,
"",
OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
if (mount_fd < 0)
return log_debug_errno(errno, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path));
/* MOUNT_ATTR_IDMAP has possibly been cleared. Let's verify that the underlying data matches our expectations. */
struct stat unmapped_st;
if (fstat(mount_fd, &unmapped_st) < 0)
return log_debug_errno(errno, "Failed to stat unmapped inode: %m");
r = stat_verify_directory(&unmapped_st);
if (r < 0)
return r;
/* For now, let's simply refuse things if dropping the idmapping changed anything. For now that
* should be good enough, because the primary usecase for this (homed) will mount the foreign UID
* range 1:1. */
if (unmapped_st.st_uid != current_owner_uid)
return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Owner UID of mount after clearing ID mapping not the same anymore, refusing.");
if (p.read_only > 0 && mount_setattr(
mount_fd, "", AT_EMPTY_PATH,
&(struct mount_attr) {

View File

@@ -826,17 +826,16 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
* caller's userns *without* any mount idmapping in place. To get that uid, we clone the
* mount source tree and clear any existing idmapping and temporarily mount that tree over
* the mount source before we stat the mount source to figure out the source uid. */
_cleanup_close_ int fd_clone = open_tree_attr_with_fallback(
_cleanup_close_ int fd_clone =
idmapping == REMOUNT_IDMAPPING_NONE ?
RET_NERRNO(open_tree(
AT_FDCWD,
m->source,
open_tree_flags,
&(struct mount_attr) {
.attr_clr = idmapping != REMOUNT_IDMAPPING_NONE ? MOUNT_ATTR_IDMAP : 0,
});
if (ERRNO_IS_NEG_NOT_SUPPORTED(fd_clone))
/* We can only clear idmapped mounts with open_tree_attr(), but there might not be one in
* the first place, so we keep going if we get a not supported error. */
fd_clone = open_tree(AT_FDCWD, m->source, open_tree_flags);
open_tree_flags)) :
open_tree_try_drop_idmap(
AT_FDCWD,
m->source,
open_tree_flags);
if (fd_clone < 0)
return log_error_errno(errno, "Failed to clone %s: %m", m->source);

View File

@@ -1477,7 +1477,7 @@ int make_userns(uid_t uid_shift,
return TAKE_FD(userns_fd);
}
int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flags, struct mount_attr *attr) {
int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned flags, struct mount_attr *attr) {
_cleanup_close_ int fd = -EBADF;
assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
@@ -1507,6 +1507,42 @@ int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flag
return TAKE_FD(fd);
}
int open_tree_try_drop_idmap(int dir_fd, const char *path, unsigned flags) {
/* Tries to drop MOUNT_ATTR_IDMAP while calling open_tree_attr(), but if that doesn't work just uses
* a regular open_tree() */
assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
if (isempty(path)) {
path = "";
flags |= AT_EMPTY_PATH;
}
_cleanup_close_ int fd = open_tree_attr_with_fallback(
dir_fd,
path,
flags,
&(struct mount_attr) {
.attr_clr = MOUNT_ATTR_IDMAP,
});
if (fd < 0) {
if (!ERRNO_IS_NEG_NOT_SUPPORTED(fd))
return log_debug_errno(fd, "Failed to clear idmap of directory with open_tree_attr(): %m");
log_debug_errno(fd, "Failed to clear idmap with open_tree_attr(), retrying open_tree() without clearing idmap: %m");
fd = RET_NERRNO(open_tree(dir_fd, path, flags));
if (fd < 0)
return log_debug_errno(fd, "Both open_tree() and open_tree_attr() failed, giving up: %m");
log_debug("open_tree() without clearing idmap worked.");
return TAKE_FD(fd);
}
log_debug("Successfully acquired mount fd with cleared idmap.");
return TAKE_FD(fd);
}
int remount_idmap_fd(
char **paths,
int userns_fd,

View File

@@ -145,7 +145,8 @@ typedef enum RemountIdmapping {
_REMOUNT_IDMAPPING_INVALID = -EINVAL,
} RemountIdmapping;
int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned int flags, struct mount_attr *attr);
int open_tree_attr_with_fallback(int dir_fd, const char *path, unsigned flags, struct mount_attr *attr);
int open_tree_try_drop_idmap(int dir_fd, const char *path, unsigned flags);
int make_userns(uid_t uid_shift, uid_t uid_range, uid_t host_owner, uid_t dest_owner, RemountIdmapping idmapping);
int remount_idmap_fd(char **p, int userns_fd, uint64_t extra_mount_attr_set);

View File

@@ -29,9 +29,16 @@ ProtectHostname=yes
RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6
RestrictRealtime=yes
RestrictSUIDSGID=yes
SystemCallArchitectures=native
SystemCallErrorNumber=EPERM
SystemCallFilter=@system-service @mount
# FIXME: libseccomp (as of 2.6.0) doesn't know the open_tree_attr() system call
# yet which we need. Thus, applying any system call filter would block this
# system call unconditionally, which is not desirable. This should be added
# back once libseccomp is updated. See
# https://github.com/seccomp/libseccomp/issues/465
#SystemCallArchitectures=native
#SystemCallErrorNumber=EPERM
#SystemCallFilter=@system-service @mount
Type=notify
NotifyAccess=all
FileDescriptorStoreMax=4096