diff --git a/src/core/credential.c b/src/core/credential.c index 7c9d6f6a9b..d9a88004c5 100644 --- a/src/core/credential.c +++ b/src/core/credential.c @@ -10,6 +10,7 @@ #include "glob-util.h" #include "io-util.h" #include "label-util.h" +#include "missing_syscall.h" #include "mkdir-label.h" #include "mount-util.h" #include "mountpoint-util.h" @@ -17,6 +18,7 @@ #include "random-util.h" #include "recurse-dir.h" #include "rm-rf.h" +#include "socket-util.h" #include "tmpfile-util.h" ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) { @@ -730,7 +732,8 @@ static int setup_credentials_internal( bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */ bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */ uid_t uid, - gid_t gid) { + gid_t gid, + int *ret_mount_fd) { int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true * if we mounted something; false if we definitely can't mount anything */ @@ -848,6 +851,27 @@ static int setup_credentials_internal( if (r < 0) return r; + if (ret_mount_fd) { + _cleanup_close_ int mount_fd = -EBADF; + + r = mount_fd = RET_NERRNO(open_tree(AT_FDCWD, workspace, OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)); + if (r >= 0) { + /* The workspace is already cloned in the above, and not necessary + * anymore. Even though the workspace is unmounted when the short-lived + * child process exits, let's explicitly unmount it here for safety. */ + r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW); + if (r < 0) + return r; + + *ret_mount_fd = TAKE_FD(mount_fd); + return 0; + } + + /* Old kernel? Unprivileged? */ + if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) + return r; + } + /* And mount it to the final place, read-only */ r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL); } else @@ -868,6 +892,8 @@ static int setup_credentials_internal( return -errno; } + if (ret_mount_fd) + *ret_mount_fd = -EBADF; return 0; } @@ -877,17 +903,22 @@ int setup_credentials( const char *unit, uid_t uid, gid_t gid, - char **ret_path) { + char **ret_path, + int *ret_mount_fd) { + _cleanup_close_pair_ int socket_pair[2] = PIPE_EBADF; _cleanup_free_ char *p = NULL, *q = NULL; + _cleanup_close_ int mount_fd = -EBADF; int r; assert(context); assert(params); assert(ret_path); + assert(ret_mount_fd); if (!exec_context_has_credentials(context)) { *ret_path = NULL; + *ret_mount_fd = -EBADF; return 0; } @@ -912,7 +943,12 @@ int setup_credentials( if (r < 0 && r != -EEXIST) return r; - r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL); + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, socket_pair) < 0) + return -errno; + + r = safe_fork_full("(sd-mkdcreds)", + NULL, &socket_pair[1], 1, + FORK_DEATHSIG|FORK_CLOSE_ALL_FDS|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_REOPEN_LOG, NULL); if (r < 0) { _cleanup_free_ char *t = NULL, *u = NULL; @@ -948,14 +984,15 @@ int setup_credentials( true, /* reuse the workspace if it is already a mount */ false, /* it's OK to fall back to a plain directory if we can't mount anything */ uid, - gid); + gid, + NULL); (void) rmdir(u); /* remove the workspace again if we can. */ if (r < 0) return r; - } else if (r == 0) { + } else if (r == 0) { /* child */ /* We managed to set up a mount namespace, and are now in a child. That's great. In this case * we can use the same directory for all cases, after turning off propagation. Question @@ -974,6 +1011,8 @@ int setup_credentials( * given that we do this in a privately namespaced short-lived single-threaded process that * no one else sees this should be OK to do. */ + _cleanup_close_ int fd = -EBADF; + /* Turn off propagation from our namespace to host */ r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); if (r < 0) @@ -988,7 +1027,14 @@ int setup_credentials( false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */ true, /* insist that something is mounted, do not allow fallback to plain directory */ uid, - gid); + gid, + &fd); + if (r < 0) + goto child_fail; + + r = send_one_fd_iov(socket_pair[1], fd, + &IOVEC_MAKE((int[]) { fd >= 0 }, sizeof(int)), 1, + MSG_DONTWAIT); if (r < 0) goto child_fail; @@ -996,6 +1042,17 @@ int setup_credentials( child_fail: _exit(EXIT_FAILURE); + + } else { /* parent */ + + int ret; + struct iovec iov = IOVEC_MAKE(&ret, sizeof(int)); + + r = receive_one_fd_iov(socket_pair[0], &iov, 1, MSG_DONTWAIT, &mount_fd); + if (r < 0) + return r; + if (ret > 0 && mount_fd < 0) + return -EIO; } /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's @@ -1005,5 +1062,6 @@ int setup_credentials( (void) rmdir(p); *ret_path = TAKE_PTR(p); + *ret_mount_fd = TAKE_FD(mount_fd); return 0; } diff --git a/src/core/credential.h b/src/core/credential.h index 2afd88dfc5..4b936b32d6 100644 --- a/src/core/credential.h +++ b/src/core/credential.h @@ -46,4 +46,5 @@ int setup_credentials( const char *unit, uid_t uid, gid_t gid, - char **ret_path); + char **ret_path, + int *ret_mount_fd); diff --git a/src/core/execute.c b/src/core/execute.c index 9c900095ce..eb452a45ec 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ #include "missing_fs.h" #include "missing_ioprio.h" #include "missing_prctl.h" +#include "missing_syscall.h" #include "mkdir-label.h" #include "namespace.h" #include "parse-util.h" @@ -3112,6 +3114,7 @@ static int apply_mount_namespace( ExecRuntime *runtime, const char *memory_pressure_path, const char *creds_path, + int creds_fd, char **error_path) { _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; @@ -3287,6 +3290,7 @@ static int apply_mount_namespace( tmp_dir, var_tmp_dir, creds_path, + creds_fd, context->log_namespace, context->mount_propagation_flag, &verity, @@ -3971,6 +3975,7 @@ static int exec_child( int ngids_after_pam = 0; _cleanup_free_ int *fds = NULL; _cleanup_strv_free_ char **fdnames = NULL; + _cleanup_close_ int creds_fd = -EBADF; assert(unit); assert(command); @@ -4421,7 +4426,7 @@ static int exec_child( } if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) { - r = setup_credentials(context, params, unit->id, uid, gid, &creds_path); + r = setup_credentials(context, params, unit->id, uid, gid, &creds_path, &creds_fd); if (r < 0) { *exit_status = EXIT_CREDENTIALS; return log_unit_error_errno(unit, r, "Failed to set up credentials: %m"); @@ -4635,7 +4640,7 @@ static int exec_child( if (needs_mount_namespace) { _cleanup_free_ char *error_path = NULL; - r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, creds_path, &error_path); + r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, creds_path, creds_fd, &error_path); if (r < 0) { *exit_status = EXIT_NAMESPACE; return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m", @@ -4643,6 +4648,19 @@ static int exec_child( } } + if (creds_fd >= 0) { + assert(creds_path); + + /* When a mount namespace is not requested, then the target directory may not exist yet. + * Here, we ignore the failure, as if it fails, the subsequent move_mount() will fail. */ + (void) mkdir_p_label(creds_path, 0755); + + if (move_mount(creds_fd, "", AT_FDCWD, creds_path, MOVE_MOUNT_F_EMPTY_PATH) < 0) { + *exit_status = EXIT_CREDENTIALS; + return log_unit_error_errno(unit, errno, "Failed to mount credentials directory on %s: %m", creds_path); + } + } + if (needs_sandboxing) { r = apply_protect_hostname(unit, context, exit_status); if (r < 0) diff --git a/src/core/namespace.c b/src/core/namespace.c index 51b5aad9c9..86406007ad 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -74,7 +74,8 @@ typedef enum MountMode { EXTENSION_DIRECTORIES, /* Bind-mounted outside the root directory, and used by subsequent mounts */ EXTENSION_IMAGES, /* Mounted outside the root directory, and used by subsequent mounts */ MQUEUEFS, - READWRITE_IMPLICIT, /* Should have the lowest priority. */ + READWRITE_IMPLICIT, /* Should have the 2nd lowest priority. */ + MKDIR, /* Should have the lowest priority. */ _MOUNT_MODE_MAX, } MountMode; @@ -227,6 +228,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = { [EXEC] = "exec", [NOEXEC] = "noexec", [MQUEUEFS] = "mqueuefs", + [MKDIR] = "mkdir", }; /* Helper struct for naming simplicity and reusability */ @@ -1551,6 +1553,12 @@ static int apply_one_mount( case OVERLAY_MOUNT: return mount_overlay(m); + case MKDIR: + r = mkdir_p_label(mount_entry_path(m), 0755); + if (r < 0) + return r; + return 1; + default: assert_not_reached(); } @@ -2017,6 +2025,7 @@ int setup_namespace( const char* tmp_dir, const char* var_tmp_dir, const char *creds_path, + int creds_fd, const char *log_namespace, unsigned long mount_propagation_flag, VeritySettings *verity, @@ -2339,13 +2348,22 @@ int setup_namespace( .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC, }; - *(m++) = (MountEntry) { - .path_const = creds_path, - .mode = BIND_MOUNT, - .read_only = true, - .source_const = creds_path, - .ignore = true, - }; + /* If we have mount fd for credentials directory, then it will be mounted after + * namespace is set up. So, here we only create the mount point. */ + + if (creds_fd < 0) + *(m++) = (MountEntry) { + .path_const = creds_path, + .mode = BIND_MOUNT, + .read_only = true, + .source_const = creds_path, + .ignore = true, + }; + else + *(m++) = (MountEntry) { + .path_const = creds_path, + .mode = MKDIR, + }; } else { /* If our service has no credentials store configured, then make the whole * credentials tree inaccessible wholesale. */ diff --git a/src/core/namespace.h b/src/core/namespace.h index b6132154c5..b8615cbd98 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -122,6 +122,7 @@ int setup_namespace( const char *tmp_dir, const char *var_tmp_dir, const char *creds_path, + int creds_fd, const char *log_namespace, unsigned long mount_propagation_flag, VeritySettings *verity, diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c index 25aafc35ca..436a784e86 100644 --- a/src/test/test-namespace.c +++ b/src/test/test-namespace.c @@ -194,6 +194,7 @@ TEST(protect_kernel_logs) { NULL, NULL, NULL, + -EBADF, NULL, 0, NULL, diff --git a/src/test/test-ns.c b/src/test/test-ns.c index 77afd2f6b9..56f3de83b6 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -96,6 +96,7 @@ int main(int argc, char *argv[]) { tmp_dir, var_tmp_dir, NULL, + -EBADF, NULL, 0, NULL,