From 1236f06c4245477553bb60694e5e5a4ba9971c01 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Mon, 17 Feb 2025 15:09:50 +0900 Subject: [PATCH 1/5] nspawn: use FOREACH_ARRAY() where applicable --- src/nspawn/nspawn-bind-user.c | 15 ++++++-------- src/nspawn/nspawn-mount.c | 12 ++++------- src/nspawn/nspawn-oci.c | 38 ++++++++++++++++------------------- src/nspawn/nspawn-register.c | 5 +---- src/nspawn/nspawn.c | 17 ++++++---------- 5 files changed, 34 insertions(+), 53 deletions(-) diff --git a/src/nspawn/nspawn-bind-user.c b/src/nspawn/nspawn-bind-user.c index 8964de22a1..4ec8393b02 100644 --- a/src/nspawn/nspawn-bind-user.c +++ b/src/nspawn/nspawn-bind-user.c @@ -181,13 +181,11 @@ BindUserContext* bind_user_context_free(BindUserContext *c) { if (!c) return NULL; - assert(c->n_data == 0 || c->data); - - for (size_t i = 0; i < c->n_data; i++) { - user_record_unref(c->data[i].host_user); - group_record_unref(c->data[i].host_group); - user_record_unref(c->data[i].payload_user); - group_record_unref(c->data[i].payload_group); + FOREACH_ARRAY(d, c->data, c->n_data) { + user_record_unref(d->host_user); + group_record_unref(d->host_group); + user_record_unref(d->payload_user); + group_record_unref(d->payload_group); } return mfree(c); @@ -400,10 +398,9 @@ int bind_user_setup( if (r < 0) return log_error_errno(r, "Failed to create /run/host/userdb: %m"); - for (size_t i = 0; i < c->n_data; i++) { + FOREACH_ARRAY(d, c->data, c->n_data) { _cleanup_(group_record_unrefp) GroupRecord *stripped_group = NULL, *shadow_group = NULL; _cleanup_(user_record_unrefp) UserRecord *stripped_user = NULL, *shadow_user = NULL; - const BindUserData *d = c->data + i; /* First, write shadow (i.e. privileged) data for group record */ r = group_record_clone(d->payload_group, shadow_flags, &shadow_group); diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 552d629a18..3e69781572 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -48,9 +48,7 @@ CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) { } void custom_mount_free_all(CustomMount *l, size_t n) { - for (size_t i = 0; i < n; i++) { - CustomMount *m = l + i; - + FOREACH_ARRAY(m, l, n) { free(m->source); free(m->destination); free(m->options); @@ -1024,9 +1022,7 @@ int mount_custom( assert(dest); - for (size_t i = 0; i < n; i++) { - CustomMount *m = mounts + i; - + FOREACH_ARRAY(m, mounts, n) { if (FLAGS_SET(mount_settings, MOUNT_IN_USERNS) != m->in_userns) continue; @@ -1070,8 +1066,8 @@ int mount_custom( } bool has_custom_root_mount(const CustomMount *mounts, size_t n) { - for (size_t i = 0; i < n; i++) - if (path_equal(mounts[i].destination, "/")) + FOREACH_ARRAY(m, mounts, n) + if (path_equal(m->destination, "/")) return true; return false; diff --git a/src/nspawn/nspawn-oci.c b/src/nspawn/nspawn-oci.c index 1b2fe76f8a..ecbcaefcbb 100644 --- a/src/nspawn/nspawn-oci.c +++ b/src/nspawn/nspawn-oci.c @@ -919,21 +919,17 @@ struct device_data { static int oci_cgroup_device_access(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) { struct device_data *d = ASSERT_PTR(userdata); bool r = false, w = false, m = false; - const char *s; - size_t i; - assert_se(s = sd_json_variant_string(v)); - - for (i = 0; s[i]; i++) - if (s[i] == 'r') + for (const char *s = ASSERT_PTR(sd_json_variant_string(v)); *s; s++) + if (*s == 'r') r = true; - else if (s[i] == 'w') + else if (*s == 'w') w = true; - else if (s[i] == 'm') + else if (*s == 'm') m = true; else return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), - "Unknown device access character '%c'.", s[i]); + "Unknown device access character '%c'.", *s); d->r = r; d->w = w; @@ -945,7 +941,7 @@ static int oci_cgroup_device_access(const char *name, sd_json_variant *v, sd_jso static int oci_cgroup_devices(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) { _cleanup_free_ struct device_data *list = NULL; Settings *s = ASSERT_PTR(userdata); - size_t n_list = 0, i; + size_t n_list = 0; bool noop = false; sd_json_variant *e; int r; @@ -1036,43 +1032,43 @@ static int oci_cgroup_devices(const char *name, sd_json_variant *v, sd_json_disp if (r < 0) return bus_log_create_error(r); - for (i = 0; i < n_list; i++) { + FOREACH_ARRAY(d, list, n_list) { _cleanup_free_ char *pattern = NULL; char access[4]; size_t n = 0; - if (list[i].minor == UINT_MAX) { + if (d->minor == UINT_MAX) { const char *t; - if (list[i].type == S_IFBLK) + if (d->type == S_IFBLK) t = "block"; else { - assert(list[i].type == S_IFCHR); + assert(d->type == S_IFCHR); t = "char"; } - if (list[i].major == UINT_MAX) { + if (d->major == UINT_MAX) { pattern = strjoin(t, "-*"); if (!pattern) return log_oom(); } else { - if (asprintf(&pattern, "%s-%u", t, list[i].major) < 0) + if (asprintf(&pattern, "%s-%u", t, d->major) < 0) return log_oom(); } } else { - assert(list[i].major != UINT_MAX); /* If a minor is specified, then a major also needs to be specified */ + assert(d->major != UINT_MAX); /* If a minor is specified, then a major also needs to be specified */ - r = device_path_make_major_minor(list[i].type, makedev(list[i].major, list[i].minor), &pattern); + r = device_path_make_major_minor(d->type, makedev(d->major, d->minor), &pattern); if (r < 0) return log_oom(); } - if (list[i].r) + if (d->r) access[n++] = 'r'; - if (list[i].w) + if (d->w) access[n++] = 'w'; - if (list[i].m) + if (d->m) access[n++] = 'm'; access[n] = 0; diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c index 009f71f59f..4193a33813 100644 --- a/src/nspawn/nspawn-register.c +++ b/src/nspawn/nspawn-register.c @@ -21,7 +21,6 @@ static int append_machine_properties( int kill_signal, bool coredump_receive) { - unsigned j; int r; assert(m); @@ -48,9 +47,7 @@ static int append_machine_properties( return bus_log_create_error(r); } - for (j = 0; j < n_mounts; j++) { - CustomMount *cm = mounts + j; - + FOREACH_ARRAY(cm, mounts, n_mounts) { if (cm->type != CUSTOM_MOUNT_BIND) continue; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 4c054b2dbb..8fa05b9bc2 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -471,11 +471,7 @@ static int help(void) { } static int custom_mount_check_all(void) { - size_t i; - - for (i = 0; i < arg_n_custom_mounts; i++) { - CustomMount *m = &arg_custom_mounts[i]; - + FOREACH_ARRAY(m, arg_custom_mounts, arg_n_custom_mounts) if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) { if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), @@ -485,7 +481,6 @@ static int custom_mount_check_all(void) { return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--private-users with automatic UID shift may not be combined with custom root mounts."); } - } return 0; } @@ -4154,12 +4149,12 @@ static int outer_child( /* Send the user maps we determined to the parent, so that it installs it in our user * namespace UID map table */ - for (size_t i = 0; i < bind_user_context->n_data; i++) { + FOREACH_ARRAY(d, bind_user_context->data, bind_user_context->n_data) { uid_t map[] = { - bind_user_context->data[i].payload_user->uid, - bind_user_context->data[i].host_user->uid, - (uid_t) bind_user_context->data[i].payload_group->gid, - (uid_t) bind_user_context->data[i].host_group->gid, + d->payload_user->uid, + d->host_user->uid, + (uid_t) d->payload_group->gid, + (uid_t) d->host_group->gid, }; l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL); From 3cc23a2c2345eb188551565349c89ec1fa8f650f Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Mon, 17 Feb 2025 15:06:10 +0900 Subject: [PATCH 2/5] nspawn: enable FUSE unconditionally FUSE is userns-safe since kernel v4.18 (da315f6e03988a7127680bbc26e1028991b899b8), and now our kernel base line is 5.4. Let's drop the logic of checking the version of FUSE, and unconditionally enable FUSE. --- src/nspawn/nspawn-register.c | 21 +++---- src/nspawn/nspawn-register.h | 2 - src/nspawn/nspawn.c | 104 +++-------------------------------- 3 files changed, 14 insertions(+), 113 deletions(-) diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c index 4193a33813..0387e0b783 100644 --- a/src/nspawn/nspawn-register.c +++ b/src/nspawn/nspawn-register.c @@ -15,7 +15,6 @@ static int append_machine_properties( sd_bus_message *m, - bool enable_fuse, CustomMount *mounts, unsigned n_mounts, int kill_signal, @@ -31,21 +30,17 @@ static int append_machine_properties( /* If you make changes here, also make sure to update systemd-nspawn@.service, to keep the device * policies in sync regardless if we are run with or without the --keep-unit switch. */ - r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 2, - /* Allow the container to access and create the API device nodes, so that - * PrivateDevices= in the container can work fine */ + r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 3, + /* Allow the container to access and create the API device node, so that + * PrivateDevices= in the container can work fine. */ "/dev/net/tun", "rwm", - /* Allow the container access to ptys. However, do not permit the container + /* Allow the container to access ptys. However, do not permit the container * to ever create these device nodes. */ - "char-pts", "rw"); + "char-pts", "rw", + /* Allow the container to access and create the FUSE API device node. */ + "/dev/fuse", "rwm"); if (r < 0) return bus_log_create_error(r); - if (enable_fuse) { - r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1, - "/dev/fuse", "rwm"); - if (r < 0) - return bus_log_create_error(r); - } FOREACH_ARRAY(cm, mounts, n_mounts) { if (cm->type != CUSTOM_MOUNT_BIND) @@ -204,7 +199,6 @@ int register_machine( r = append_machine_properties( m, - FLAGS_SET(flags, REGISTER_MACHINE_ENABLE_FUSE), mounts, n_mounts, kill_signal, @@ -325,7 +319,6 @@ int allocate_scope( r = append_machine_properties( m, - FLAGS_SET(flags, ALLOCATE_SCOPE_ENABLE_FUSE), mounts, n_mounts, kill_signal, diff --git a/src/nspawn/nspawn-register.h b/src/nspawn/nspawn-register.h index 5e187e33bb..89e35f02a7 100644 --- a/src/nspawn/nspawn-register.h +++ b/src/nspawn/nspawn-register.h @@ -10,7 +10,6 @@ typedef enum RegisterMachineFlags { REGISTER_MACHINE_KEEP_UNIT = 1 << 0, - REGISTER_MACHINE_ENABLE_FUSE = 1 << 1, } RegisterMachineFlags; int register_machine( @@ -32,7 +31,6 @@ int unregister_machine(sd_bus *bus, const char *machine_name); typedef enum AllocateScopeFlags { ALLOCATE_SCOPE_ALLOW_PIDFD = 1 << 0, - ALLOCATE_SCOPE_ENABLE_FUSE = 1 << 1, } AllocateScopeFlags; int allocate_scope( diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 8fa05b9bc2..31757f4ee1 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2170,85 +2170,6 @@ static int setup_boot_id(void) { return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); } -static int get_fuse_version(uint32_t *ret_major, uint32_t *ret_minor) { - /* Must be called with mount privileges, either via arg_privileged or by being uid=0 in new - * CLONE_NEWUSER/CLONE_NEWNS namespaces. This is true when called from outer_child(). */ - ssize_t n; - _cleanup_close_ int fuse_fd = -EBADF, mnt_fd = -EBADF; - _cleanup_free_ char *opts = NULL; - union { - char unstructured[FUSE_MIN_READ_BUFFER]; - struct { - struct fuse_in_header header; - /* Don't use :`struct fuse_init_in` because a newer fuse.h might give - * us a bigger struct than what an older kernel actually gives us, and that would - * break our .header.len check. */ - struct { - uint32_t major; - uint32_t minor; - } body; - } structured; - } request; - - assert(ret_major); - assert(ret_minor); - - /* Get a FUSE handle. */ - fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR); - if (fuse_fd < 0) - return log_debug_errno(errno, "Failed to open /dev/fuse: %m"); - if (asprintf(&opts, "fd=%i,rootmode=40000,user_id=0,group_id=0", fuse_fd) < 0) - return log_oom_debug(); - mnt_fd = make_fsmount(LOG_DEBUG, "nspawn-fuse", "fuse.nspawn", 0, opts, -EBADF); - if (mnt_fd < 0) - return mnt_fd; - - /* Read a request from the FUSE handle. */ - n = read(fuse_fd, &request.unstructured, sizeof request); - if (n < 0) - return log_debug_errno(errno, "Failed to read /dev/fuse: %m"); - if ((size_t) n < sizeof request.structured.header || - (size_t) n < request.structured.header.len) - return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read /dev/fuse: Short read"); - - /* Assume that the request is a FUSE_INIT request, and return the version information from it. */ - if (request.structured.header.opcode != FUSE_INIT) - return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial request from /dev/fuse should have opcode=%i (FUSE_INIT), but has opcode=%"PRIu32, - FUSE_INIT, request.structured.header.opcode); - if (request.structured.header.len < sizeof request.structured) - return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial FUSE_INIT request from /dev/fuse is too short"); - *ret_major = request.structured.body.major; - *ret_minor = request.structured.body.minor; - return 0; -} - -static bool should_enable_fuse(void) { - uint32_t fuse_major, fuse_minor; - int r; - - r = get_fuse_version(&fuse_major, &fuse_minor); - if (r < 0) { - if (ERRNO_IS_NEG_DEVICE_ABSENT(r)) - log_debug_errno(r, "Disabling FUSE: FUSE appears to be disabled on the host: %m"); - else if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) - log_debug_errno(r, "Disabling FUSE: Kernel does not support the fsopen() family of syscalls: %m"); - else - log_full_errno(ERRNO_IS_NEG_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, - "Disabling FUSE: Failed to determine FUSE version: %m"); - return false; - } - - /* FUSE is only userns-safe in FUSE version 7.27 and later. - * https://github.com/torvalds/linux/commit/da315f6e03988a7127680bbc26e1028991b899b8 */ - if (fuse_major < 7 || (fuse_major == 7 && fuse_minor < 27)) { - log_debug("Disabling FUSE: FUSE version %" PRIu32 ".%" PRIu32 " is too old to support user namespaces", - fuse_major, fuse_minor); - return false; - } - - return true; -} - static int bind_mount_devnode(const char *from, const char *to) { int r; @@ -2367,7 +2288,7 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno return 0; } -static int copy_devnodes(const char *dest, bool enable_fuse) { +static int copy_devnodes(const char *dest) { int r = 0; assert(dest); @@ -2378,7 +2299,10 @@ static int copy_devnodes(const char *dest, bool enable_fuse) { return r; } - if (enable_fuse) { + /* Create /dev/fuse only when it is accessible. The check is necessary, as some custom service + * units that invoke nspawn may enable DevicePolicy= without DeviceAllow= for the device node. */ + _cleanup_close_ int fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR); + if (fuse_fd >= 0) { r = copy_devnode_one(dest, "fuse", /* ignore_mknod_failure = */ false); if (r < 0) return r; @@ -3970,7 +3894,7 @@ static int outer_child( _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL; _cleanup_strv_free_ char **os_release_pairs = NULL; - bool idmap = false, enable_fuse; + bool idmap = false; const char *p; pid_t pid; ssize_t l; @@ -4314,12 +4238,7 @@ static int outer_child( if (r < 0) return r; - enable_fuse = should_enable_fuse(); - l = send(fd_outer_socket, &enable_fuse, sizeof enable_fuse, 0); - if (l < 0) - return log_error_errno(errno, "Failed to send whether to enable FUSE: %m"); - - r = copy_devnodes(directory, enable_fuse); + r = copy_devnodes(directory); if (r < 0) return r; @@ -5275,7 +5194,6 @@ static int run_container( ssize_t l; sigset_t mask_chld; _cleanup_close_ int child_netns_fd = -EBADF; - bool enable_fuse; assert_se(sigemptyset(&mask_chld) == 0); assert_se(sigaddset(&mask_chld, SIGCHLD) == 0); @@ -5459,12 +5377,6 @@ static int run_container( l, l == 0 ? " The child is most likely dead." : ""); } - l = recv(fd_outer_socket_pair[0], &enable_fuse, sizeof enable_fuse, 0); - if (l < 0) - return log_error_errno(errno, "Failed to read whether to enable FUSE: %m"); - if (l != sizeof enable_fuse) - return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading whether to enable FUSE."); - /* Wait for the outer child. */ r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL); if (r < 0) @@ -5619,7 +5531,6 @@ static int run_container( if (arg_register) { RegisterMachineFlags flags = 0; SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit); - SET_FLAG(flags, REGISTER_MACHINE_ENABLE_FUSE, enable_fuse); r = register_machine( bus, arg_machine, @@ -5640,7 +5551,6 @@ static int run_container( } else if (!arg_keep_unit) { AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD; - SET_FLAG(flags, ALLOCATE_SCOPE_ENABLE_FUSE, enable_fuse); r = allocate_scope( bus, arg_machine, From 9fff6bf59e6cfbbc8bddee1e802a94a38d29063a Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Mon, 17 Feb 2025 23:59:46 +0900 Subject: [PATCH 3/5] nspawn: create /dev/net/tun only when it is accessible Follow-up for 985ea98e7f90c92fcc0b8441fafb190353d2feb8. When DevicePolicy= is enabled, but DeviceAllow= for /dev/net/tun is not specified, bind-mounting the device node from the host system is meaningless, as it cannot be used in the container anyway. Let's check the device node is accessible before creating or bind-mounting. --- src/nspawn/nspawn.c | 13 ++++++----- test/units/TEST-13-NSPAWN.nspawn.sh | 35 +++++++++-------------------- 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 31757f4ee1..8ded13506d 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2308,12 +2308,13 @@ static int copy_devnodes(const char *dest) { return r; } - /* We unconditionally try to create /dev/net/tun, but let's ignore failure if --private-network is - * unspecified. The failure can be triggered when e.g. DevicePolicy= is set, but DeviceAllow= does - * not contains the device node, and --private-users=pick is specified. */ - r = copy_devnode_one(dest, "net/tun", /* ignore_mknod_failure = */ !arg_private_network); - if (r < 0) - return r; + /* Similarly, create /dev/net/tun only when it is accessible. */ + _cleanup_close_ int tun_fd = open("/dev/net/tun", O_CLOEXEC|O_RDWR); + if (tun_fd >= 0) { + r = copy_devnode_one(dest, "net/tun", /* ignore_mknod_failure = */ false); + if (r < 0) + return r; + } return 0; } diff --git a/test/units/TEST-13-NSPAWN.nspawn.sh b/test/units/TEST-13-NSPAWN.nspawn.sh index 076e94c7b1..e814cede57 100755 --- a/test/units/TEST-13-NSPAWN.nspawn.sh +++ b/test/units/TEST-13-NSPAWN.nspawn.sh @@ -1231,31 +1231,16 @@ testcase_unpriv_fuse() { } test_tun() { - local expect=${1?} - local exists=${2?} - local command command_exists command_not_exists - shift 2 - - command_exists='[[ -c /dev/net/tun ]]; [[ "$(stat /dev/net/tun --format=%u)" == 0 ]]; [[ "$(stat /dev/net/tun --format=%g)" == 0 ]]' - command_not_exists='[[ ! -e /dev/net/tun ]]' - - if [[ "$exists" == 0 ]]; then - command="$command_not_exists" - else - command="$command_exists" - fi - - systemd-nspawn "$@" bash -xec "$command_exists" + systemd-nspawn "$@" bash -xec '[[ -c /dev/net/tun ]]; [[ "$(stat /dev/net/tun --format=%u)" == 0 ]]; [[ "$(stat /dev/net/tun --format=%g)" == 0 ]]' # check if the owner of the host device is unchanged, see issue #34243. [[ "$(stat /dev/net/tun --format=%u)" == 0 ]] [[ "$(stat /dev/net/tun --format=%g)" == 0 ]] # Without DeviceAllow= for /dev/net/tun, see issue #35116. - assert_rc \ - "$expect" \ - systemd-run --wait -p Environment=SYSTEMD_LOG_LEVEL=debug -p DevicePolicy=closed -p DeviceAllow="char-pts rw" \ - systemd-nspawn "$@" bash -xec "$command" + systemd-run \ + --wait -p Environment=SYSTEMD_LOG_LEVEL=debug -p DevicePolicy=closed -p DeviceAllow="char-pts rw" \ + systemd-nspawn "$@" bash -xec '[[ ! -e /dev/net/tun ]]' [[ "$(stat /dev/net/tun --format=%u)" == 0 ]] [[ "$(stat /dev/net/tun --format=%g)" == 0 ]] @@ -1272,12 +1257,12 @@ testcase_dev_net_tun() { root="$(mktemp -d /var/lib/machines/TEST-13-NSPAWN.tun.XXX)" create_dummy_container "$root" - test_tun 0 1 --ephemeral --directory="$root" --private-users=no - test_tun 0 1 --ephemeral --directory="$root" --private-users=yes - test_tun 0 0 --ephemeral --directory="$root" --private-users=pick - test_tun 0 1 --ephemeral --directory="$root" --private-users=no --private-network - test_tun 0 1 --ephemeral --directory="$root" --private-users=yes --private-network - test_tun 1 0 --ephemeral --directory="$root" --private-users=pick --private-network + test_tun --ephemeral --directory="$root" --private-users=no + test_tun --ephemeral --directory="$root" --private-users=yes + test_tun --ephemeral --directory="$root" --private-users=pick + test_tun --ephemeral --directory="$root" --private-users=no --private-network + test_tun --ephemeral --directory="$root" --private-users=yes --private-network + test_tun --ephemeral --directory="$root" --private-users=pick --private-network rm -fr "$root" } From c51e472bd4f4363e8df9a87c1f17bcb392872176 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Tue, 18 Feb 2025 00:00:40 +0900 Subject: [PATCH 4/5] nspawn: drop unused argument for copy_devnode_one() --- src/nspawn/nspawn.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 8ded13506d..d73f83f3fb 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2189,7 +2189,7 @@ static int bind_mount_devnode(const char *from, const char *to) { return 0; } -static int copy_devnode_one(const char *dest, const char *node, bool ignore_mknod_failure) { +static int copy_devnode_one(const char *dest, const char *node) { int r; assert(dest); @@ -2235,10 +2235,6 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno /* If arg_uid_shift != 0, then we cannot fall back to use bind mount. */ if (!(arg_userns_mode == USER_NAMESPACE_NO || (arg_userns_mode == USER_NAMESPACE_FIXED && arg_uid_shift == 0))) { - if (ignore_mknod_failure) { - log_debug_errno(r, "Failed to mknod(%s), ignoring: %m", to); - return 0; - } if (arg_userns_mode != USER_NAMESPACE_MANAGED || !ERRNO_IS_NEG_PRIVILEGE(r)) return log_error_errno(r, "Failed to mknod(%s): %m", to); @@ -2247,14 +2243,9 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno } /* Some systems abusively restrict mknod but allow bind mounts. */ - if (bind_mount_devnode(from, to) < 0) { + if (bind_mount_devnode(from, to) < 0) /* use the original error code. */ - if (ignore_mknod_failure) { - log_debug_errno(r, "Both mknod() and bind mount %s failed, ignoring: %m", to); - return 0; - } return log_error_errno(r, "Both mknod() and bind mount %s failed: %m", to); - } } else { /* mknod() succeeds, chown() it if necessary. */ r = userns_lchown(to, 0, 0); @@ -2294,7 +2285,7 @@ static int copy_devnodes(const char *dest) { assert(dest); FOREACH_STRING(node, "null", "zero", "full", "random", "urandom", "tty") { - r = copy_devnode_one(dest, node, /* ignore_mknod_failure = */ false); + r = copy_devnode_one(dest, node); if (r < 0) return r; } @@ -2303,7 +2294,7 @@ static int copy_devnodes(const char *dest) { * units that invoke nspawn may enable DevicePolicy= without DeviceAllow= for the device node. */ _cleanup_close_ int fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR); if (fuse_fd >= 0) { - r = copy_devnode_one(dest, "fuse", /* ignore_mknod_failure = */ false); + r = copy_devnode_one(dest, "fuse"); if (r < 0) return r; } @@ -2311,7 +2302,7 @@ static int copy_devnodes(const char *dest) { /* Similarly, create /dev/net/tun only when it is accessible. */ _cleanup_close_ int tun_fd = open("/dev/net/tun", O_CLOEXEC|O_RDWR); if (tun_fd >= 0) { - r = copy_devnode_one(dest, "net/tun", /* ignore_mknod_failure = */ false); + r = copy_devnode_one(dest, "net/tun"); if (r < 0) return r; } From 114d191a17041d8c34041de49442fb1f577349f1 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Tue, 18 Feb 2025 23:35:13 +0900 Subject: [PATCH 5/5] nspawn: move the accessibility check for device nodes into copy_devnode_one() --- src/nspawn/nspawn.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index d73f83f3fb..bcf0baaf7f 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2189,7 +2189,7 @@ static int bind_mount_devnode(const char *from, const char *to) { return 0; } -static int copy_devnode_one(const char *dest, const char *node) { +static int copy_devnode_one(const char *dest, const char *node, bool check) { int r; assert(dest); @@ -2201,6 +2201,19 @@ static int copy_devnode_one(const char *dest, const char *node) { if (!from) return log_oom(); + if (check) { + /* If 'check' is true, create /dev/fuse only when it is accessible. The check is necessary, + * as some custom service units that invoke systemd-nspawn may enable DevicePolicy= without + * DeviceAllow= for the device node. */ + _cleanup_close_ int fd = open(from, O_CLOEXEC|O_RDWR); + if (fd < 0) { + log_debug_errno(errno, + "Failed to open %s, skipping creation of the device node in the container, ignoring: %m", + from); + return 0; + } + } + _cleanup_free_ char *to = path_join(dest, from); if (!to) return log_oom(); @@ -2284,25 +2297,16 @@ static int copy_devnodes(const char *dest) { assert(dest); + /* Required basic device nodes. */ FOREACH_STRING(node, "null", "zero", "full", "random", "urandom", "tty") { - r = copy_devnode_one(dest, node); + r = copy_devnode_one(dest, node, /* check = */ false); if (r < 0) return r; } - /* Create /dev/fuse only when it is accessible. The check is necessary, as some custom service - * units that invoke nspawn may enable DevicePolicy= without DeviceAllow= for the device node. */ - _cleanup_close_ int fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR); - if (fuse_fd >= 0) { - r = copy_devnode_one(dest, "fuse"); - if (r < 0) - return r; - } - - /* Similarly, create /dev/net/tun only when it is accessible. */ - _cleanup_close_ int tun_fd = open("/dev/net/tun", O_CLOEXEC|O_RDWR); - if (tun_fd >= 0) { - r = copy_devnode_one(dest, "net/tun"); + /* Optional device nodes. */ + FOREACH_STRING(node, "fuse", "net/tun") { + r = copy_devnode_one(dest, node, /* check = */ true); if (r < 0) return r; }