namespace-util: modernize fd_is_namespace() and is_our_namespace()

- Make fd_is_namespace() take NamespaceType
- Drop support for kernels without NS_GET_NSTYPE (< 4.11)
- Port is_our_namespace() to namespace_open_by_type()
  (preparation for later commits, where the latter
  would go by pidfd if available, avoiding procfs)
This commit is contained in:
Mike Yuan
2024-11-27 16:35:11 +01:00
parent 1775337a36
commit 07610cafcf
11 changed files with 85 additions and 111 deletions

8
README
View File

@@ -35,6 +35,7 @@ REQUIREMENTS:
≥ 4.9 for RENAME_NOREPLACE support in vfat
≥ 4.10 for cgroup-bpf egress and ingress hooks
≥ 4.11 for nsfs
# FIXME: drop compat glue and remove entries above before v258
≥ 4.15 for cgroup-bpf device hook and cpu controller in cgroup v2
≥ 4.17 for cgroup-bpf socket address hooks and /sys/power/resume_offset
≥ 4.20 for PSI (used by systemd-oomd)
@@ -43,16 +44,17 @@ REQUIREMENTS:
≥ 5.4 for pidfd, new mount API, and signed Verity images
≥ 5.6 for getrandom() GRND_INSECURE
≥ 5.7 for CLONE_INTO_CGROUP, BPF links and the BPF LSM hook
≥ 5.9 for close_range()
≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
≥ 5.9 for close_range()
≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
and MOVE_MOUNT_BENEATH
≥ 6.9 for pidfs
⛔ Kernel versions below 4.3 ("minimum baseline") are not supported at
⛔ Kernel versions below 4.11 ("minimum baseline") are not supported at
all, and are missing required functionality (e.g. CLOCK_BOOTTIME support
for timerfd_create(), getrandom(), ambient capabilities, or memfd_create()).
for timerfd_create(), getrandom(), ambient capabilities, memfd_create(),
or nsfs (NS_GET_NSTYPE)).
⚠️ Kernel versions below 5.4 ("recommended baseline") have significant
gaps in functionality and are not recommended for use with this version

View File

@@ -40,7 +40,7 @@ const struct namespace_info namespace_info[_NAMESPACE_TYPE_MAX + 1] = {
#define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[type].proc_path)
static NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
for (NamespaceType t = 0; t < _NAMESPACE_TYPE_MAX; t++)
if (((namespace_info[t].clone_flag ^ clone_flag) & (CLONE_NEWCGROUP|CLONE_NEWIPC|CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUSER|CLONE_NEWUTS|CLONE_NEWTIME)) == 0)
return t;
@@ -157,10 +157,10 @@ int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int
/* Can't setns to your own userns, since then you could escalate from non-root to root in
* your own namespace, so check if namespaces are equal before attempting to enter. */
r = inode_same_at(userns_fd, "", AT_FDCWD, "/proc/self/ns/user", AT_EMPTY_PATH);
r = is_our_namespace(userns_fd, NAMESPACE_USER);
if (r < 0)
return r;
if (r)
if (r > 0)
userns_fd = -EBADF;
}
@@ -191,50 +191,49 @@ int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int
return reset_uid_gid();
}
int fd_is_ns(int fd, unsigned long nsflag) {
struct statfs s;
int fd_is_namespace(int fd, NamespaceType type) {
int r;
/* Checks whether the specified file descriptor refers to a namespace created by specifying nsflag in clone().
* On old kernels there's no nice way to detect that, hence on those we'll return a recognizable error (EUCLEAN),
* so that callers can handle this somewhat nicely.
*
* This function returns > 0 if the fd definitely refers to a network namespace, 0 if it definitely does not
* refer to a network namespace, -EUCLEAN if we can't determine, and other negative error codes on error. */
/* Checks whether the specified file descriptor refers to a namespace (and, if type >= 0, whether it is of the specified type). */
if (fstatfs(fd, &s) < 0)
assert(fd >= 0);
assert(type < _NAMESPACE_TYPE_MAX);
r = fd_is_fs_type(fd, NSFS_MAGIC);
if (r <= 0)
return r;
if (type < 0)
return true;
int clone_flag = ioctl(fd, NS_GET_NSTYPE);
if (clone_flag < 0)
return -errno;
if (!is_fs_type(&s, NSFS_MAGIC)) {
/* On really old kernels, there was no "nsfs", and network namespace sockets belonged to procfs
* instead. Handle that in a somewhat smart way. */
NamespaceType found_type = clone_flag_to_namespace_type(clone_flag);
if (found_type < 0)
return -EBADF; /* Uh? Unknown namespace type? */
if (is_fs_type(&s, PROC_SUPER_MAGIC)) {
struct statfs t;
return found_type == type;
}
/* OK, so it is procfs. Let's see if our own network namespace is procfs, too. If so, then the
* passed fd might refer to a network namespace, but we can't know for sure. In that case,
* return a recognizable error. */
int is_our_namespace(int fd, NamespaceType type) {
int r;
if (statfs("/proc/self/ns/net", &t) < 0)
return -errno;
assert(fd >= 0);
assert(type < _NAMESPACE_TYPE_MAX);
if (s.f_type == t.f_type)
return -EUCLEAN; /* It's possible, we simply don't know */
}
r = fd_is_namespace(fd, type);
if (r < 0)
return r;
if (r == 0) /* Not a namespace or not of the right type? */
return -EUCLEAN;
return 0; /* No! */
}
_cleanup_close_ int our_ns = namespace_open_by_type(type);
if (our_ns < 0)
return our_ns;
r = ioctl(fd, NS_GET_NSTYPE);
if (r < 0) {
if (errno == ENOTTY) /* Old kernels didn't know this ioctl, let's also return a recognizable error in that case */
return -EUCLEAN;
return -errno;
}
return (unsigned long) r == nsflag;
return fd_inode_same(fd, our_ns);
}
int detach_mount_namespace(void) {
@@ -505,37 +504,6 @@ int namespace_is_init(NamespaceType type) {
return st.st_ino == namespace_info[type].root_inode;
}
/* Determines whether the namespace referenced by fd is the one the calling process is in.
 *
 * The namespace type is queried from the fd itself via NS_GET_NSTYPE. If request_type is non-negative
 * and differs from the detected type, -EUCLEAN is returned. An unrecognized type yields -EBADF. If our
 * own namespace path does not exist, -ENOSYS is returned when proc_mounted() reports 0, and -ENOENT
 * otherwise. Other failures are returned as negative errno values. On success the result is whatever
 * stat_inode_same() reports for the two inodes. */
int is_our_namespace(int fd, NamespaceType request_type) {
        struct stat fd_stat, own_stat;
        NamespaceType detected;
        const char *own_path;
        int nstype;

        assert(fd >= 0);

        /* Ask the kernel which kind of namespace this fd refers to (returned as a CLONE_NEW* flag). */
        nstype = ioctl(fd, NS_GET_NSTYPE);
        if (nstype < 0)
                return -errno;

        detected = clone_flag_to_namespace_type(nstype);
        if (detected < 0)
                return -EBADF; /* Uh? Unknown namespace type? */

        /* A negative request_type means "any type"; otherwise the types have to agree. */
        if (!(request_type < 0 || request_type == detected))
                return -EUCLEAN; /* It's a namespace, but not of the right type? */

        if (fstat(fd, &fd_stat) < 0)
                return -errno;

        /* Compare against the inode of our own namespace of the detected type. */
        own_path = pid_namespace_path(0, detected);
        if (stat(own_path, &own_stat) >= 0)
                return stat_inode_same(&own_stat, &fd_stat);

        if (errno != ENOENT)
                return -errno;

        /* The path is missing: tell apart "procfs not mounted" from "entry really absent". */
        return proc_mounted() == 0 ? -ENOSYS : -ENOENT;
}
int is_idmapping_supported(const char *path) {
_cleanup_close_ int mount_fd = -EBADF, userns_fd = -EBADF, dir_fd = -EBADF;
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;

View File

@@ -27,6 +27,8 @@ extern const struct namespace_info {
ino_t root_inode;
} namespace_info[_NAMESPACE_TYPE_MAX + 1];
NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag);
int pidref_namespace_open(
const PidRef *pidref,
int *ret_pidns_fd,
@@ -41,9 +43,11 @@ int namespace_open(
int *ret_netns_fd,
int *ret_userns_fd,
int *ret_root_fd);
int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd);
int fd_is_ns(int fd, unsigned long nsflag);
int fd_is_namespace(int fd, NamespaceType type);
int is_our_namespace(int fd, NamespaceType type);
int detach_mount_namespace(void);
int detach_mount_namespace_harder(uid_t target_uid, gid_t target_gid);
@@ -77,6 +81,4 @@ int namespace_open_by_type(NamespaceType type);
int namespace_is_init(NamespaceType type);
int is_our_namespace(int fd, NamespaceType type);
int is_idmapping_supported(const char *path);

View File

@@ -3253,6 +3253,7 @@ int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) {
int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
_cleanup_close_ int ns = -EBADF;
NamespaceType type;
int r;
assert(ns_storage_socket);
@@ -3264,6 +3265,9 @@ int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, un
* it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
* allocate a new anonymous ns if needed. */
type = clone_flag_to_namespace_type(nsflag);
assert(type >= 0);
r = posix_lock(ns_storage_socket[0], LOCK_EX);
if (r < 0)
return r;
@@ -3282,11 +3286,11 @@ int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, un
if (ns < 0)
return -errno;
r = fd_is_ns(ns, nsflag);
r = fd_is_namespace(ns, type);
if (r < 0)
return r;
if (r == 0)
return -EINVAL;
if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
return r;
r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
if (r < 0)

View File

@@ -227,7 +227,7 @@ static int validate_userns(sd_varlink *link, int *userns_fd) {
if (r < 0)
return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
r = fd_is_ns(*userns_fd, CLONE_NEWUSER);
r = fd_is_namespace(*userns_fd, NAMESPACE_USER);
if (r < 0)
return r;
if (r == 0)

View File

@@ -5249,12 +5249,10 @@ static int run_container(
if (child_netns_fd < 0)
return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
if (r == -EUCLEAN)
log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
else if (r < 0)
r = fd_is_namespace(child_netns_fd, NAMESPACE_NET);
if (r < 0)
return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
else if (r == 0)
if (r == 0)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
"Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
}

View File

@@ -666,7 +666,7 @@ static int validate_userns(sd_varlink *link, int userns_fd) {
return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
/* Validate this is actually a valid user namespace fd */
r = fd_is_ns(userns_fd, CLONE_NEWUSER);
r = fd_is_namespace(userns_fd, NAMESPACE_USER);
if (r < 0)
return log_debug_errno(r, "Failed to check if user namespace fd is actually a user namespace: %m");
if (r == 0)
@@ -1455,7 +1455,7 @@ static int validate_netns(sd_varlink *link, int userns_fd, int netns_fd) {
return log_debug_errno(r, "Network namespace file descriptor has unsafe flags set: %m");
/* Validate this is actually a valid network namespace fd */
r = fd_is_ns(netns_fd, CLONE_NEWNET);
r = fd_is_namespace(netns_fd, NAMESPACE_NET);
if (r < 0)
return r;
if (r == 0)

View File

@@ -280,7 +280,7 @@ int userns_restrict_put_by_fd(
assert(userns_fd >= 0);
assert(n_mount_fds == 0 || mount_fds);
r = fd_is_ns(userns_fd, CLONE_NEWUSER);
r = fd_is_namespace(userns_fd, NAMESPACE_USER);
if (r < 0)
return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m");
if (r == 0)

View File

@@ -1097,7 +1097,7 @@ static int mount_in_namespace(
if (r < 0)
return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
r = inode_same_at(mntns_fd, "", AT_FDCWD, "/proc/self/ns/mnt", AT_EMPTY_PATH);
r = is_our_namespace(mntns_fd, NAMESPACE_MOUNT);
if (r < 0)
return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m");
/* We can't add new mounts at runtime if the process wasn't started in a namespace */

View File

@@ -164,6 +164,31 @@ TEST(ipcns) {
test_shareable_ns(CLONE_NEWIPC);
}
/* Exercises fd_is_namespace() on the standard streams and on fds obtained from namespace_open_by_type(). */
TEST(fd_is_namespace) {
_cleanup_close_ int fd = -EBADF;
/* The standard streams are expected to be reported as "not a network namespace" (zero, no error). */
ASSERT_OK_ZERO(fd_is_namespace(STDIN_FILENO, NAMESPACE_NET));
ASSERT_OK_ZERO(fd_is_namespace(STDOUT_FILENO, NAMESPACE_NET));
ASSERT_OK_ZERO(fd_is_namespace(STDERR_FILENO, NAMESPACE_NET));
/* Open our own mount namespace; if that fails with -ENOSYS or -ENOENT, skip the rest of the test. */
fd = namespace_open_by_type(NAMESPACE_MOUNT);
if (IN_SET(fd, -ENOSYS, -ENOENT)) {
log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt");
return;
}
ASSERT_OK(fd);
/* A mount namespace fd must match NAMESPACE_MOUNT and must not match NAMESPACE_NET. */
ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_MOUNT));
ASSERT_OK_ZERO(fd_is_namespace(fd, NAMESPACE_NET));
/* Close and reuse the same variable for the next namespace fd. */
fd = safe_close(fd);
ASSERT_OK(fd = namespace_open_by_type(NAMESPACE_IPC));
ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_IPC));
fd = safe_close(fd);
/* The last fd is left for _cleanup_close_ to release. */
ASSERT_OK(fd = namespace_open_by_type(NAMESPACE_NET));
ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_NET));
}
TEST(protect_kernel_logs) {
static const NamespaceParameters p = {
.runtime_scope = RUNTIME_SCOPE_SYSTEM,

View File

@@ -165,31 +165,6 @@ TEST(path_is_read_only_fs) {
assert_se(path_is_read_only_fs("/i-dont-exist") == -ENOENT);
}
/* Exercises fd_is_ns() on the standard streams and on /proc/self/ns/ entries. For the namespace fds,
 * -EUCLEAN is accepted as an alternative result. */
TEST(fd_is_ns) {
_cleanup_close_ int fd = -EBADF;
/* The standard streams must be reported as "not a network namespace". */
assert_se(fd_is_ns(STDIN_FILENO, CLONE_NEWNET) == 0);
assert_se(fd_is_ns(STDERR_FILENO, CLONE_NEWNET) == 0);
assert_se(fd_is_ns(STDOUT_FILENO, CLONE_NEWNET) == 0);
/* Open our own mount namespace; the only tolerated open() failure is ENOENT, which skips the test. */
fd = open("/proc/self/ns/mnt", O_CLOEXEC|O_RDONLY);
if (fd < 0) {
assert_se(errno == ENOENT);
log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt");
return;
}
assert_se(fd >= 0);
/* A mount namespace fd checked against CLONE_NEWNET: either a definite "no" or -EUCLEAN. */
assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 0, -EUCLEAN));
fd = safe_close(fd);
/* Matching type: either a definite "yes" (1) or -EUCLEAN. */
assert_se((fd = open("/proc/self/ns/ipc", O_CLOEXEC|O_RDONLY)) >= 0);
assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWIPC), 1, -EUCLEAN));
fd = safe_close(fd);
assert_se((fd = open("/proc/self/ns/net", O_CLOEXEC|O_RDONLY)) >= 0);
assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 1, -EUCLEAN));
}
TEST(dir_is_empty) {
_cleanup_(rm_rf_physical_and_freep) char *empty_dir = NULL;
_cleanup_free_ char *j = NULL, *jj = NULL, *jjj = NULL;