mirror of
https://github.com/morgan9e/systemd
synced 2026-04-14 00:14:32 +09:00
namespace-util: modernize fd_is_namespace() and is_our_namespace()
- Make fd_is_namespace() take NamespaceType
- Drop support for kernels without NS_GET_NSTYPE (< 4.11)
- Port is_our_namespace() to namespace_open_by_type() (preparation for later commits, where the latter would go by pidfd if available, avoiding procfs)
This commit is contained in:
8
README
8
README
@@ -35,6 +35,7 @@ REQUIREMENTS:
|
||||
≥ 4.9 for RENAME_NOREPLACE support in vfat
|
||||
≥ 4.10 for cgroup-bpf egress and ingress hooks
|
||||
≥ 4.11 for nsfs
|
||||
# FIXME: drop compat glue and remove entries above before v258
|
||||
≥ 4.15 for cgroup-bpf device hook and cpu controller in cgroup v2
|
||||
≥ 4.17 for cgroup-bpf socket address hooks and /sys/power/resume_offset
|
||||
≥ 4.20 for PSI (used by systemd-oomd)
|
||||
@@ -43,16 +44,17 @@ REQUIREMENTS:
|
||||
≥ 5.4 for pidfd, new mount API, and signed Verity images
|
||||
≥ 5.6 for getrandom() GRND_INSECURE
|
||||
≥ 5.7 for CLONE_INTO_CGROUP, BPF links and the BPF LSM hook
|
||||
≥ 5.9 for close_range()
|
||||
≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
|
||||
≥ 5.9 for close_range()
|
||||
≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
|
||||
≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
|
||||
and MOVE_MOUNT_BENEATH
|
||||
≥ 6.9 for pidfs
|
||||
|
||||
⛔ Kernel versions below 4.3 ("minimum baseline") are not supported at
|
||||
⛔ Kernel versions below 4.11 ("minimum baseline") are not supported at
|
||||
all, and are missing required functionality (e.g. CLOCK_BOOTTIME support
|
||||
for timerfd_create(), getrandom(), ambient capabilities, or memfd_create()).
|
||||
for timerfd_create(), getrandom(), ambient capabilities, memfd_create(),
|
||||
or nsfs (NS_GET_NSTYPE)).
|
||||
|
||||
⚠️ Kernel versions below 5.4 ("recommended baseline") have significant
|
||||
gaps in functionality and are not recommended for use with this version
|
||||
|
||||
@@ -40,7 +40,7 @@ const struct namespace_info namespace_info[_NAMESPACE_TYPE_MAX + 1] = {
|
||||
|
||||
#define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[type].proc_path)
|
||||
|
||||
static NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
|
||||
NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
|
||||
for (NamespaceType t = 0; t < _NAMESPACE_TYPE_MAX; t++)
|
||||
if (((namespace_info[t].clone_flag ^ clone_flag) & (CLONE_NEWCGROUP|CLONE_NEWIPC|CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUSER|CLONE_NEWUTS|CLONE_NEWTIME)) == 0)
|
||||
return t;
|
||||
@@ -157,10 +157,10 @@ int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int
|
||||
/* Can't setns to your own userns, since then you could escalate from non-root to root in
|
||||
* your own namespace, so check if namespaces are equal before attempting to enter. */
|
||||
|
||||
r = inode_same_at(userns_fd, "", AT_FDCWD, "/proc/self/ns/user", AT_EMPTY_PATH);
|
||||
r = is_our_namespace(userns_fd, NAMESPACE_USER);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r)
|
||||
if (r > 0)
|
||||
userns_fd = -EBADF;
|
||||
}
|
||||
|
||||
@@ -191,50 +191,49 @@ int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int
|
||||
return reset_uid_gid();
|
||||
}
|
||||
|
||||
int fd_is_ns(int fd, unsigned long nsflag) {
|
||||
struct statfs s;
|
||||
int fd_is_namespace(int fd, NamespaceType type) {
|
||||
int r;
|
||||
|
||||
/* Checks whether the specified file descriptor refers to a namespace created by specifying nsflag in clone().
|
||||
* On old kernels there's no nice way to detect that, hence on those we'll return a recognizable error (EUCLEAN),
|
||||
* so that callers can handle this somewhat nicely.
|
||||
*
|
||||
* This function returns > 0 if the fd definitely refers to a network namespace, 0 if it definitely does not
|
||||
* refer to a network namespace, -EUCLEAN if we can't determine, and other negative error codes on error. */
|
||||
/* Checks whether the specified file descriptor refers to a namespace (of the specified type, if type != _NAMESPACE_INVALID). */
|
||||
|
||||
if (fstatfs(fd, &s) < 0)
|
||||
assert(fd >= 0);
|
||||
assert(type < _NAMESPACE_TYPE_MAX);
|
||||
|
||||
r = fd_is_fs_type(fd, NSFS_MAGIC);
|
||||
if (r <= 0)
|
||||
return r;
|
||||
|
||||
if (type < 0)
|
||||
return true;
|
||||
|
||||
int clone_flag = ioctl(fd, NS_GET_NSTYPE);
|
||||
if (clone_flag < 0)
|
||||
return -errno;
|
||||
|
||||
if (!is_fs_type(&s, NSFS_MAGIC)) {
|
||||
/* On really old kernels, there was no "nsfs", and network namespace sockets belonged to procfs
|
||||
* instead. Handle that in a somewhat smart way. */
|
||||
NamespaceType found_type = clone_flag_to_namespace_type(clone_flag);
|
||||
if (found_type < 0)
|
||||
return -EBADF; /* Uh? Unknown namespace type? */
|
||||
|
||||
if (is_fs_type(&s, PROC_SUPER_MAGIC)) {
|
||||
struct statfs t;
|
||||
return found_type == type;
|
||||
}
|
||||
|
||||
/* OK, so it is procfs. Let's see if our own network namespace is procfs, too. If so, then the
|
||||
* passed fd might refer to a network namespace, but we can't know for sure. In that case,
|
||||
* return a recognizable error. */
|
||||
int is_our_namespace(int fd, NamespaceType type) {
|
||||
int r;
|
||||
|
||||
if (statfs("/proc/self/ns/net", &t) < 0)
|
||||
return -errno;
|
||||
assert(fd >= 0);
|
||||
assert(type < _NAMESPACE_TYPE_MAX);
|
||||
|
||||
if (s.f_type == t.f_type)
|
||||
return -EUCLEAN; /* It's possible, we simply don't know */
|
||||
}
|
||||
r = fd_is_namespace(fd, type);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0) /* Not a namespace or not of the right type? */
|
||||
return -EUCLEAN;
|
||||
|
||||
return 0; /* No! */
|
||||
}
|
||||
_cleanup_close_ int our_ns = namespace_open_by_type(type);
|
||||
if (our_ns < 0)
|
||||
return our_ns;
|
||||
|
||||
r = ioctl(fd, NS_GET_NSTYPE);
|
||||
if (r < 0) {
|
||||
if (errno == ENOTTY) /* Old kernels didn't know this ioctl, let's also return a recognizable error in that case */
|
||||
return -EUCLEAN;
|
||||
|
||||
return -errno;
|
||||
}
|
||||
|
||||
return (unsigned long) r == nsflag;
|
||||
return fd_inode_same(fd, our_ns);
|
||||
}
|
||||
|
||||
int detach_mount_namespace(void) {
|
||||
@@ -505,37 +504,6 @@ int namespace_is_init(NamespaceType type) {
|
||||
return st.st_ino == namespace_info[type].root_inode;
|
||||
}
|
||||
|
||||
int is_our_namespace(int fd, NamespaceType request_type) {
|
||||
int clone_flag;
|
||||
|
||||
assert(fd >= 0);
|
||||
|
||||
clone_flag = ioctl(fd, NS_GET_NSTYPE);
|
||||
if (clone_flag < 0)
|
||||
return -errno;
|
||||
|
||||
NamespaceType found_type = clone_flag_to_namespace_type(clone_flag);
|
||||
if (found_type < 0)
|
||||
return -EBADF; /* Uh? Unknown namespace type? */
|
||||
|
||||
if (request_type >= 0 && request_type != found_type) /* It's a namespace, but not of the right type? */
|
||||
return -EUCLEAN;
|
||||
|
||||
struct stat st_fd, st_ours;
|
||||
if (fstat(fd, &st_fd) < 0)
|
||||
return -errno;
|
||||
|
||||
const char *p = pid_namespace_path(0, found_type);
|
||||
if (stat(p, &st_ours) < 0) {
|
||||
if (errno == ENOENT)
|
||||
return proc_mounted() == 0 ? -ENOSYS : -ENOENT;
|
||||
|
||||
return -errno;
|
||||
}
|
||||
|
||||
return stat_inode_same(&st_ours, &st_fd);
|
||||
}
|
||||
|
||||
int is_idmapping_supported(const char *path) {
|
||||
_cleanup_close_ int mount_fd = -EBADF, userns_fd = -EBADF, dir_fd = -EBADF;
|
||||
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
|
||||
|
||||
@@ -27,6 +27,8 @@ extern const struct namespace_info {
|
||||
ino_t root_inode;
|
||||
} namespace_info[_NAMESPACE_TYPE_MAX + 1];
|
||||
|
||||
NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag);
|
||||
|
||||
int pidref_namespace_open(
|
||||
const PidRef *pidref,
|
||||
int *ret_pidns_fd,
|
||||
@@ -41,9 +43,11 @@ int namespace_open(
|
||||
int *ret_netns_fd,
|
||||
int *ret_userns_fd,
|
||||
int *ret_root_fd);
|
||||
|
||||
int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd);
|
||||
|
||||
int fd_is_ns(int fd, unsigned long nsflag);
|
||||
int fd_is_namespace(int fd, NamespaceType type);
|
||||
int is_our_namespace(int fd, NamespaceType type);
|
||||
|
||||
int detach_mount_namespace(void);
|
||||
int detach_mount_namespace_harder(uid_t target_uid, gid_t target_gid);
|
||||
@@ -77,6 +81,4 @@ int namespace_open_by_type(NamespaceType type);
|
||||
|
||||
int namespace_is_init(NamespaceType type);
|
||||
|
||||
int is_our_namespace(int fd, NamespaceType type);
|
||||
|
||||
int is_idmapping_supported(const char *path);
|
||||
|
||||
@@ -3253,6 +3253,7 @@ int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) {
|
||||
|
||||
int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
|
||||
_cleanup_close_ int ns = -EBADF;
|
||||
NamespaceType type;
|
||||
int r;
|
||||
|
||||
assert(ns_storage_socket);
|
||||
@@ -3264,6 +3265,9 @@ int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, un
|
||||
* it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
|
||||
* allocate a new anonymous ns if needed. */
|
||||
|
||||
type = clone_flag_to_namespace_type(nsflag);
|
||||
assert(type >= 0);
|
||||
|
||||
r = posix_lock(ns_storage_socket[0], LOCK_EX);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@@ -3282,11 +3286,11 @@ int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, un
|
||||
if (ns < 0)
|
||||
return -errno;
|
||||
|
||||
r = fd_is_ns(ns, nsflag);
|
||||
r = fd_is_namespace(ns, type);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0)
|
||||
return -EINVAL;
|
||||
if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
|
||||
return r;
|
||||
|
||||
r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
|
||||
if (r < 0)
|
||||
|
||||
@@ -227,7 +227,7 @@ static int validate_userns(sd_varlink *link, int *userns_fd) {
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
|
||||
|
||||
r = fd_is_ns(*userns_fd, CLONE_NEWUSER);
|
||||
r = fd_is_namespace(*userns_fd, NAMESPACE_USER);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0)
|
||||
|
||||
@@ -5249,12 +5249,10 @@ static int run_container(
|
||||
if (child_netns_fd < 0)
|
||||
return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
|
||||
|
||||
r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
|
||||
if (r == -EUCLEAN)
|
||||
log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
|
||||
else if (r < 0)
|
||||
r = fd_is_namespace(child_netns_fd, NAMESPACE_NET);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
|
||||
else if (r == 0)
|
||||
if (r == 0)
|
||||
return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
|
||||
"Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
|
||||
}
|
||||
|
||||
@@ -666,7 +666,7 @@ static int validate_userns(sd_varlink *link, int userns_fd) {
|
||||
return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
|
||||
|
||||
/* Validate this is actually a valid user namespace fd */
|
||||
r = fd_is_ns(userns_fd, CLONE_NEWUSER);
|
||||
r = fd_is_namespace(userns_fd, NAMESPACE_USER);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to check if user namespace fd is actually a user namespace: %m");
|
||||
if (r == 0)
|
||||
@@ -1455,7 +1455,7 @@ static int validate_netns(sd_varlink *link, int userns_fd, int netns_fd) {
|
||||
return log_debug_errno(r, "Network namespace file descriptor has unsafe flags set: %m");
|
||||
|
||||
/* Validate this is actually a valid network namespace fd */
|
||||
r = fd_is_ns(netns_fd, CLONE_NEWNET);
|
||||
r = fd_is_namespace(netns_fd, NAMESPACE_NET);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0)
|
||||
|
||||
@@ -280,7 +280,7 @@ int userns_restrict_put_by_fd(
|
||||
assert(userns_fd >= 0);
|
||||
assert(n_mount_fds == 0 || mount_fds);
|
||||
|
||||
r = fd_is_ns(userns_fd, CLONE_NEWUSER);
|
||||
r = fd_is_namespace(userns_fd, NAMESPACE_USER);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m");
|
||||
if (r == 0)
|
||||
|
||||
@@ -1097,7 +1097,7 @@ static int mount_in_namespace(
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
|
||||
|
||||
r = inode_same_at(mntns_fd, "", AT_FDCWD, "/proc/self/ns/mnt", AT_EMPTY_PATH);
|
||||
r = is_our_namespace(mntns_fd, NAMESPACE_MOUNT);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m");
|
||||
/* We can't add new mounts at runtime if the process wasn't started in a namespace */
|
||||
|
||||
@@ -164,6 +164,31 @@ TEST(ipcns) {
|
||||
test_shareable_ns(CLONE_NEWIPC);
|
||||
}
|
||||
|
||||
TEST(fd_is_namespace) {
|
||||
_cleanup_close_ int fd = -EBADF;
|
||||
|
||||
ASSERT_OK_ZERO(fd_is_namespace(STDIN_FILENO, NAMESPACE_NET));
|
||||
ASSERT_OK_ZERO(fd_is_namespace(STDOUT_FILENO, NAMESPACE_NET));
|
||||
ASSERT_OK_ZERO(fd_is_namespace(STDERR_FILENO, NAMESPACE_NET));
|
||||
|
||||
fd = namespace_open_by_type(NAMESPACE_MOUNT);
|
||||
if (IN_SET(fd, -ENOSYS, -ENOENT)) {
|
||||
log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt");
|
||||
return;
|
||||
}
|
||||
ASSERT_OK(fd);
|
||||
ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_MOUNT));
|
||||
ASSERT_OK_ZERO(fd_is_namespace(fd, NAMESPACE_NET));
|
||||
fd = safe_close(fd);
|
||||
|
||||
ASSERT_OK(fd = namespace_open_by_type(NAMESPACE_IPC));
|
||||
ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_IPC));
|
||||
fd = safe_close(fd);
|
||||
|
||||
ASSERT_OK(fd = namespace_open_by_type(NAMESPACE_NET));
|
||||
ASSERT_OK_POSITIVE(fd_is_namespace(fd, NAMESPACE_NET));
|
||||
}
|
||||
|
||||
TEST(protect_kernel_logs) {
|
||||
static const NamespaceParameters p = {
|
||||
.runtime_scope = RUNTIME_SCOPE_SYSTEM,
|
||||
|
||||
@@ -165,31 +165,6 @@ TEST(path_is_read_only_fs) {
|
||||
assert_se(path_is_read_only_fs("/i-dont-exist") == -ENOENT);
|
||||
}
|
||||
|
||||
TEST(fd_is_ns) {
|
||||
_cleanup_close_ int fd = -EBADF;
|
||||
|
||||
assert_se(fd_is_ns(STDIN_FILENO, CLONE_NEWNET) == 0);
|
||||
assert_se(fd_is_ns(STDERR_FILENO, CLONE_NEWNET) == 0);
|
||||
assert_se(fd_is_ns(STDOUT_FILENO, CLONE_NEWNET) == 0);
|
||||
|
||||
fd = open("/proc/self/ns/mnt", O_CLOEXEC|O_RDONLY);
|
||||
if (fd < 0) {
|
||||
assert_se(errno == ENOENT);
|
||||
log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt");
|
||||
return;
|
||||
}
|
||||
assert_se(fd >= 0);
|
||||
assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 0, -EUCLEAN));
|
||||
fd = safe_close(fd);
|
||||
|
||||
assert_se((fd = open("/proc/self/ns/ipc", O_CLOEXEC|O_RDONLY)) >= 0);
|
||||
assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWIPC), 1, -EUCLEAN));
|
||||
fd = safe_close(fd);
|
||||
|
||||
assert_se((fd = open("/proc/self/ns/net", O_CLOEXEC|O_RDONLY)) >= 0);
|
||||
assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 1, -EUCLEAN));
|
||||
}
|
||||
|
||||
TEST(dir_is_empty) {
|
||||
_cleanup_(rm_rf_physical_and_freep) char *empty_dir = NULL;
|
||||
_cleanup_free_ char *j = NULL, *jj = NULL, *jjj = NULL;
|
||||
|
||||
Reference in New Issue
Block a user