From c46344d5976beb6b60e8ba7691d9b60a945e0562 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Wed, 29 Oct 2025 13:38:38 +0100 Subject: [PATCH 1/3] mount-setup: Reformat table Preparation for the next commit. --- src/shared/mount-setup.c | 147 ++++++++++++++++++++++++++++++--------- 1 file changed, 116 insertions(+), 31 deletions(-) diff --git a/src/shared/mount-setup.c b/src/shared/mount-setup.c index 446ec51695..7fb3416c6c 100644 --- a/src/shared/mount-setup.c +++ b/src/shared/mount-setup.c @@ -25,7 +25,6 @@ #include "virt.h" typedef enum MountMode { - MNT_NONE = 0, MNT_FATAL = 1 << 0, MNT_IN_CONTAINER = 1 << 1, MNT_CHECK_WRITABLE = 1 << 2, @@ -73,44 +72,130 @@ int mount_cgroupfs(const char *path) { } static const MountPoint mount_table[] = { - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK }, - { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_FATAL|MNT_IN_CONTAINER }, - { "devtmpfs", "/dev", "devtmpfs", "mode=0755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME, - MNT_FATAL|MNT_IN_CONTAINER }, - { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_NONE }, + { + .what = "proc", + .where = "/proc", + .type = "proc", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK, + }, + { + .what = "sysfs", + .where = "/sys", + .type = "sysfs", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + .mode = MNT_FATAL|MNT_IN_CONTAINER, + }, + { + .what = "devtmpfs", + .where = "/dev", + .type = "devtmpfs", + .options = "mode=0755" TMPFS_LIMITS_DEV, + .flags = MS_NOSUID|MS_STRICTATIME, + .mode = MNT_FATAL|MNT_IN_CONTAINER, + }, + { + .what = "securityfs", + .where = "/sys/kernel/security", + .type = "securityfs", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + }, #if ENABLE_SMACK - { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_FATAL, mac_smack_use }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=01777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME, - MNT_FATAL|MNT_USRQUOTA_GRACEFUL, mac_smack_use }, + { + .what = "smackfs", + .where = "/sys/fs/smackfs", + .type = "smackfs", + .options = "smackfsdef=*", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + .mode = MNT_FATAL, + .condition_fn = mac_smack_use, + }, + { + .what = "tmpfs", + .where = "/dev/shm", + .type = "tmpfs", + .options = "mode=01777,smackfsroot=*", + .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME, + .mode = MNT_FATAL|MNT_USRQUOTA_GRACEFUL, + .condition_fn = mac_smack_use, + }, #endif - { "tmpfs", "/dev/shm", "tmpfs", "mode=01777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, - MNT_FATAL|MNT_IN_CONTAINER|MNT_USRQUOTA_GRACEFUL }, - { "devpts", "/dev/pts", "devpts", "mode=" STRINGIFY(TTY_MODE) ",gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, - MNT_IN_CONTAINER }, + { + .what = "tmpfs", + .where = "/dev/shm", + .type = "tmpfs", + .options = "mode=01777", + .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME, + .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_USRQUOTA_GRACEFUL, + }, + { + .what = "devpts", + .where = "/dev/pts", + .type = "devpts", + .options = "mode=" STRINGIFY(TTY_MODE) ",gid=" STRINGIFY(TTY_GID), + .flags = MS_NOSUID|MS_NOEXEC, + .mode = MNT_IN_CONTAINER, + }, #if ENABLE_SMACK - { "tmpfs", "/run", "tmpfs", "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, - MNT_FATAL, mac_smack_use }, + { + .what = "tmpfs", + .where = "/run", + .type = "tmpfs", + .options = "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, + .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME, + .mode = MNT_FATAL, + .condition_fn = mac_smack_use, + }, #endif - { "tmpfs", "/run", "tmpfs", "mode=0755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, - MNT_FATAL|MNT_IN_CONTAINER }, - { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate,memory_recursiveprot", MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE, cgroupfs_recursiveprot_supported }, - { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { + .what = "tmpfs", + .where = "/run", + .type = "tmpfs", + .options = "mode=0755" TMPFS_LIMITS_RUN, + .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME, + .mode = MNT_FATAL|MNT_IN_CONTAINER, + }, + { + .what = "cgroup2", + .where = "/sys/fs/cgroup", + .type = "cgroup2", + .options = "nsdelegate,memory_recursiveprot", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE, + .condition_fn = cgroupfs_recursiveprot_supported, + }, + { + .what = "cgroup2", + .where = "/sys/fs/cgroup", + .type = "cgroup2", + .options = "nsdelegate", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE, + }, #if ENABLE_PSTORE - { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_NONE }, + { + .what = "pstore", + .where = "/sys/fs/pstore", + .type = "pstore", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + }, #endif #if ENABLE_EFI - { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_NONE, is_efi_boot }, + { + .what = "efivarfs", + .where = "/sys/firmware/efi/efivars", + .type = "efivarfs", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + .condition_fn = is_efi_boot, + }, #endif - { "bpf", "/sys/fs/bpf", "bpf", "mode=0700", MS_NOSUID|MS_NOEXEC|MS_NODEV, - MNT_NONE }, + { + .what = "bpf", + .where = "/sys/fs/bpf", + .type = "bpf", + .options = "mode=0700", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + }, }; /* The first three entries we might need before SELinux is up. The From 5ce388aec839f97b15f972953de5e7e962c9042c Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Wed, 29 Oct 2025 16:28:40 +0100 Subject: [PATCH 2/3] mount-setup: Add optional function which provides extra mount options --- src/shared/mount-setup.c | 104 +++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 37 deletions(-) diff --git a/src/shared/mount-setup.c b/src/shared/mount-setup.c index 7fb3416c6c..cd2b732b9b 100644 --- a/src/shared/mount-setup.c +++ b/src/shared/mount-setup.c @@ -29,7 +29,6 @@ typedef enum MountMode { MNT_IN_CONTAINER = 1 << 1, MNT_CHECK_WRITABLE = 1 << 2, MNT_FOLLOW_SYMLINK = 1 << 3, - MNT_USRQUOTA_GRACEFUL = 1 << 4, } MountMode; typedef struct MountPoint { @@ -37,38 +36,77 @@ typedef struct MountPoint { const char *where; const char *type; const char *options; + int (*options_fn)(int priority, const char *type, char **ret); unsigned long flags; MountMode mode; bool (*condition_fn)(void); } MountPoint; -static bool cgroupfs_recursiveprot_supported(void) { +static int cgroupfs_mount_options(int priority, const char *type, char **ret) { int r; - /* Added in kernel 5.7 */ + assert(type); + assert(streq(type, "cgroup2")); + assert(ret); - r = mount_option_supported("cgroup2", "memory_recursiveprot", /* value = */ NULL); - if (r < 0) - log_debug_errno(r, "Failed to determine whether cgroupfs supports 'memory_recursiveprot' mount option, assuming not: %m"); - else if (r == 0) - log_debug("'memory_recursiveprot' not supported by cgroupfs, not using mount option."); + _cleanup_free_ char *opts = NULL; + FOREACH_STRING(o, "memory_recursiveprot") { + r = mount_option_supported("cgroup2", o, /* value = */ NULL); + if (r < 0) + log_full_errno(priority, r, "Failed to determine whether cgroupfs supports '%s' mount option, assuming not: %m", o); + else if (r == 0) + log_debug("'%s' not supported by cgroupfs, not using mount option.", o); + else if (!strextend_with_separator(&opts, ",", o)) + return log_oom_full(priority); + } - return r > 0; + *ret = TAKE_PTR(opts); + return 0; } int mount_cgroupfs(const char *path) { + int r; + assert(path); /* Mount a separate cgroupfs instance, taking all options we initial set into account. This is * especially useful when cgroup namespace is *not* employed, since the kernel overrides all * previous options if a new mount is established in initial cgns (c.f. * https://github.com/torvalds/linux/blob/b69bb476dee99d564d65d418e9a20acca6f32c3f/kernel/cgroup/cgroup.c#L1984) - * - * The options shall be kept in sync with those in mount_table below. */ + */ - return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2", - MS_NOSUID|MS_NOEXEC|MS_NODEV, - cgroupfs_recursiveprot_supported() ? "nsdelegate,memory_recursiveprot" : "nsdelegate"); + _cleanup_free_ char *opts = NULL; + r = cgroupfs_mount_options(LOG_WARNING, "cgroup2", &opts); + if (r < 0) + return r; + + /* These options shall be kept in sync with those in mount_table below. */ + if (!strprepend_with_separator(&opts, ",", "nsdelegate")) + return log_oom(); + + return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); +} + +static int usrquota_mount_option(int priority, const char *type, char **ret) { + _cleanup_free_ char *o = NULL; + int r; + + assert(type); + assert(ret); + + r = mount_option_supported(type, "usrquota", /* value= */ NULL); + if (r < 0) + log_full_errno(priority, r, "Unable to determine whether %s supports 'usrquota' mount option, assuming not: %m", type); + else if (r == 0) + log_debug("Not enabling 'usrquota' for '%s' as kernel lacks support for it.", type); + else { + o = strdup("usrquota"); + if (!o) + return log_oom_full(priority); + } + + *ret = TAKE_PTR(o); + return 0; } static const MountPoint mount_table[] = { @@ -115,8 +153,9 @@ static const MountPoint mount_table[] = { .where = "/dev/shm", .type = "tmpfs", .options = "mode=01777,smackfsroot=*", + .options_fn = usrquota_mount_option, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME, - .mode = MNT_FATAL|MNT_USRQUOTA_GRACEFUL, + .mode = MNT_FATAL, .condition_fn = mac_smack_use, }, #endif @@ -125,8 +164,9 @@ static const MountPoint mount_table[] = { .where = "/dev/shm", .type = "tmpfs", .options = "mode=01777", + .options_fn = usrquota_mount_option, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME, - .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_USRQUOTA_GRACEFUL, + .mode = MNT_FATAL|MNT_IN_CONTAINER, }, { .what = "devpts", @@ -155,20 +195,12 @@ static const MountPoint mount_table[] = { .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME, .mode = MNT_FATAL|MNT_IN_CONTAINER, }, - { - .what = "cgroup2", - .where = "/sys/fs/cgroup", - .type = "cgroup2", - .options = "nsdelegate,memory_recursiveprot", - .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, - .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE, - .condition_fn = cgroupfs_recursiveprot_supported, - }, { .what = "cgroup2", .where = "/sys/fs/cgroup", .type = "cgroup2", .options = "nsdelegate", + .options_fn = cgroupfs_mount_options, .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE, }, @@ -278,20 +310,18 @@ static int mount_one(const MountPoint *p, bool relabel) { (void) mkdir_p(p->where, 0755); _cleanup_free_ char *extend_options = NULL; - const char *o = p->options; - if (FLAGS_SET(p->mode, MNT_USRQUOTA_GRACEFUL)) { - r = mount_option_supported(p->type, "usrquota", /* value= */ NULL); + const char *o; + if (p->options_fn) { + r = p->options_fn(priority, p->type, &extend_options); if (r < 0) - log_full_errno(priority, r, "Unable to determine whether %s supports 'usrquota' mount option, assuming not: %m", p->type); - else if (r == 0) - log_debug("Not enabling 'usrquota' on '%s' as kernel lacks support for it.", p->where); - else { - if (!strextend_with_separator(&extend_options, ",", p->options ?: POINTER_MAX, "usrquota")) - return log_oom(); + return r; - o = extend_options; - } - } + if (!strprepend_with_separator(&extend_options, ",", p->options)) + return log_oom(); + + o = extend_options; + } else + o = p->options; r = mount_verbose_full(priority, p->what, p->where, p->type, p->flags, o, FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK)); if (r < 0) From 886c078702380686e036b05c54f182305d89182c Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Wed, 29 Oct 2025 16:30:07 +0100 Subject: [PATCH 3/3] mount-setup: Add memory_hugetlb_accounting to cgroupfs mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This mount option will count HugeTLB memory usage towards the cgroup’s overall memory usage for the memory controller. See https://lore.kernel.org/all/20231006184629.155543-4-nphamcs@gmail.com/T/#u for the patch introducing the new mount option. --- NEWS | 5 +++++ README | 3 ++- src/shared/mount-setup.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index b50a1e123e..19894d6c18 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ systemd System and Service Manager CHANGES WITH 259 in spe: + * The cgroup2 file system is now mounted with the + "memory_hugetlb_accounting" mount option, supported since kernel 6.6. + This means that HugeTLB memory usage is now counted towards the + cgroup’s overall memory usage for the memory controller. + * homectl's --recovery-key= option may now be used with the "update" command to add recovery keys to existing user accounts. Previously, recovery keys could only be configured during initial user creation. diff --git a/README b/README index 2e3acdd907..89a9623cdc 100644 --- a/README +++ b/README @@ -68,7 +68,8 @@ REQUIREMENTS: ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option ≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD, and MOVE_MOUNT_BENEATH - ≥ 6.6 for quota support on tmpfs + ≥ 6.6 for quota support on tmpfs and cgroup2fs memory_hugetlb_accounting + option ≥ 6.9 for pidfs ≥ 6.10 for fcntl(F_DUPFD_QUERY), unprivileged linkat(AT_EMPTY_PATH), and block device 'partscan' sysfs attribute diff --git a/src/shared/mount-setup.c b/src/shared/mount-setup.c index cd2b732b9b..cbd3e1d107 100644 --- a/src/shared/mount-setup.c +++ b/src/shared/mount-setup.c @@ -50,7 +50,7 @@ static int cgroupfs_mount_options(int priority, const char *type, char **ret) { assert(ret); _cleanup_free_ char *opts = NULL; - FOREACH_STRING(o, "memory_recursiveprot") { + FOREACH_STRING(o, "memory_recursiveprot", "memory_hugetlb_accounting") { r = mount_option_supported("cgroup2", o, /* value = */ NULL); if (r < 0) log_full_errno(priority, r, "Failed to determine whether cgroupfs supports '%s' mount option, assuming not: %m", o);