From d15811d7e5bb3fc3afc1c14e4d5bb3b18fddedc1 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 23 Jan 2025 11:40:36 +0100 Subject: [PATCH 1/7] devnum-util: add macros to safely convert dev_t to pointers and back Sometimes it's nice being able to store dev_t as pointer values in hashmaps/tables, instead of having to allocate memory for them and using devt_hash_ops. After all dev_t is weird on Linux/glibc: glibc defines it as 64bit entity (which hence appears as something we cannot encode in a pointer value for compat with 32bit archs) but it actually is 32bit in the kernel apis. Hence we can safely cut off the upper 32bit, and still retain compat with all archs. But let's hide this in new macros, and validate this is all correct via a test. --- src/basic/devnum-util.h | 10 ++++++++-- src/test/test-devnum-util.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/basic/devnum-util.h b/src/basic/devnum-util.h index e109de9913..0efca56780 100644 --- a/src/basic/devnum-util.h +++ b/src/basic/devnum-util.h @@ -9,6 +9,9 @@ int parse_devnum(const char *s, dev_t *ret); +#define DEVNUM_MAJOR_MAX ((UINT32_C(1) << 12) - 1U) +#define DEVNUM_MINOR_MAX ((UINT32_C(1) << 20) - 1U) + /* glibc and the Linux kernel have different ideas about the major/minor size. These calls will check whether the * specified major is valid by the Linux kernel's standards, not by glibc's. Linux has 20bits of minor, and 12 bits of * major space. See MINORBITS in linux/kdev_t.h in the kernel sources. 
(If you wonder why we define _y here, instead of @@ -18,14 +21,14 @@ int parse_devnum(const char *s, dev_t *ret); #define DEVICE_MAJOR_VALID(x) \ ({ \ typeof(x) _x = (x), _y = 0; \ - _x >= _y && _x < (UINT32_C(1) << 12); \ + _x >= _y && _x <= DEVNUM_MAJOR_MAX; \ \ }) #define DEVICE_MINOR_VALID(x) \ ({ \ typeof(x) _x = (x), _y = 0; \ - _x >= _y && _x < (UINT32_C(1) << 20); \ + _x >= _y && _x <= DEVNUM_MINOR_MAX; \ }) int device_path_make_major_minor(mode_t mode, dev_t devnum, char **ret); @@ -54,3 +57,6 @@ static inline char *format_devnum(dev_t d, char buf[static DEVNUM_STR_MAX]) { static inline bool devnum_is_zero(dev_t d) { return major(d) == 0 && minor(d) == 0; } + +#define DEVNUM_TO_PTR(u) ((void*) (uintptr_t) (u)) +#define PTR_TO_DEVNUM(p) ((dev_t) ((uintptr_t) (p))) diff --git a/src/test/test-devnum-util.c b/src/test/test-devnum-util.c index ebef794001..782f15d86f 100644 --- a/src/test/test-devnum-util.c +++ b/src/test/test-devnum-util.c @@ -121,4 +121,21 @@ TEST(devnum_format_str) { test_devnum_format_str_one(makedev(4095, 1048575), "4095:1048575"); } +TEST(devnum_to_ptr) { + dev_t m = makedev(0, 0); + ASSERT_EQ(major(m), 0U); + ASSERT_EQ(minor(m), 0U); + ASSERT_EQ(m, PTR_TO_DEVNUM(DEVNUM_TO_PTR(m))); + + m = makedev(DEVNUM_MAJOR_MAX, DEVNUM_MINOR_MAX); + ASSERT_EQ(major(m), DEVNUM_MAJOR_MAX); + ASSERT_EQ(minor(m), DEVNUM_MINOR_MAX); + ASSERT_EQ(m, PTR_TO_DEVNUM(DEVNUM_TO_PTR(m))); + + m = makedev(5, 8); + ASSERT_EQ(major(m), 5U); + ASSERT_EQ(minor(m), 8U); + ASSERT_EQ(m, PTR_TO_DEVNUM(DEVNUM_TO_PTR(m))); +} + DEFINE_TEST_MAIN(LOG_INFO); From 72b932aac0fbd818208d78ec01256bb946401881 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Jan 2025 11:33:59 +0100 Subject: [PATCH 2/7] user-record: add fields for setting limits on /tmp/ and /dev/shm/ --- docs/USER_RECORD.md | 35 ++++++++++++----- src/shared/user-record-show.c | 24 ++++++++++++ src/shared/user-record.c | 71 +++++++++++++++++++++++++++++++++++ src/shared/user-record.h | 17 +++++++++ 4 
files changed, 138 insertions(+), 9 deletions(-) diff --git a/docs/USER_RECORD.md b/docs/USER_RECORD.md index a8e02b2c5e..ae1173c560 100644 --- a/docs/USER_RECORD.md +++ b/docs/USER_RECORD.md @@ -619,6 +619,19 @@ is allowed to edit. `selfModifiablePrivileged` → Similar to `selfModifiableFields`, but it lists fields in the `privileged` section that the user is allowed to edit. +`tmpLimit` → A numeric value encoding a disk quota limit in bytes enforced on +`/tmp/` on login, in case it is backed by a volatile file system (such as +`tmpfs`). + +`tmpLimitScale` → Similar, but encodes a relative value, normalized to +`UINT32_MAX` as 100%. This value is applied relative to the file system +size. If both `tmpLimit` and `tmpLimitScale` are set, the lower of the two +should be enforced. If neither field is set, the implementation might apply a +default limit. + +`devShmLimit`, `devShmLimitScale` → Similar to the previous two, but apply to +`/dev/shm/` rather than `/tmp/`. + `privileged` → An object, which contains the fields of the `privileged` section of the user record, see below. @@ -761,22 +774,26 @@ These two are the only two fields specific to this section. All other fields that may be used in this section are identical to the equally named ones in the `regular` section (i.e. at the top-level object). 
Specifically, these are: -`blobDirectory`, `blobManifest`, `iconName`, `location`, `shell`, `umask`, `environment`, `timeZone`, -`preferredLanguage`, `additionalLanguages`, `niceLevel`, `resourceLimits`, `locked`, `notBeforeUSec`, -`notAfterUSec`, `storage`, `diskSize`, `diskSizeRelative`, `skeletonDirectory`, -`accessMode`, `tasksMax`, `memoryHigh`, `memoryMax`, `cpuWeight`, `ioWeight`, +`blobDirectory`, `blobManifest`, `iconName`, `location`, `shell`, `umask`, +`environment`, `timeZone`, `preferredLanguage`, `additionalLanguages`, +`niceLevel`, `resourceLimits`, `locked`, `notBeforeUSec`, `notAfterUSec`, +`storage`, `diskSize`, `diskSizeRelative`, `skeletonDirectory`, `accessMode`, +`tasksMax`, `memoryHigh`, `memoryMax`, `cpuWeight`, `ioWeight`, `mountNoDevices`, `mountNoSuid`, `mountNoExecute`, `cifsDomain`, `cifsUserName`, `cifsService`, `cifsExtraMountOptions`, `imagePath`, `uid`, `gid`, `memberOf`, `fileSystemType`, `partitionUuid`, `luksUuid`, `fileSystemUuid`, `luksDiscard`, `luksOfflineDiscard`, `luksCipher`, `luksCipherMode`, `luksVolumeKeySize`, `luksPbkdfHashAlgorithm`, -`luksPbkdfType`, `luksPbkdfForceIterations`, `luksPbkdfTimeCostUSec`, `luksPbkdfMemoryCost`, -`luksPbkdfParallelThreads`, `luksSectorSize`, `autoResizeMode`, `rebalanceWeight`, -`rateLimitIntervalUSec`, `rateLimitBurst`, `enforcePasswordPolicy`, -`autoLogin`, `preferredSessionType`, `preferredSessionLauncher`, `stopDelayUSec`, `killProcesses`, +`luksPbkdfType`, `luksPbkdfForceIterations`, `luksPbkdfTimeCostUSec`, +`luksPbkdfMemoryCost`, `luksPbkdfParallelThreads`, `luksSectorSize`, +`autoResizeMode`, `rebalanceWeight`, `rateLimitIntervalUSec`, `rateLimitBurst`, +`enforcePasswordPolicy`, `autoLogin`, `preferredSessionType`, +`preferredSessionLauncher`, `stopDelayUSec`, `killProcesses`, `passwordChangeMinUSec`, `passwordChangeMaxUSec`, `passwordChangeWarnUSec`, `passwordChangeInactiveUSec`, `passwordChangeNow`, `pkcs11TokenUri`, -`fido2HmacCredential`, `selfModifiableFields`, 
`selfModifiableBlobs`, `selfModifiablePrivileged`. +`fido2HmacCredential`, `selfModifiableFields`, `selfModifiableBlobs`, +`selfModifiablePrivileged`, `tmpLimit`, `tmpLimitScale`, `devShmLimit`, +`devShmLimitScale`. ## Fields in the `binding` section diff --git a/src/shared/user-record-show.c b/src/shared/user-record-show.c index acff25d071..a9c635a478 100644 --- a/src/shared/user-record-show.c +++ b/src/shared/user-record-show.c @@ -7,6 +7,7 @@ #include "hashmap.h" #include "hexdecoct.h" #include "path-util.h" +#include "percent-util.h" #include "pretty-print.h" #include "process-util.h" #include "rlimit-util.h" @@ -54,6 +55,26 @@ static void show_self_modifiable( printf("%13s %s\n", i == value ? heading : "", *i); } +static void show_tmpfs_limit(const char *tmpfs, const TmpfsLimit *limit, uint32_t scale) { + assert(tmpfs); + assert(limit); + + if (!limit->is_set) + return; + + printf(" %s Limit:", tmpfs); + + if (limit->limit != UINT64_MAX) + printf(" %s", FORMAT_BYTES(limit->limit)); + if (limit->limit == UINT64_MAX || limit->limit_scale != UINT32_MAX) { + if (limit->limit != UINT64_MAX) + printf(" or"); + + printf(" %i%%", UINT32_SCALE_TO_PERCENT(scale)); + } + printf("\n"); +} + void user_record_show(UserRecord *hr, bool show_full_group_info) { _cleanup_strv_free_ char **langs = NULL; const char *hd, *ip, *shell; @@ -368,6 +389,9 @@ void user_record_show(UserRecord *hr, bool show_full_group_info) { if (hr->io_weight != UINT64_MAX) printf(" IO Weight: %" PRIu64 "\n", hr->io_weight); + show_tmpfs_limit("TMP", &hr->tmp_limit, user_record_tmp_limit_scale(hr)); + show_tmpfs_limit("SHM", &hr->dev_shm_limit, user_record_dev_shm_limit_scale(hr)); + if (hr->access_mode != MODE_INVALID) printf(" Access Mode: 0%03o\n", user_record_access_mode(hr)); diff --git a/src/shared/user-record.c b/src/shared/user-record.c index 9feac30933..ddfcbf7d60 100644 --- a/src/shared/user-record.c +++ b/src/shared/user-record.c @@ -15,6 +15,7 @@ #include "locale-util.h" #include 
"memory-util.h" #include "path-util.h" +#include "percent-util.h" #include "pkcs11-util.h" #include "rlimit-util.h" #include "sha256.h" @@ -95,6 +96,8 @@ UserRecord* user_record_new(void) { .drop_caches = -1, .auto_resize_mode = _AUTO_RESIZE_MODE_INVALID, .rebalance_weight = REBALANCE_WEIGHT_UNSET, + .tmp_limit = TMPFS_LIMIT_NULL, + .dev_shm_limit = TMPFS_LIMIT_NULL, }; return h; @@ -982,6 +985,40 @@ static int dispatch_rebalance_weight(const char *name, sd_json_variant *variant, return 0; } +static int dispatch_tmpfs_limit(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) { + TmpfsLimit *limit = ASSERT_PTR(userdata); + int r; + + if (sd_json_variant_is_null(variant)) { + *limit = TMPFS_LIMIT_NULL; + return 0; + } + + r = sd_json_dispatch_uint64(name, variant, flags, &limit->limit); + if (r < 0) + return r; + + limit->is_set = true; + return 0; +} + +static int dispatch_tmpfs_limit_scale(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) { + TmpfsLimit *limit = ASSERT_PTR(userdata); + int r; + + if (sd_json_variant_is_null(variant)) { + *limit = TMPFS_LIMIT_NULL; + return 0; + } + + r = sd_json_dispatch_uint32(name, variant, flags, &limit->limit_scale); + if (r < 0) + return r; + + limit->is_set = true; + return 0; +} + static int dispatch_privileged(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) { static const sd_json_dispatch_field privileged_dispatch_table[] = { @@ -1275,6 +1312,10 @@ static int dispatch_per_machine(const char *name, sd_json_variant *variant, sd_j { "selfModifiableFields", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(UserRecord, self_modifiable_fields), SD_JSON_STRICT }, { "selfModifiableBlobs", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(UserRecord, self_modifiable_blobs), SD_JSON_STRICT }, { "selfModifiablePrivileged", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(UserRecord, 
self_modifiable_privileged), SD_JSON_STRICT }, + { "tmpLimit", _SD_JSON_VARIANT_TYPE_INVALID, dispatch_tmpfs_limit, offsetof(UserRecord, tmp_limit), 0, }, + { "tmpLimitScale", _SD_JSON_VARIANT_TYPE_INVALID, dispatch_tmpfs_limit_scale, offsetof(UserRecord, tmp_limit), 0, }, + { "devShmLimit", _SD_JSON_VARIANT_TYPE_INVALID, dispatch_tmpfs_limit, offsetof(UserRecord, dev_shm_limit), 0, }, + { "devShmLimitScale", _SD_JSON_VARIANT_TYPE_INVALID, dispatch_tmpfs_limit_scale, offsetof(UserRecord, dev_shm_limit), 0, }, {}, }; @@ -1625,6 +1666,10 @@ int user_record_load(UserRecord *h, sd_json_variant *v, UserRecordLoadFlags load { "selfModifiableFields", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(UserRecord, self_modifiable_fields), SD_JSON_STRICT }, { "selfModifiableBlobs", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(UserRecord, self_modifiable_blobs), SD_JSON_STRICT }, { "selfModifiablePrivileged", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(UserRecord, self_modifiable_privileged), SD_JSON_STRICT }, + { "tmpLimit", _SD_JSON_VARIANT_TYPE_INVALID, dispatch_tmpfs_limit, offsetof(UserRecord, tmp_limit), 0, }, + { "tmpLimitScale", _SD_JSON_VARIANT_TYPE_INVALID, dispatch_tmpfs_limit_scale, offsetof(UserRecord, tmp_limit), 0, }, + { "devShmLimit", _SD_JSON_VARIANT_TYPE_INVALID, dispatch_tmpfs_limit, offsetof(UserRecord, dev_shm_limit), 0, }, + { "devShmLimitScale", _SD_JSON_VARIANT_TYPE_INVALID, dispatch_tmpfs_limit_scale, offsetof(UserRecord, dev_shm_limit), 0, }, { "secret", SD_JSON_VARIANT_OBJECT, dispatch_secret, 0, 0 }, { "privileged", SD_JSON_VARIANT_OBJECT, dispatch_privileged, 0, 0 }, @@ -2138,6 +2183,32 @@ int user_record_languages(UserRecord *h, char ***ret) { return 0; } +uint32_t user_record_tmp_limit_scale(UserRecord *h) { + assert(h); + + if (h->tmp_limit.is_set) + return h->tmp_limit.limit_scale; + + /* By default grant regular users only 80% quota */ + if (user_record_disposition(h) == USER_REGULAR) + return 
UINT32_SCALE_FROM_PERCENT(80); + + return UINT32_MAX; +} + +uint32_t user_record_dev_shm_limit_scale(UserRecord *h) { + assert(h); + + if (h->dev_shm_limit.is_set) + return h->dev_shm_limit.limit_scale; + + /* By default grant regular users only 80% quota */ + if (user_record_disposition(h) == USER_REGULAR) + return UINT32_SCALE_FROM_PERCENT(80); + + return UINT32_MAX; +} + const char** user_record_self_modifiable_fields(UserRecord *h) { /* As a rule of thumb: a setting is safe if it cannot be used by a * user to give themselves some unfair advantage over other users on diff --git a/src/shared/user-record.h b/src/shared/user-record.h index d80a46130a..fc8510c074 100644 --- a/src/shared/user-record.h +++ b/src/shared/user-record.h @@ -230,6 +230,19 @@ typedef enum AutoResizeMode { #define REBALANCE_WEIGHT_MAX UINT64_C(10000) #define REBALANCE_WEIGHT_UNSET UINT64_MAX +typedef struct TmpfsLimit { + /* Absolute and relative tmpfs limits */ + uint64_t limit; + uint32_t limit_scale; + bool is_set; +} TmpfsLimit; + +#define TMPFS_LIMIT_NULL \ + (TmpfsLimit) { \ + .limit = UINT64_MAX, \ + .limit_scale = UINT32_MAX, \ + } \ + typedef struct UserRecord { /* The following three fields are not part of the JSON record */ unsigned n_ref; @@ -389,6 +402,8 @@ typedef struct UserRecord { char **self_modifiable_blobs; char **self_modifiable_privileged; + TmpfsLimit tmp_limit, dev_shm_limit; + sd_json_variant *json; } UserRecord; @@ -436,6 +451,8 @@ uint64_t user_record_rebalance_weight(UserRecord *h); uint64_t user_record_capability_bounding_set(UserRecord *h); uint64_t user_record_capability_ambient_set(UserRecord *h); int user_record_languages(UserRecord *h, char ***ret); +uint32_t user_record_tmp_limit_scale(UserRecord *h); +uint32_t user_record_dev_shm_limit_scale(UserRecord *h); const char **user_record_self_modifiable_fields(UserRecord *h); const char **user_record_self_modifiable_blobs(UserRecord *h); From 9ef12bc1d73aa1c2d65ff006550180624e688a5c Mon Sep 17 00:00:00 2001 
From: Lennart Poettering Date: Thu, 23 Jan 2025 22:30:41 +0100 Subject: [PATCH 3/7] user-runtime-dir: some smaller modernizations/refactorings --- src/login/user-runtime-dir.c | 44 +++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/login/user-runtime-dir.c b/src/login/user-runtime-dir.c index b242f83429..f39c1ad225 100644 --- a/src/login/user-runtime-dir.c +++ b/src/login/user-runtime-dir.c @@ -92,39 +92,38 @@ static int user_mkdir_runtime_path( uid, gid, runtime_dir_size, runtime_dir_inodes, mac_smack_use() ? ",smackfsroot=*" : ""); + _cleanup_free_ char *d = strdup(runtime_path); + if (!d) + return log_oom(); + r = mkdir_label(runtime_path, 0700); if (r < 0 && r != -EEXIST) return log_error_errno(r, "Failed to create %s: %m", runtime_path); + _cleanup_(rmdir_and_freep) char *destroy = TAKE_PTR(d); /* auto-destroy */ + r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", runtime_path, "tmpfs", MS_NODEV|MS_NOSUID, options); if (r < 0) { - if (!ERRNO_IS_PRIVILEGE(r)) { - log_error_errno(r, "Failed to mount per-user tmpfs directory %s: %m", runtime_path); - goto fail; - } + if (!ERRNO_IS_PRIVILEGE(r)) + return log_error_errno(r, "Failed to mount per-user tmpfs directory %s: %m", runtime_path); log_debug_errno(r, "Failed to mount per-user tmpfs directory %s.\n" "Assuming containerized execution, ignoring: %m", runtime_path); r = chmod_and_chown(runtime_path, 0700, uid, gid); - if (r < 0) { - log_error_errno(r, "Failed to change ownership and mode of \"%s\": %m", runtime_path); - goto fail; - } + if (r < 0) + return log_error_errno(r, "Failed to change ownership and mode of \"%s\": %m", runtime_path); } + destroy = mfree(destroy); /* deactivate auto-destroy */ + r = label_fix(runtime_path, 0); if (r < 0) log_warning_errno(r, "Failed to fix label of \"%s\", ignoring: %m", runtime_path); } return 0; - -fail: - /* Try to clean up, but ignore errors */ - (void) rmdir(runtime_path); - return r; } static int 
user_remove_runtime_path(const char *runtime_path) { @@ -139,9 +138,9 @@ static int user_remove_runtime_path(const char *runtime_path) { /* Ignore cases where the directory isn't mounted, as that's quite possible, if we lacked the permissions to * mount something */ - r = umount2(runtime_path, MNT_DETACH); - if (r < 0 && !IN_SET(errno, EINVAL, ENOENT)) - log_debug_errno(errno, "Failed to unmount user runtime directory %s, ignoring: %m", runtime_path); + r = RET_NERRNO(umount2(runtime_path, MNT_DETACH)); + if (r < 0 && !IN_SET(r, -EINVAL, -ENOENT)) + log_debug_errno(r, "Failed to unmount user runtime directory %s, ignoring: %m", runtime_path); r = rm_rf(runtime_path, REMOVE_ROOT); if (r < 0 && r != -ENOENT) @@ -206,7 +205,10 @@ static int run(int argc, char *argv[]) { if (argc != 3) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes two arguments."); - if (!STR_IN_SET(argv[1], "start", "stop")) + + const char *verb = argv[1], *user = argv[2]; + + if (!STR_IN_SET(verb, "start", "stop")) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "First argument must be either \"start\" or \"stop\"."); @@ -216,10 +218,10 @@ static int run(int argc, char *argv[]) { if (r < 0) return r; - if (streq(argv[1], "start")) - return do_mount(argv[2]); - if (streq(argv[1], "stop")) - return do_umount(argv[2]); + if (streq(verb, "start")) + return do_mount(user); + if (streq(verb, "stop")) + return do_umount(user); assert_not_reached(); } From b1c95fb2e9d11fc190017dec3d64f468f9d378bc Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Jan 2025 11:34:18 +0100 Subject: [PATCH 4/7] user-runtime-dir: enforce /tmp/ and /dev/shm/ quota Enforce the quota on these two tmpfs at the same place where we mount the per-user $XDG_RUNTIME_DIR. Conceptually these are very similar concepts, and it makes sense to enforce the limits at the same place with the same lifecycle. 
--- README | 2 + man/user@.service.xml | 13 +-- src/login/user-runtime-dir.c | 191 ++++++++++++++++++++++++++++++----- 3 files changed, 173 insertions(+), 33 deletions(-) diff --git a/README b/README index 1e55da23f6..975f5e5a5e 100644 --- a/README +++ b/README @@ -61,9 +61,11 @@ REQUIREMENTS: ≥ 5.9 for close_range() ≥ 5.12 for idmapped mount ≥ 5.14 for cgroup.kill + ≥ 5.14 for quotactl_fd() ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option ≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD, and MOVE_MOUNT_BENEATH + ≥ 6.6 for quota support on tmpfs ≥ 6.9 for pidfs ✅ systemd utilizes several new kernel APIs, but will fall back gracefully diff --git a/man/user@.service.xml b/man/user@.service.xml index cc078d2d3c..a046a759d5 100644 --- a/man/user@.service.xml +++ b/man/user@.service.xml @@ -42,12 +42,13 @@ systemd.special7 for a list of units that form the basis of the unit hierarchies of system and user units. - user@UID.service is accompanied by the - system unit user-runtime-dir@UID.service, which - creates the user's runtime directory - /run/user/UID, and then removes it when this - unit is stopped. user-runtime-dir@UID.service - executes the systemd-user-runtime-dir binary to do the actual work. + user@UID.service is accompanied by the system unit + user-runtime-dir@UID.service, which creates the user's + runtime directory /run/user/UID when started, and removes + it when it is stopped. It also might apply runtime quota settings on /tmp/ and/or + /dev/shm/ for the + user. user-runtime-dir@UID.service executes the + systemd-user-runtime-dir binary to do the actual work. User processes may be started by the user@.service instance, in which case they will be part of that unit in the system hierarchy. 
They may also be started elsewhere, diff --git a/src/login/user-runtime-dir.c b/src/login/user-runtime-dir.c index f39c1ad225..6c2fef95db 100644 --- a/src/login/user-runtime-dir.c +++ b/src/login/user-runtime-dir.c @@ -8,15 +8,20 @@ #include "bus-error.h" #include "bus-locator.h" #include "dev-setup.h" +#include "devnum-util.h" +#include "fd-util.h" #include "format-util.h" #include "fs-util.h" #include "label-util.h" #include "limits-util.h" #include "main-func.h" +#include "missing_magic.h" +#include "missing_syscall.h" #include "mkdir-label.h" #include "mount-util.h" #include "mountpoint-util.h" #include "path-util.h" +#include "quota-util.h" #include "rm-rf.h" #include "selinux-util.h" #include "smack-util.h" @@ -24,6 +29,7 @@ #include "string-util.h" #include "strv.h" #include "user-util.h" +#include "userdb.h" static int acquire_runtime_dir_properties(uint64_t *ret_size, uint64_t *ret_inodes) { _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; @@ -126,6 +132,26 @@ static int user_mkdir_runtime_path( return 0; } +static int do_mount(UserRecord *ur) { + int r; + + assert(ur); + + if (!uid_is_valid(ur->uid) || !gid_is_valid(ur->gid)) + return log_error_errno(SYNTHETIC_ERRNO(ENOMSG), "User '%s' lacks UID or GID, refusing.", ur->user_name); + + uint64_t runtime_dir_size, runtime_dir_inodes; + r = acquire_runtime_dir_properties(&runtime_dir_size, &runtime_dir_inodes); + if (r < 0) + return r; + + char runtime_path[STRLEN("/run/user/") + DECIMAL_STR_MAX(uid_t)]; + xsprintf(runtime_path, "/run/user/" UID_FMT, ur->uid); + + log_debug("Will mount %s owned by "UID_FMT":"GID_FMT, runtime_path, ur->uid, ur->gid); + return user_mkdir_runtime_path(runtime_path, ur->uid, ur->gid, runtime_dir_size, runtime_dir_inodes); +} + static int user_remove_runtime_path(const char *runtime_path) { int r; @@ -149,31 +175,6 @@ static int user_remove_runtime_path(const char *runtime_path) { return 0; } -static int do_mount(const char *user) { - char 
runtime_path[STRLEN("/run/user/") + DECIMAL_STR_MAX(uid_t)]; - uint64_t runtime_dir_size, runtime_dir_inodes; - uid_t uid; - gid_t gid; - int r; - - r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); - if (r < 0) - return log_error_errno(r, - r == -ESRCH ? "No such user \"%s\"" : - r == -ENOMSG ? "UID \"%s\" is invalid or has an invalid main group" - : "Failed to look up user \"%s\": %m", - user); - - r = acquire_runtime_dir_properties(&runtime_dir_size, &runtime_dir_inodes); - if (r < 0) - return r; - - xsprintf(runtime_path, "/run/user/" UID_FMT, uid); - - log_debug("Will mount %s owned by "UID_FMT":"GID_FMT, runtime_path, uid, gid); - return user_mkdir_runtime_path(runtime_path, uid, gid, runtime_dir_size, runtime_dir_inodes); -} - static int do_umount(const char *user) { char runtime_path[STRLEN("/run/user/") + DECIMAL_STR_MAX(uid_t)]; uid_t uid; @@ -197,6 +198,126 @@ static int do_umount(const char *user) { return user_remove_runtime_path(runtime_path); } +static int apply_tmpfs_quota( + char **paths, + uid_t uid, + uint64_t limit, + uint32_t scale) { + + _cleanup_set_free_ Set *processed = NULL; + int r; + + assert(uid_is_valid(uid)); + + STRV_FOREACH(p, paths) { + _cleanup_close_ int fd = open(*p, O_DIRECTORY|O_CLOEXEC); + if (fd < 0) { + log_warning_errno(errno, "Failed to open '%s' in order to set quota, ignoring: %m", *p); + continue; + } + + struct stat st; + if (fstat(fd, &st) < 0) { + log_warning_errno(errno, "Failed to stat '%s' in order to set quota, ignoring: %m", *p); + continue; + } + + /* Cover for bind mounted or symlinked /var/tmp/ + /tmp/ */ + if (set_contains(processed, DEVNUM_TO_PTR(st.st_dev))) { + log_debug("Not setting quota on '%s', since already processed.", *p); + continue; + } + + /* Remember we already dealt with this fs, even if the subsequent operation fails, since + * there's no point in applying quota twice, regardless if it succeeds or not. 
*/ if (set_ensure_put(&processed, /* hash_ops= */ NULL, DEVNUM_TO_PTR(st.st_dev)) < 0) + return log_oom(); + + struct statfs sfs; + if (fstatfs(fd, &sfs) < 0) { + log_warning_errno(errno, "Failed to statfs '%s' in order to set quota, ignoring: %m", *p); + continue; + } + + if (!is_fs_type(&sfs, TMPFS_MAGIC)) { + log_debug("Not setting quota on '%s', since not tmpfs.", *p); + continue; + } + + struct dqblk req; + r = RET_NERRNO(quotactl_fd(fd, QCMD_FIXED(Q_GETQUOTA, USRQUOTA), uid, &req)); + if (r == -ESRCH) + zero(req); + else if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) { + log_debug_errno(r, "No UID quota support on %s, not setting quota: %m", *p); + continue; + } else if (ERRNO_IS_NEG_PRIVILEGE(r)) { + log_debug_errno(r, "Lacking privileges to query UID quota on %s, not setting quota: %m", *p); + continue; + } else if (r < 0) { + log_warning_errno(r, "Failed to query disk quota on %s for UID " UID_FMT ", ignoring: %m", *p, uid); + continue; + } + + uint64_t v = + (scale == 0) ? 0 : + (scale == UINT32_MAX) ? 
UINT64_MAX : + (uint64_t) ((double) (sfs.f_blocks * sfs.f_frsize) / UINT32_MAX * scale); + + v = MIN(v, limit); + v /= QIF_DQBLKSIZE; + + if (FLAGS_SET(req.dqb_valid, QIF_BLIMITS) && v == req.dqb_bhardlimit) { + /* Shortcut things if everything is set up properly already */ + log_debug("Configured quota on '%s' already matches the intended setting, not updating quota.", *p); + continue; + } + + req.dqb_valid = QIF_BLIMITS; + req.dqb_bsoftlimit = req.dqb_bhardlimit = v; + + r = RET_NERRNO(quotactl_fd(fd, QCMD_FIXED(Q_SETQUOTA, USRQUOTA), uid, &req)); + if (r == -ESRCH) { + log_debug_errno(r, "Not setting UID quota on %s since UID quota is not supported: %m", *p); + continue; + } else if (ERRNO_IS_NEG_PRIVILEGE(r)) { + log_debug_errno(r, "Lacking privileges to set UID quota on %s, skipping: %m", *p); + continue; + } else if (r < 0) { + log_warning_errno(r, "Failed to set disk quota on %s for UID " UID_FMT ", ignoring: %m", *p, uid); + continue; + } + + log_info("Successfully configured disk quota for UID " UID_FMT " on %s to %s", uid, *p, FORMAT_BYTES(v * QIF_DQBLKSIZE)); + } + + return 0; +} + +static int do_tmpfs_quota(UserRecord *ur) { + int r; + + assert(ur); + + if (user_record_is_root(ur)) { + log_debug("Not applying tmpfs quota to root user."); + return 0; + } + + if (!uid_is_valid(ur->uid)) + return log_error_errno(SYNTHETIC_ERRNO(ENOMSG), "User '%s' lacks UID, refusing.", ur->user_name); + + r = apply_tmpfs_quota(STRV_MAKE("/tmp", "/var/tmp"), ur->uid, ur->tmp_limit.limit, user_record_tmp_limit_scale(ur)); + if (r < 0) + return r; + + r = apply_tmpfs_quota(STRV_MAKE("/dev/shm"), ur->uid, ur->dev_shm_limit.limit, user_record_dev_shm_limit_scale(ur)); + if (r < 0) + return r; + + return 0; +} + static int run(int argc, char *argv[]) { int r; @@ -218,10 +339,26 @@ static int run(int argc, char *argv[]) { if (r < 0) return r; - if (streq(verb, "start")) - return do_mount(user); + if (streq(verb, "start")) { + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; 
+ r = userdb_by_name(user, USERDB_PARSE_NUMERIC|USERDB_SUPPRESS_SHADOW, &ur); + if (r == -ESRCH) + return log_error_errno(r, "User '%s' does not exist: %m", user); + if (r < 0) + return log_error_errno(r, "Failed to resolve user '%s': %m", user); + + /* We do two things here: mount the per-user XDG_RUNTIME_DIR, and set up tmpfs quota on /tmp/ + * and /dev/shm/. */ + + r = 0; + RET_GATHER(r, do_mount(ur)); + RET_GATHER(r, do_tmpfs_quota(ur)); + return r; + } + if (streq(verb, "stop")) return do_umount(user); + assert_not_reached(); } From 2b2aebf4dd695d52cf8b6cbd38ba57affdc5a6bb Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Jan 2025 15:31:44 +0100 Subject: [PATCH 5/7] homectl: add support for configuring tmpfs limits --- man/homectl.xml | 16 +++++++++++++ src/home/homectl.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/man/homectl.xml b/man/homectl.xml index 6a2be70030..6dc830233c 100644 --- a/man/homectl.xml +++ b/man/homectl.xml @@ -758,6 +758,22 @@ + + + + + + + Controls the per-user quota on /tmp/ and + /dev/shm/ that is applied when the user logs in. Takes either an absolute value + in bytes (with the usual K, M, G, T suffixes to the base of 1024), or a percentage. In the latter + case the limit is applied relative to the size of the respective file system. This limit is only + applied if the relevant file system is tmpfs and has no effect otherwise. Note + that if these options are not used, a default quota might still be enforced (typically 80%.) 
+ + + + diff --git a/src/home/homectl.c b/src/home/homectl.c index b8b6aa3a86..29786760e2 100644 --- a/src/home/homectl.c +++ b/src/home/homectl.c @@ -2830,6 +2830,9 @@ static int help(int argc, char *argv[], void *userdata) { " --memory-max=BYTES Set maximum memory limit\n" " --cpu-weight=WEIGHT Set CPU weight\n" " --io-weight=WEIGHT Set IO weight\n" + " --tmp-limit=BYTES|PERCENT Set limit on /tmp/\n" + " --dev-shm-limit=BYTES|PERCENT\n" + " Set limit on /dev/shm/\n" "\n%4$sStorage User Record Properties:%5$s\n" " --storage=STORAGE Storage type to use (luks, fscrypt, directory,\n" " subvolume, cifs)\n" @@ -2978,6 +2981,8 @@ static int parse_argv(int argc, char *argv[]) { ARG_PROMPT_NEW_USER, ARG_AVATAR, ARG_LOGIN_BACKGROUND, + ARG_TMP_LIMIT, + ARG_DEV_SHM_LIMIT, }; static const struct option options[] = { @@ -3078,6 +3083,8 @@ static int parse_argv(int argc, char *argv[]) { { "blob", required_argument, NULL, 'b' }, { "avatar", required_argument, NULL, ARG_AVATAR }, { "login-background", required_argument, NULL, ARG_LOGIN_BACKGROUND }, + { "tmp-limit", required_argument, NULL, ARG_TMP_LIMIT }, + { "dev-shm-limit", required_argument, NULL, ARG_DEV_SHM_LIMIT }, {} }; @@ -4511,6 +4518,56 @@ static int parse_argv(int argc, char *argv[]) { break; } + case ARG_TMP_LIMIT: + case ARG_DEV_SHM_LIMIT: { + const char *field = + c == ARG_TMP_LIMIT ? "tmpLimit" : + c == ARG_DEV_SHM_LIMIT ? "devShmLimit" : NULL; + const char *field_scale = + c == ARG_TMP_LIMIT ? "tmpLimitScale" : + c == ARG_DEV_SHM_LIMIT ? 
"devShmLimitScale" : NULL; + + assert(field); + assert(field_scale); + + if (isempty(optarg)) { + r = drop_from_identity(field); + if (r < 0) + return r; + r = drop_from_identity(field_scale); + if (r < 0) + return r; + break; + } + + r = parse_permyriad(optarg); + if (r < 0) { + uint64_t u; + + r = parse_size(optarg, 1024, &u); + if (r < 0) + return log_error_errno(r, "Failed to parse %s/%s parameter: %s", field, field_scale, optarg); + + r = sd_json_variant_set_field_unsigned(&arg_identity_extra, field, u); + if (r < 0) + return log_error_errno(r, "Failed to set %s field: %m", field); + + r = drop_from_identity(field_scale); + if (r < 0) + return r; + } else { + r = sd_json_variant_set_field_unsigned(&arg_identity_extra, field_scale, UINT32_SCALE_FROM_PERMYRIAD(r)); + if (r < 0) + return log_error_errno(r, "Failed to set %s field: %m", field_scale); + + r = drop_from_identity(field); + if (r < 0) + return r; + } + + break; + } + case '?': return -EINVAL; From d58d449fc6b828429f2e4d06779a45318eb17f27 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 15 Jan 2025 00:25:22 +0100 Subject: [PATCH 6/7] test: add test case for tmpfs quota logic + PAMName= ask-password logic --- test/units/TEST-46-HOMED.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/units/TEST-46-HOMED.sh b/test/units/TEST-46-HOMED.sh index 8de170a1c9..3663e53908 100755 --- a/test/units/TEST-46-HOMED.sh +++ b/test/units/TEST-46-HOMED.sh @@ -652,6 +652,22 @@ getent passwd aliastest@myrealm getent passwd aliastest2@myrealm getent passwd aliastest3@myrealm +if findmnt -n -o options /tmp | grep -q usrquota ; then + + NEWPASSWORD=quux homectl create tmpfsquota --storage=subvolume --dev-shm-limit=50K -P + + run0 --property=SetCredential=pam.authtok.systemd-run0:quux -u tmpfsquota dd if=/dev/urandom of=/dev/shm/quotatestfile1 bs=1024 count=30 + (! 
run0 --property=SetCredential=pam.authtok.systemd-run0:quux -u tmpfsquota dd if=/dev/urandom of=/dev/shm/quotatestfile2 bs=1024 count=30) + run0 --property=SetCredential=pam.authtok.systemd-run0:quux -u tmpfsquota rm /dev/shm/quotatestfile1 /dev/shm/quotatestfile2 + run0 --property=SetCredential=pam.authtok.systemd-run0:quux -u tmpfsquota dd if=/dev/urandom of=/dev/shm/quotatestfile1 bs=1024 count=30 + run0 --property=SetCredential=pam.authtok.systemd-run0:quux -u tmpfsquota rm /dev/shm/quotatestfile1 + + systemctl stop user@"$(id -u tmpfsquota)".service + + wait_for_state tmpfsquota inactive + homectl remove tmpfsquota +fi + systemd-analyze log-level info touch /testok From 0054b7dce986b73e6cb10bf4ae51a1dd5ef57191 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 15 Jan 2025 09:44:52 +0100 Subject: [PATCH 7/7] update TODO --- TODO | 8 -------- 1 file changed, 8 deletions(-) diff --git a/TODO b/TODO index 8ea545ed46..347fe09fdc 100644 --- a/TODO +++ b/TODO @@ -294,10 +294,6 @@ Features: * pcrlock: add support for multi-profile UKIs -* logind: when logging in use new tmpfs quota support to configure quota on - /tmp/ + /dev/shm/. But do so only in case of tmpfs, because otherwise quota - is persistent and any persistent settings mean we don#t have to reapply them. - * initrd: when transitioning from initrd to host, validate that /lib/modules/`uname -r` exists, refuse otherwise @@ -1470,8 +1466,6 @@ Features: * rework recursive read-only remount to use new mount API -* PAM: pick up authentication token from credentials - * when mounting disk images: if IMAGE_ID/IMAGE_VERSION is set in os-release data in the image, make sure the image filename actually matches this, so that images cannot be misused. @@ -1538,7 +1532,6 @@ Features: - pass creds via keyring? - pass creds via memfd? - acquire + decrypt creds from pkcs11? 
- - make PAMName= acquire pw via creds logic - make macsec code in networkd read key via creds logic (copy logic from wireguard) - make gatewayd/remote read key via creds logic @@ -2404,7 +2397,6 @@ Features: - maybe make automatic, read-only, time-based reflink-copies of LUKS disk images (and btrfs snapshots of subvolumes) (think: time machine) - distinguish destroy / remove (i.e. currently we can unregister a user, unregister+remove their home directory, but not just remove their home directory) - - in systemd's PAMName= logic: query passwords with ssh-askpassword, so that we can make "loginctl set-linger" mode work - fingerprint authentication, pattern authentication, … - make sure "classic" user records can also be managed by homed - make size of $XDG_RUNTIME_DIR configurable in user record