From e7b4b2968e0e3752cdde290c57fe208eab9f6265 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 19 Jun 2025 14:14:31 +0200 Subject: [PATCH 1/3] repart: make file system sector size configurable Let's make the fs sector size configurable. This also adds infrastructure so that we can pick different sector sizes as defaults eventually, but doesn't actually do that. (Background: I think we should probably default to native sector size for the ESP, but Daan disagrees, so I'll leave this out for now). --- man/repart.d.xml | 19 +++++++ src/repart/repart.c | 131 +++++++++++++++++++++++++++++++++----------- 2 files changed, 119 insertions(+), 31 deletions(-) diff --git a/man/repart.d.xml b/man/repart.d.xml index a53057ffa1..eb628ffc16 100644 --- a/man/repart.d.xml +++ b/man/repart.d.xml @@ -1007,6 +1007,25 @@ + + FileSystemSectorSize= + + Controls the sector size for any file system, LUKS volume or Verity volume formatted + on this partition. Expects a power of 2 as value, and must be equal to or larger than 512. Typically + it's recommended to set the file system sector size to 4096, even on 512 sector disks (and in + particular for images that are only ever intended to be stored as files on disks), in order to + optimize performance. However, for compatibility with foreign operating systems or firmware it might + be advisable to use the native sector size of the backing disk. + + If unspecified and operating on a block device, defaults to the native sector size of the + device. If unspecified and operating on a disk image file, defaults to 4096. + + Regardless of what is configured here, or which default is picked, the file system sector size + is always increased to be equal to or larger than the disk sector size. 
+ + + + diff --git a/src/repart/repart.c b/src/repart/repart.c index de138c15d3..0dc9b7eb0e 100644 --- a/src/repart/repart.c +++ b/src/repart/repart.c @@ -409,6 +409,7 @@ typedef struct Partition { uint64_t verity_hash_block_size; char *compression; char *compression_level; + uint64_t fs_sector_size; int add_validatefs; CopyFiles *copy_files; @@ -461,7 +462,7 @@ typedef struct Context { uint64_t start, end, total; struct fdisk_context *fdisk_context; - uint64_t sector_size, grain_size, fs_sector_size; + uint64_t sector_size, grain_size, default_fs_sector_size; sd_id128_t seed; @@ -609,6 +610,7 @@ static Partition *partition_new(void) { .add_validatefs = -1, .last_percent = UINT_MAX, .progress_ratelimit = { 100 * USEC_PER_MSEC, 1 }, + .fs_sector_size = UINT64_MAX, }; return p; @@ -724,6 +726,7 @@ static void partition_foreignize(Partition *p) { p->growfs = -1; p->verity = VERITY_OFF; p->add_validatefs = false; + p->fs_sector_size = UINT64_MAX; partition_mountpoint_free_many(p->mountpoints, p->n_mountpoints); p->mountpoints = NULL; @@ -909,6 +912,23 @@ static bool context_drop_or_foreignize_one_priority(Context *context) { return true; } +static uint64_t partition_fs_sector_size(const Context *c, const Partition *p) { + assert(c); + assert(p); + + uint64_t ss; + + if (p->fs_sector_size != UINT64_MAX) + /* Prefer explicitly configured value */ + ss = p->fs_sector_size; + else + /* Otherwise follow the default sector size */ + ss = c->default_fs_sector_size; + + /* never allow the fs sector size to be picked smaller than the physical sector size */ + return MAX(ss, c->sector_size); +} + static uint64_t partition_min_size(const Context *context, const Partition *p) { uint64_t sz, override_min; @@ -1731,6 +1751,45 @@ static int config_parse_block_size( return 0; } +static int config_parse_fs_sector_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + 
void *data, + void *userdata) { + + uint64_t *fssecsz = ASSERT_PTR(data), parsed; + int r; + + assert(rvalue); + + if (isempty(rvalue)) { + *fssecsz = UINT64_MAX; + return 0; + } + + r = parse_size(rvalue, 1024, &parsed); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to parse size value: %s", rvalue); + + if (!ISPOWEROF2(parsed)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), + "Value not a power of 2: %s", rvalue); + + /* NB: we make no upper restriction here, since the maximum logical sector sizes file systems support + * vary greatly, and can be much larger than 4K. (That's also the reason we don't use + * parse_sector_size() here.) */ + + *fssecsz = parsed; + return 0; +} + static int config_parse_fstype( const char *unit, const char *filename, @@ -2483,6 +2542,7 @@ static int partition_read_definition(Partition *p, const char *path, const char { "Partition", "CompressionLevel", config_parse_string, CONFIG_PARSE_STRING_SAFE_AND_ASCII, &p->compression_level }, { "Partition", "SupplementFor", config_parse_string, 0, &p->supplement_for_name }, { "Partition", "AddValidateFS", config_parse_tristate, 0, &p->add_validatefs }, + { "Partition", "FileSystemSectorSize", config_parse_fs_sector_size, 0, &p->fs_sector_size }, {} }; _cleanup_free_ char *filename = NULL; @@ -3256,7 +3316,7 @@ static int context_load_partition_table(Context *context) { if (S_ISREG(st.st_mode) && st.st_size == 0) { /* Use the fallback values if we have no better idea */ context->sector_size = fdisk_get_sector_size(c); - context->fs_sector_size = fs_secsz; + context->default_fs_sector_size = fs_secsz; context->grain_size = 4096; return /* from_scratch = */ true; } @@ -3290,7 +3350,7 @@ static int context_load_partition_table(Context *context) { * larger */ grainsz = secsz < 4096 ? 4096 : secsz; - log_debug("Sector size of device is %lu bytes. 
Using filesystem sector size of %" PRIu64 " and grain size of %" PRIu64 ".", secsz, fs_secsz, grainsz); + log_debug("Sector size of device is %lu bytes. Using default filesystem sector size of %" PRIu64 " and grain size of %" PRIu64 ".", secsz, fs_secsz, grainsz); switch (arg_empty) { @@ -3539,7 +3599,7 @@ add_initial_free_area: context->end = last_lba; context->total = nsectors; context->sector_size = secsz; - context->fs_sector_size = fs_secsz; + context->default_fs_sector_size = fs_secsz; context->grain_size = grainsz; context->fdisk_context = TAKE_PTR(c); @@ -4597,7 +4657,7 @@ static int partition_encrypt(Context *context, Partition *p, PartitionTarget *ta const char *node = partition_target_path(target); struct crypt_params_luks2 luks_params = { .label = strempty(ASSERT_PTR(p)->new_label), - .sector_size = ASSERT_PTR(context)->fs_sector_size, + .sector_size = partition_fs_sector_size(context, p), .data_device = offline ? node : NULL, }; struct crypt_params_reencrypt reencrypt_params = { @@ -6347,10 +6407,17 @@ static int context_mkfs(Context *context) { if (r < 0) return r; - r = make_filesystem(partition_target_path(t), p->format, strempty(p->new_label), root, - p->fs_uuid, partition_mkfs_flags(p), - context->fs_sector_size, p->compression, p->compression_level, - extra_mkfs_options); + r = make_filesystem( + partition_target_path(t), + p->format, + strempty(p->new_label), + root, + p->fs_uuid, + partition_mkfs_flags(p), + partition_fs_sector_size(context, p), + p->compression, + p->compression_level, + extra_mkfs_options); if (r < 0) return r; @@ -7925,10 +7992,10 @@ static int context_update_verity_size(Context *context) { assert_se(dp = p->siblings[VERITY_DATA]); if (p->verity_data_block_size == UINT64_MAX) - p->verity_data_block_size = context->fs_sector_size; + p->verity_data_block_size = partition_fs_sector_size(context, p); if (p->verity_hash_block_size == UINT64_MAX) - p->verity_hash_block_size = context->fs_sector_size; + p->verity_hash_block_size = 
partition_fs_sector_size(context, p); uint64_t sz; if (dp->size_max != UINT64_MAX) { @@ -8061,16 +8128,17 @@ static int context_minimize(Context *context) { if (r < 0) return r; - r = make_filesystem(d ? d->node : temp, - p->format, - strempty(p->new_label), - root, - fs_uuid, - partition_mkfs_flags(p), - context->fs_sector_size, - p->compression, - p->compression_level, - extra_mkfs_options); + r = make_filesystem( + d ? d->node : temp, + p->format, + strempty(p->new_label), + root, + fs_uuid, + partition_mkfs_flags(p), + partition_fs_sector_size(context, p), + p->compression, + p->compression_level, + extra_mkfs_options); if (r < 0) return r; @@ -8152,16 +8220,17 @@ static int context_minimize(Context *context) { return log_error_errno(r, "Failed to make loopback device of %s: %m", temp); } - r = make_filesystem(d ? d->node : temp, - p->format, - strempty(p->new_label), - root, - p->fs_uuid, - partition_mkfs_flags(p), - context->fs_sector_size, - p->compression, - p->compression_level, - extra_mkfs_options); + r = make_filesystem( + d ? 
d->node : temp, + p->format, + strempty(p->new_label), + root, + p->fs_uuid, + partition_mkfs_flags(p), + partition_fs_sector_size(context, p), + p->compression, + p->compression_level, + extra_mkfs_options); if (r < 0) return r; From 3982577119e678cc57a4bb1cfee7849d4c8391de Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 19 Jun 2025 14:42:55 +0200 Subject: [PATCH 2/3] repart: trivial modernizations --- src/repart/repart.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/repart/repart.c b/src/repart/repart.c index 0dc9b7eb0e..937d4dcd89 100644 --- a/src/repart/repart.c +++ b/src/repart/repart.c @@ -3239,7 +3239,6 @@ static int context_load_partition_table(Context *context) { bool from_scratch = false; sd_id128_t disk_uuid; size_t n_partitions; - unsigned long secsz; uint64_t grainsz, fs_secsz = DEFAULT_FILESYSTEM_SECTOR_SIZE; int r; @@ -3340,7 +3339,7 @@ static int context_load_partition_table(Context *context) { * it for all our needs. Note that the values we use ourselves always are in bytes though, thus mean * the same thing universally. Also note that regardless what kind of sector size is in use we'll * place partitions at multiples of 4K. */ - secsz = fdisk_get_sector_size(c); + unsigned long secsz = fdisk_get_sector_size(c); /* Insist on a power of two, and that it's a multiple of 512, i.e. the traditional sector size. */ if (secsz < 512 || !ISPOWEROF2(secsz)) @@ -3348,7 +3347,7 @@ static int context_load_partition_table(Context *context) { /* Use at least 4K, and ensure it's a multiple of the sector size, regardless if that is smaller or * larger */ - grainsz = secsz < 4096 ? 4096 : secsz; + grainsz = MAX(secsz, 4096U); log_debug("Sector size of device is %lu bytes. 
Using default filesystem sector size of %" PRIu64 " and grain size of %" PRIu64 ".", secsz, fs_secsz, grainsz); From c343f0ee13c7eb16c0b4b0b069bc6e1d5a8792db Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 25 Jun 2025 11:02:05 +0200 Subject: [PATCH 3/3] repart: respect minimum sector size for ESP/VFAT partitions Fixes: #37801 --- man/repart.d.xml | 5 ++++ src/repart/repart.c | 28 +++++++++++++++++-- test/units/TEST-58-REPART.sh | 18 ++++++------ test/units/TEST-87-AUX-UTILS-VM.validatefs.sh | 2 +- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/man/repart.d.xml b/man/repart.d.xml index eb628ffc16..d3a8f6fcea 100644 --- a/man/repart.d.xml +++ b/man/repart.d.xml @@ -341,6 +341,11 @@ and the placing algorithm restarted. By default, a minimum size constraint of 10M and no maximum size constraint is set. + If Format= is set, the minimum size is automatically raised to the minimum + file system size for the selected file system type, if known. Moreover, for the ESP/XBOOTLDR + partitions the minimum is raised to 100M (for 512b sector images) or 260M (for 4K sector images) + automatically, if specified smaller. + diff --git a/src/repart/repart.c b/src/repart/repart.c index 937d4dcd89..ca211702bf 100644 --- a/src/repart/repart.c +++ b/src/repart/repart.c @@ -105,6 +105,15 @@ * filesystems will then also be compatible with sector sizes 512, 1024 and 2048. */ #define DEFAULT_FILESYSTEM_SECTOR_SIZE 4096ULL +/* Minimum sizes for the ESP depending on sector size. What the minimum is, is severely underdocumented, but + * it appears for 4K sector size it must be 260M, and otherwise 100M. 
This is what Microsoft says here: + * + * https://learn.microsoft.com/en-us/windows-hardware/manufacture/desktop/configure-uefigpt-based-hard-drive-partitions?view=windows-11 + * https://learn.microsoft.com/en-us/windows-hardware/manufacture/desktop/oem-deployment-of-windows-desktop-editions-sample-scripts?view=windows-11&preserve-view=true#-createpartitions-uefitxt + */ +#define ESP_MIN_SIZE (100 * U64_MB) +#define ESP_MIN_SIZE_4K (260 * U64_MB) + #define APIVFS_TMP_DIRS_NULSTR "proc\0sys\0dev\0tmp\0run\0var/tmp\0" #define AUTOMATIC_FSTAB_HEADER_START "# Start section ↓ of automatically generated fstab by systemd-repart" @@ -929,6 +938,21 @@ static uint64_t partition_fs_sector_size(const Context *c, const Partition *p) { return MAX(ss, c->sector_size); } +static uint64_t partition_fstype_min_size(const Context *c, const Partition *p) { + assert(c); + assert(p); + + /* If a file system type is configured, then take it into consideration for the minimum partition + * size */ + + if (IN_SET(p->type.designator, PARTITION_ESP, PARTITION_XBOOTLDR) && streq_ptr(p->format, "vfat")) { + uint64_t ss = partition_fs_sector_size(c, p); + return ss >= 4096 ? ESP_MIN_SIZE_4K : ESP_MIN_SIZE; + } + + return minimal_size_by_fs_name(p->format); +} + static uint64_t partition_min_size(const Context *context, const Partition *p) { uint64_t sz, override_min; @@ -964,8 +988,8 @@ static uint64_t partition_min_size(const Context *context, const Partition *p) { uint64_t f; /* If we shall synthesize a file system, take minimal fs size into account (assumed to be 4K if not known) */ - f = p->format ? round_up_size(minimal_size_by_fs_name(p->format), context->grain_size) : UINT64_MAX; - d += f == UINT64_MAX ? context->grain_size : f; + f = partition_fstype_min_size(context, p); + d += f == UINT64_MAX ? 
context->grain_size : round_up_size(f, context->grain_size); } if (d > sz) diff --git a/test/units/TEST-58-REPART.sh b/test/units/TEST-58-REPART.sh index 97e3783613..aa472df99a 100755 --- a/test/units/TEST-58-REPART.sh +++ b/test/units/TEST-58-REPART.sh @@ -565,9 +565,9 @@ EOF output=$(sfdisk --dump "$imgs/zzz") - assert_in "$imgs/zzz1 : start= 2048, size= 20480, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B, uuid=39107B09-615D-48FB-BA37-C663885FCE67, name=\"esp\"" "$output" - assert_in "$imgs/zzz2 : start= 22528, size= 65536, type=${root_guid}, uuid=${root_uuid}, name=\"root-${architecture}\", attrs=\"GUID:59\"" "$output" - assert_in "$imgs/zzz3 : start= 88064, size= 65536, type=${usr_guid}, uuid=${usr_uuid}, name=\"usr-${architecture}\", attrs=\"GUID:60\"" "$output" + assert_in "$imgs/zzz1 : start= 2048, size= 532480, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B, uuid=39107B09-615D-48FB-BA37-C663885FCE67, name=\"esp\"" "$output" + assert_in "$imgs/zzz2 : start= 534528, size= 65536, type=${root_guid}, uuid=${root_uuid}, name=\"root-${architecture}\", attrs=\"GUID:59\"" "$output" + assert_in "$imgs/zzz3 : start= 600064, size= 65536, type=${usr_guid}, uuid=${usr_uuid}, name=\"usr-${architecture}\", attrs=\"GUID:60\"" "$output" if systemd-detect-virt --quiet --container; then echo "Skipping second part of copy blocks tests in container." 
@@ -1573,7 +1573,7 @@ EOF systemd-repart --empty=create --size=auto --dry-run=no --definitions="$defs" "$image" output=$(sfdisk -d "$image") - assert_in "${image}1 : start= 2048, size= 204800, type=${esp_guid}" "$output" + assert_in "${image}1 : start= 2048, size= 532480, type=${esp_guid}" "$output" assert_not_in "${image}2" "$output" # Disk with small ESP => ESP grows @@ -1586,12 +1586,12 @@ EOF systemd-repart --dry-run=no --definitions="$defs" "$image" output=$(sfdisk -d "$image") - assert_in "${image}1 : start= 2048, size= 204800, type=${esp_guid}" "$output" + assert_in "${image}1 : start= 2048, size= 532480, type=${esp_guid}" "$output" assert_not_in "${image}2" "$output" # Disk with small ESP that can't grow => XBOOTLDR created - truncate -s 150M "$image" + truncate -s 400M "$image" sfdisk "$image" < XBOOTLDR grows, small ESP created @@ -1614,8 +1614,8 @@ EOF systemd-repart --dry-run=no --definitions="$defs" "$image" output=$(sfdisk -d "$image") - assert_in "${image}1 : start= 2048, size= 204800, type=${xbootldr_guid}" "$output" - assert_in "${image}2 : start= 206848, size= 100312, type=${esp_guid}" "$output" + assert_in "${image}1 : start= 2048, size= 284632, type=${xbootldr_guid}" "$output" + assert_in "${image}2 : start= 286680, size= 532480, type=${esp_guid}" "$output" } OFFLINE="yes" diff --git a/test/units/TEST-87-AUX-UTILS-VM.validatefs.sh b/test/units/TEST-87-AUX-UTILS-VM.validatefs.sh index ede0d71019..86120975be 100755 --- a/test/units/TEST-87-AUX-UTILS-VM.validatefs.sh +++ b/test/units/TEST-87-AUX-UTILS-VM.validatefs.sh @@ -74,7 +74,7 @@ MountPoint=/somewhere/else Format=ext4 EOF -systemd-repart --dry-run=no --empty=create --size=256M --definitions=/tmp/validatefs-test /var/tmp/validatefs-test.raw +systemd-repart --dry-run=no --empty=create --size=410M --definitions=/tmp/validatefs-test /var/tmp/validatefs-test.raw systemd-dissect --mount --mkdir /var/tmp/validatefs-test.raw /tmp/validatefs-test.mount