tree-wide: open block device locks in writable mode

udev's block device locking protocol has one pitfall not even the
example in the documentation got right so far (even though this is
explained in all detail above): udev's rescanning is only triggered when
an fd that is opened for writing is closed. This means that if a
separate locking fd is opened on a block device – one that is maintained
independently of the fd actually used for writing – it must be opened for
writing too, so that closing the lock definitely triggers a rescan. This
matters in cases where the lock fd is kept for longer than the fd used
for writing to disk. (Because otherwise udev might get the
IN_CLOSE_WRITE event, but when it tries to rescan will find the device
locked, and never retry because no IN_CLOSE_WRITE is triggred anymore.)

Let's fix that across the codebase, at 4 places:

1. in makefs (a lock fd is kept, and mkfs then invoked as child, which
   uses a different fd, and the lock fd is closed only once the child
   died)

2. in udevadm lock (embarassing!): which is intended to be used to wrap tools
   that modify disk contents, very similar to the makefs case. The lock
   is also kept until after the tool exited.

3. In storagetm: the kernel nvme-tcp layer writes to the device
   directly, we just keep a lock fd.

4. the example in BLOCK_DEVICE_LOCKING.md
This commit is contained in:
Lennart Poettering
2025-10-22 22:47:53 +02:00
parent 46da450f13
commit e582484789
6 changed files with 17 additions and 11 deletions

View File

@@ -223,11 +223,12 @@ int main(int argc, char **argv) {
return EXIT_FAILURE;
}
// try to take an exclusive and nonblocking BSD lock
// try to take an exclusive and nonblocking BSD lock (use O_WRONLY mode to ensure udev
// rescans the device once the lock is closed)
__attribute__((cleanup(closep))) int fd =
lock_whole_disk_from_devname(
argv[1],
O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY,
O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY,
LOCK_EX|LOCK_NB);
if (fd < 0)

View File

@@ -41,7 +41,7 @@ static int run(int argc, char *argv[]) {
if (S_ISBLK(st.st_mode)) {
/* Lock the device so that udev doesn't interfere with our work */
lock_fd = lock_whole_block_device(st.st_rdev, LOCK_EX);
lock_fd = lock_whole_block_device(st.st_rdev, O_WRONLY, LOCK_EX);
if (lock_fd < 0)
return log_error_errno(lock_fd, "Failed to lock whole block device of \"%s\": %m", device);
} else

View File

@@ -340,20 +340,24 @@ int get_block_device_harder(const char *path, dev_t *ret) {
return get_block_device_harder_fd(fd, ret);
}
int lock_whole_block_device(dev_t devt, int operation) {
int lock_whole_block_device(dev_t devt, int open_flags, int operation) {
_cleanup_close_ int lock_fd = -EBADF;
dev_t whole_devt;
int r;
/* Let's get a BSD file lock on the whole block device, as per: https://systemd.io/BLOCK_DEVICE_LOCKING */
/* Let's get a BSD file lock on the whole block device, as per: https://systemd.io/BLOCK_DEVICE_LOCKING
*
* NB: it matters whether open_flags indicates open for write: only then will the eventual closing of
* the fd trigger udev's partitioning rescanning of the device (as it watches for IN_CLOSE_WRITE),
* hence make sure to pass the right value there. */
r = block_get_whole_disk(devt, &whole_devt);
if (r < 0)
return r;
lock_fd = r = device_open_from_devnum(S_IFBLK, whole_devt, O_RDONLY|O_CLOEXEC|O_NONBLOCK, NULL);
if (r < 0)
return r;
lock_fd = device_open_from_devnum(S_IFBLK, whole_devt, open_flags|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, NULL);
if (lock_fd < 0)
return lock_fd;
if (flock(lock_fd, operation) < 0)
return -errno;

View File

@@ -35,7 +35,7 @@ int get_block_device(const char *path, dev_t *dev);
int get_block_device_harder_fd(int fd, dev_t *dev);
int get_block_device_harder(const char *path, dev_t *dev);
int lock_whole_block_device(dev_t devt, int operation);
int lock_whole_block_device(dev_t devt, int open_flags, int operation);
int blockdev_partscan_enabled(sd_device *d);
int blockdev_partscan_enabled_fd(int fd);

View File

@@ -390,7 +390,7 @@ static int nvme_subsystem_add(const char *node, int consumed_fd, sd_device *devi
return log_oom();
if (fd < 0) {
fd = RET_NERRNO(open(node, O_RDONLY|O_CLOEXEC|O_NONBLOCK));
fd = RET_NERRNO(open(node, O_RDWR|O_CLOEXEC|O_NONBLOCK));
if (fd < 0)
return log_error_errno(fd, "Failed to open '%s': %m", node);
}

View File

@@ -184,7 +184,8 @@ static int lock_device(
struct stat st;
int r;
fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
/* We open in O_WRONLY mode here, to trigger a rescan in udev once we are done */
fd = open(path, O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
if (fd < 0)
return log_error_errno(errno, "Failed to open '%s': %m", path);