From 738f29cb53b457ca0c17885f119de5bc1f10dead Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Wed, 26 Aug 2020 22:42:26 +0200
Subject: [PATCH 1/9] loop-util: handle EAGAIN on LOOP_SET_STATUS64

Since
https://github.com/torvalds/linux/commit/5db470e229e22b7eda6e23b5566e532c96fb5bc3 (i.e. kernel 5.0)
changing the .lo_offset field via LOOP_SET_STATUS64 might result in
EAGAIN. Let's handle that.

Fixes: #16858
---
 src/shared/loop-util.c | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c
index 4843cbcaab..156a41dd3a 100644
--- a/src/shared/loop-util.c
+++ b/src/shared/loop-util.c
@@ -34,6 +34,7 @@ static void cleanup_clear_loop_close(int *fd) {
 }
 
 static int loop_configure(int fd, const struct loop_config *c) {
+        _cleanup_close_ int lock_fd = -1;
         int r;
 
         assert(fd >= 0);
@@ -84,15 +85,46 @@ static int loop_configure(int fd, const struct loop_config *c) {
                         return 0;
 
                 /* Otherwise, undo the attachment and use the old APIs */
-                (void) ioctl(fd, LOOP_CLR_FD);
+                if (ioctl(fd, LOOP_CLR_FD) < 0)
+                        return -errno;
         }
 
+        /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64
+         * ioctl can return EAGAIN in case we change the lo_offset field, if someone else is accessing the
+         * block device while we try to reconfigure it. This is a pretty common case, since udev might
+         * instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways:
+         * first, let's take the BSD lock that that ensures that udev will not step in between the point in
+         * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on
+         * EAGAIN and retry. The former should be an efficient mechanism to avoid we have to wait 50ms
+         * needlessly if we are just racing against udev. The latter is protection against all other cases,
+         * i.e. peers that do not take the BSD lock.
+         *
+         * We take the BSD lock on a second, separately opened fd for the device. udev after all watches for
+         * close() events (specifically IN_CLOSE_WRITE) on block devices to reprobe them, hence by having a
+         * separate fd we will later close() we can ensure we trigger udev after everything is done. If we'd
+         * lock our own fd instead and keep it open for a long time udev would possibly never run on it
+         * again, even though the fd is unlocked, simply because we never close() it. It also has the nice
+         * benefit we can use the _cleanup_close_ logic to automatically release the lock, after we are
+         * done. */
+        lock_fd = fd_reopen(fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (lock_fd < 0)
+                return lock_fd;
+
+        if (flock(lock_fd, LOCK_EX) < 0)
+                return -errno;
+
         if (ioctl(fd, LOOP_SET_FD, c->fd) < 0)
                 return -errno;
 
-        if (ioctl(fd, LOOP_SET_STATUS64, &c->info) < 0) {
-                r = -errno;
-                goto fail;
+        for (unsigned n_attempts = 0;;) {
+                if (ioctl(fd, LOOP_SET_STATUS64, &c->info) >= 0)
+                        break;
+                if (errno != EAGAIN || ++n_attempts >= 64) {
+                        r = log_debug_errno(errno, "Failed to configure loopback device: %m");
+                        goto fail;
+                }
+
+                (void) usleep(50 * USEC_PER_MSEC);
         }
 
         return 0;

From 95c5009248086f8a6fd86808654072adb1395afa Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 25 Sep 2020 15:22:48 +0200
Subject: [PATCH 2/9] loop-util: LOOP_CLR_FD is async, don't retry to reuse a
 device right after issuing it

When we fall back to classic LOOP_SET_FD logic in case LOOP_CONFIGURE
didn't work we issue LOOP_CLR_FD first. But that call turns out to be
potentially async in the kernel: if something else (let's say
udev/blkid) is accessing the device the ioctl just sets the autoclear
flag and exits. Hence quite often the LOOP_SET_FD will subsequently
fail. Let's avoid the trouble, and immediately exit with EBUSY if
LOOP_CONFIGURE fails, and but remember that LOOP_CONFIGURE is not
available so that on the next iteration we go directly for LOOP_SET_FD
instead.
---
 src/shared/loop-util.c | 105 ++++++++++++++++++++++++-----------------
 1 file changed, 62 insertions(+), 43 deletions(-)

diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c
index 156a41dd3a..e4b44ab89b 100644
--- a/src/shared/loop-util.c
+++ b/src/shared/loop-util.c
@@ -33,60 +33,78 @@ static void cleanup_clear_loop_close(int *fd) {
         (void) safe_close(*fd);
 }
 
-static int loop_configure(int fd, const struct loop_config *c) {
+static int loop_configure(
+                int fd,
+                const struct loop_config *c,
+                bool *try_loop_configure) {
+
         _cleanup_close_ int lock_fd = -1;
         int r;
 
         assert(fd >= 0);
         assert(c);
+        assert(try_loop_configure);
 
-        if (ioctl(fd, LOOP_CONFIGURE, c) < 0) {
-                /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other errors. Note that
-                 * the kernel is weird: non-existing ioctls currently return EINVAL rather than ENOTTY on
-                 * loopback block devices. They should fix that in the kernel, but in the meantime we accept
-                 * both here. */
-                if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
-                        return -errno;
-        } else {
-                bool good = true;
+        if (*try_loop_configure) {
+                if (ioctl(fd, LOOP_CONFIGURE, c) < 0) {
+                        /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other
+                         * errors. Note that the kernel is weird: non-existing ioctls currently return EINVAL
+                         * rather than ENOTTY on loopback block devices. They should fix that in the kernel,
+                         * but in the meantime we accept both here. */
+                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
+                                return -errno;
 
-                if (c->info.lo_sizelimit != 0) {
-                        /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the block
-                         * device. If it's used, let's immediately check if it had the desired effect
-                         * hence. And if not use classic LOOP_SET_STATUS64. */
-                        uint64_t z;
+                        *try_loop_configure = false;
+                } else {
+                        bool good = true;
 
-                        if (ioctl(fd, BLKGETSIZE64, &z) < 0) {
-                                r = -errno;
+                        if (c->info.lo_sizelimit != 0) {
+                                /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the
+                                 * block device. If it's used, let's immediately check if it had the desired
+                                 * effect hence. And if not use classic LOOP_SET_STATUS64. */
+                                uint64_t z;
+
+                                if (ioctl(fd, BLKGETSIZE64, &z) < 0) {
+                                        r = -errno;
+                                        goto fail;
+                                }
+
+                                if (z != c->info.lo_sizelimit) {
+                                        log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64.");
+                                        good = false;
+                                }
+                        }
+
+                        if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) {
+                                /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag
+                                 * into the block device. Let's hence verify if things work correctly here
+                                 * before returning. */
+
+                                r = blockdev_partscan_enabled(fd);
+                                if (r < 0)
+                                        goto fail;
+                                if (r == 0) {
+                                        log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64.");
+                                        good = false;
+                                }
+                        }
+
+                        if (!good) {
+                                /* LOOP_CONFIGURE doesn't work. Remember that. */
+                                *try_loop_configure = false;
+
+                                /* We return EBUSY here instead of retrying immediately with LOOP_SET_FD,
+                                 * because LOOP_CLR_FD is async: if the operation cannot be executed right
+                                 * away it just sets the autoclear flag on the device. This means there's a
+                                 * good chance we cannot actually reuse the loopback device right-away. Hence
+                                 * let's assume it's busy, avoid the trouble and let the calling loop call us
+                                 * again with a new, likely unused device. */
+                                r = -EBUSY;
                                 goto fail;
                         }
 
-                        if (z != c->info.lo_sizelimit) {
-                                log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64.");
-                                good = false;
-                        }
-                }
-
-                if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) {
-                        /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag into the
-                         * block device. Let's hence verify if things work correctly here before
-                         * returning. */
-
-                        r = blockdev_partscan_enabled(fd);
-                        if (r < 0)
-                                goto fail;
-                        if (r == 0) {
-                                log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64.");
-                                good = false;
-                        }
-                }
-
-                if (good)
                         return 0;
-
-                /* Otherwise, undo the attachment and use the old APIs */
-                if (ioctl(fd, LOOP_CLR_FD) < 0)
-                        return -errno;
+                }
         }
 
         /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64
@@ -143,6 +161,7 @@ int loop_device_make(
                 LoopDevice **ret) {
 
         _cleanup_free_ char *loopdev = NULL;
+        bool try_loop_configure = true;
         struct loop_config config;
         LoopDevice *d = NULL;
         struct stat st;
@@ -233,7 +252,7 @@ int loop_device_make(
                         if (!IN_SET(errno, ENOENT, ENXIO))
                                 return -errno;
                 } else {
-                        r = loop_configure(loop, &config);
+                        r = loop_configure(loop, &config, &try_loop_configure);
                         if (r >= 0) {
                                 loop_with_fd = TAKE_FD(loop);
                                 break;

From 6c544d14d977f5b70951ff7f55498d0368f5dfd4 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 25 Sep 2020 18:49:13 +0200
Subject: [PATCH 3/9] dissect-image: wait for udev device to be initialized
 early

If we allocate the sd_device early we can already use it as path when
looking at whole-device fs images.
---
 src/shared/dissect-image.c | 44 +++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c
index cbd3357219..6a36f887b6 100644
--- a/src/shared/dissect-image.c
+++ b/src/shared/dissect-image.c
@@ -257,14 +257,7 @@ static int loop_wait_for_partitions_to_appear(
         assert(d);
         assert(ret_enumerator);
 
-        log_debug("Waiting for device (parent + %d partitions) to appear...", num_partitions);
-
-        if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) {
-                r = device_wait_for_initialization(d, "block", USEC_INFINITY, &device);
-                if (r < 0)
-                        return r;
-        } else
-                device = sd_device_ref(d);
+        log_debug("Waiting for %u partitions to appear...", num_partitions);
 
         for (unsigned i = 0; i < N_DEVICE_NODE_LIST_ATTEMPTS; i++) {
                 r = wait_for_partitions_to_appear(fd, device, num_partitions, flags, ret_enumerator);
@@ -370,6 +363,23 @@ int dissect_image(
         if (!S_ISBLK(st.st_mode))
                 return -ENOTBLK;
 
+        r = sd_device_new_from_devnum(&d, 'b', st.st_rdev);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) {
+                _cleanup_(sd_device_unrefp) sd_device *initialized = NULL;
+
+                /* If udev support is enabled, then let's wait for the device to be initialized before we doing anything. */
+
+                r = device_wait_for_initialization(d, "block", USEC_INFINITY, &initialized);
+                if (r < 0)
+                        return r;
+
+                sd_device_unref(d);
+                d = TAKE_PTR(initialized);
+        }
+
         b = blkid_new_probe();
         if (!b)
                 return -ENOMEM;
@@ -399,10 +409,6 @@ int dissect_image(
         if (!m)
                 return -ENOMEM;
 
-        r = sd_device_new_from_devnum(&d, 'b', st.st_rdev);
-        if (r < 0)
-                return r;
-
         if ((!(flags & DISSECT_IMAGE_GPT_ONLY) &&
             (flags & DISSECT_IMAGE_REQUIRE_ROOT)) ||
             (flags & DISSECT_IMAGE_NO_PARTITION_TABLE)) {
@@ -412,8 +418,8 @@ int dissect_image(
 
                 (void) blkid_probe_lookup_value(b, "USAGE", &usage, NULL);
                 if (STRPTR_IN_SET(usage, "filesystem", "crypto")) {
+                        const char *fstype = NULL, *options = NULL, *devname = NULL;
                         _cleanup_free_ char *t = NULL, *n = NULL, *o = NULL;
-                        const char *fstype = NULL, *options = NULL;
 
                         /* OK, we have found a file system, that's our root partition then. */
                         (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL);
@@ -424,10 +430,14 @@ int dissect_image(
                                         return -ENOMEM;
                         }
 
-                        r = device_path_make_major_minor(st.st_mode, st.st_rdev, &n);
+                        r = sd_device_get_devname(d, &devname);
                         if (r < 0)
                                 return r;
 
+                        n = strdup(devname);
+                        if (!n)
+                                return -ENOMEM;
+
                         m->single_file_system = true;
                         m->verity = verity && verity->root_hash && verity->data_path && (verity->designator < 0 || verity->designator == PARTITION_ROOT);
                         m->can_verity = verity && verity->data_path;
@@ -451,13 +461,7 @@ int dissect_image(
 
                         m->encrypted = streq_ptr(fstype, "crypto_LUKS");
 
-                        /* Even on a single partition we need to wait for udev to create the
-                         * /dev/block/X:Y symlink to /dev/loopZ */
-                        r = loop_wait_for_partitions_to_appear(fd, d, 0, flags, &e);
-                        if (r < 0)
-                                return r;
                         *ret = TAKE_PTR(m);
-
                         return 0;
                 }
         }

From 4ba86848c92cf4d2d5466f43b4a42ae4ceddb8b0 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Tue, 29 Sep 2020 20:56:50 +0200
Subject: [PATCH 4/9] dissect-image: rework how we wait for partitions

Previously, we'd just wait for the first moment where the kernel exposes
the same numbre of partitions as libblkid tells us. After that point we
enumerate kernel partitions and look for matching libblkid partitions.

With this change we'll instead enumerate with libblkid only, and then
wait for each kernel partition to show up with the exact parameters we
expect them to show up. Once that happens we are happy.

Care is taken to use the udev device notification messages only as hint
to recheck what the kernel actually says. That's because we are
otherwise subject to a race: we might see udev events from an earlier
use of a loopback device. After all these devices are heavily recycled.
Under the assumption that we'll get udev events for *at least* all
partitions we care about (but possibly more) we can fix the race
entirely with one more fix coming in a later commit: if we make sure
that a loopback block device has zero kernel partitions when we take
possession of it, it doesn't matter anymore if we get spurious udev
events from a previous use. All we have to do is notice when the devices
we need all popped up.
---
 src/shared/dissect-image.c | 357 ++++++++++++++++++++++---------------
 1 file changed, 218 insertions(+), 139 deletions(-)

diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c
index 6a36f887b6..7f93229f59 100644
--- a/src/shared/dissect-image.c
+++ b/src/shared/dissect-image.c
@@ -109,31 +109,6 @@ not_found:
 }
 
 #if HAVE_BLKID
-/* Detect RPMB and Boot partitions, which are not listed by blkid.
- * See https://github.com/systemd/systemd/issues/5806. */
-static bool device_is_mmc_special_partition(sd_device *d) {
-        const char *sysname;
-
-        assert(d);
-
-        if (sd_device_get_sysname(d, &sysname) < 0)
-                return false;
-
-        return startswith(sysname, "mmcblk") &&
-                (endswith(sysname, "rpmb") || endswith(sysname, "boot0") || endswith(sysname, "boot1"));
-}
-
-static bool device_is_block(sd_device *d) {
-        const char *ss;
-
-        assert(d);
-
-        if (sd_device_get_subsystem(d, &ss) < 0)
-                return false;
-
-        return streq(ss, "block");
-}
-
 static int enumerator_for_parent(sd_device *d, sd_device_enumerator **ret) {
         _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
         int r;
@@ -157,117 +132,217 @@ static int enumerator_for_parent(sd_device *d, sd_device_enumerator **ret) {
         return 0;
 }
 
-static int wait_for_partitions_to_appear(
-                int fd,
-                sd_device *d,
-                unsigned num_partitions,
-                DissectImageFlags flags,
-                sd_device_enumerator **ret_enumerator) {
+static int device_is_partition(sd_device *d, blkid_partition pp) {
+        blkid_loff_t bsize, bstart;
+        uint64_t size, start;
+        int partno, bpartno, r;
+        const char *ss, *v;
 
-        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
-        sd_device *q;
-        unsigned n;
-        int r;
-
-        assert(fd >= 0);
         assert(d);
-        assert(ret_enumerator);
+        assert(pp);
 
-        r = enumerator_for_parent(d, &e);
+        r = sd_device_get_subsystem(d, &ss);
+        if (r < 0)
+                return r;
+        if (!streq(ss, "block"))
+                return false;
+
+        r = sd_device_get_sysattr_value(d, "partition", &v);
+        if (r == -ENOENT) /* Not a partition device */
+                return false;
+        if (r < 0)
+                return r;
+        r = safe_atoi(v, &partno);
         if (r < 0)
                 return r;
 
-        /* Count the partitions enumerated by the kernel */
-        n = 0;
-        FOREACH_DEVICE(e, q) {
-                if (sd_device_get_devnum(q, NULL) < 0)
-                        continue;
-                if (!device_is_block(q))
-                        continue;
-                if (device_is_mmc_special_partition(q))
-                        continue;
+        errno = 0;
+        bpartno = blkid_partition_get_partno(pp);
+        if (bpartno < 0)
+                return errno_or_else(EIO);
 
-                if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) {
-                        r = device_wait_for_initialization(q, "block", USEC_INFINITY, NULL);
-                        if (r < 0)
-                                return r;
-                }
+        if (partno != bpartno)
+                return false;
 
-                n++;
-        }
+        r = sd_device_get_sysattr_value(d, "start", &v);
+        if (r < 0)
+                return r;
+        r = safe_atou64(v, &start);
+        if (r < 0)
+                return r;
 
-        if (n == num_partitions + 1) {
-                *ret_enumerator = TAKE_PTR(e);
-                return 0; /* success! */
-        }
-        if (n > num_partitions + 1)
-                return log_debug_errno(SYNTHETIC_ERRNO(EIO),
-                                       "blkid and kernel partition lists do not match.");
+        errno = 0;
+        bstart = blkid_partition_get_start(pp);
+        if (bstart < 0)
+                return errno_or_else(EIO);
 
-        /* The kernel has probed fewer partitions than blkid? Maybe the kernel prober is still running or it
-         * got EBUSY because udev already opened the device. Let's reprobe the device, which is a synchronous
-         * call that waits until probing is complete. */
+        if (start != (uint64_t) bstart)
+                return false;
 
-        for (unsigned j = 0; ; j++) {
-                if (j++ > 20)
-                        return -EBUSY;
+        r = sd_device_get_sysattr_value(d, "size", &v);
+        if (r < 0)
+                return r;
+        r = safe_atou64(v, &size);
+        if (r < 0)
+                return r;
 
-                if (ioctl(fd, BLKRRPART, 0) >= 0)
-                        break;
-                r = -errno;
-                if (r == -EINVAL) {
-                        /* If we are running on a block device that has partition scanning off, return an
-                         * explicit recognizable error about this, so that callers can generate a proper
-                         * message explaining the situation. */
+        errno = 0;
+        bsize = blkid_partition_get_size(pp);
+        if (bsize < 0)
+                return errno_or_else(EIO);
 
-                        r = blockdev_partscan_enabled(fd);
-                        if (r < 0)
-                                return r;
-                        if (r == 0)
-                                return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT),
-                                                       "Device is a loop device and partition scanning is off!");
+        if (size != (uint64_t) bsize)
+                return false;
 
-                        return -EINVAL; /* original error */
-                }
-                if (r != -EBUSY)
-                        return r;
-
-                /* If something else has the device open, such as an udev rule, the ioctl will return
-                 * EBUSY. Since there's no way to wait until it isn't busy anymore, let's just wait a bit,
-                 * and try again.
-                 *
-                 * This is really something they should fix in the kernel! */
-                (void) usleep(50 * USEC_PER_MSEC);
-
-        }
-
-        return -EAGAIN; /* no success yet, try again */
+        return true;
 }
 
-static int loop_wait_for_partitions_to_appear(
-                int fd,
-                sd_device *d,
-                unsigned num_partitions,
-                DissectImageFlags flags,
-                sd_device_enumerator **ret_enumerator) {
-        _cleanup_(sd_device_unrefp) sd_device *device = NULL;
+static int find_partition(
+                sd_device *parent,
+                blkid_partition pp,
+                sd_device **ret) {
+
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+        sd_device *q;
         int r;
 
-        assert(fd >= 0);
-        assert(d);
-        assert(ret_enumerator);
+        assert(parent);
+        assert(pp);
+        assert(ret);
 
-        log_debug("Waiting for %u partitions to appear...", num_partitions);
+        r = enumerator_for_parent(parent, &e);
+        if (r < 0)
+                return r;
 
-        for (unsigned i = 0; i < N_DEVICE_NODE_LIST_ATTEMPTS; i++) {
-                r = wait_for_partitions_to_appear(fd, device, num_partitions, flags, ret_enumerator);
-                if (r != -EAGAIN)
+        FOREACH_DEVICE(e, q) {
+                r = device_is_partition(q, pp);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        *ret = sd_device_ref(q);
+                        return 0;
+                }
+        }
+
+        return -ENXIO;
+}
+
+struct wait_data {
+        sd_device *parent_device;
+        blkid_partition blkidp;
+        sd_device *found;
+};
+
+static inline void wait_data_done(struct wait_data *d) {
+        sd_device_unref(d->found);
+}
+
+static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) {
+        const char *parent1_path, *parent2_path;
+        struct wait_data *w = userdata;
+        sd_device *pp;
+        int r;
+
+        assert(w);
+
+        if (device_for_action(device, DEVICE_ACTION_REMOVE))
+                return 0;
+
+        r = sd_device_get_parent(device, &pp);
+        if (r < 0)
+                return 0; /* Doesn't have a parent? No relevant to us */
+
+        r = sd_device_get_syspath(pp, &parent1_path); /* Check parent of device of this action */
+        if (r < 0)
+                goto finish;
+
+        r = sd_device_get_syspath(w->parent_device, &parent2_path); /* Check parent of device we are looking for */
+        if (r < 0)
+                goto finish;
+
+        if (!path_equal(parent1_path, parent2_path))
+                return 0; /* Has a different parent than what we need, not interesting to us */
+
+        r = device_is_partition(device, w->blkidp);
+        if (r < 0)
+                goto finish;
+        if (r == 0) /* Not the one we need */
+                return 0;
+
+        /* It's the one we need! Yay! */
+        assert(!w->found);
+        w->found = sd_device_ref(device);
+        r = 0;
+
+finish:
+        return sd_event_exit(sd_device_monitor_get_event(monitor), r);
+}
+
+static int wait_for_partition_device(
+                sd_device *parent,
+                blkid_partition pp,
+                usec_t deadline,
+                sd_device **ret) {
+
+        _cleanup_(sd_event_source_unrefp) sd_event_source *timeout_source = NULL;
+        _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        int r;
+
+        assert(parent);
+        assert(pp);
+        assert(ret);
+
+        r = find_partition(parent, pp, ret);
+        if (r != -ENXIO)
+                return r;
+
+        r = sd_event_new(&event);
+        if (r < 0)
+                return r;
+
+        r = sd_device_monitor_new(&monitor);
+        if (r < 0)
+                return r;
+
+        r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, "block", "partition");
+        if (r < 0)
+                return r;
+
+        r = sd_device_monitor_attach_event(monitor, event);
+        if (r < 0)
+                return r;
+
+        _cleanup_(wait_data_done) struct wait_data w = {
+                .parent_device = parent,
+                .blkidp = pp,
+        };
+
+        r = sd_device_monitor_start(monitor, device_monitor_handler, &w);
+        if (r < 0)
+                return r;
+
+        /* Check again, the partition might have appeared in the meantime */
+        r = find_partition(parent, pp, ret);
+        if (r != -ENXIO)
+                return r;
+
+        if (deadline != USEC_INFINITY) {
+                r = sd_event_add_time(
+                                event, &timeout_source,
+                                CLOCK_MONOTONIC, deadline, 0,
+                                NULL, INT_TO_PTR(-ETIMEDOUT));
+                if (r < 0)
                         return r;
         }
 
-        return log_debug_errno(SYNTHETIC_ERRNO(ENXIO),
-                               "Kernel partitions dit not appear within %d attempts",
-                               N_DEVICE_NODE_LIST_ATTEMPTS);
+        r = sd_event_loop(event);
+        if (r < 0)
+                return r;
+
+        assert(w.found);
+        *ret = TAKE_PTR(w.found);
+        return 0;
 }
 
 static void check_partition_flags(
@@ -295,6 +370,8 @@ static void check_partition_flags(
 
 #endif
 
+#define DEVICE_TIMEOUT_USEC (45 * USEC_PER_SEC)
+
 int dissect_image(
                 int fd,
                 const VeritySettings *verity,
@@ -305,7 +382,6 @@ int dissect_image(
 #if HAVE_BLKID
         sd_id128_t root_uuid = SD_ID128_NULL, root_verity_uuid = SD_ID128_NULL,
                 usr_uuid = SD_ID128_NULL, usr_verity_uuid = SD_ID128_NULL;
-        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
         bool is_gpt, is_mbr, generic_rw, multiple_generic = false;
         _cleanup_(sd_device_unrefp) sd_device *d = NULL;
         _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
@@ -314,9 +390,9 @@ int dissect_image(
         sd_id128_t generic_uuid = SD_ID128_NULL;
         const char *pttype = NULL;
         blkid_partlist pl;
-        int r, generic_nr;
+        int r, generic_nr, n_partitions;
         struct stat st;
-        sd_device *q;
+        usec_t deadline;
 
         assert(fd >= 0);
         assert(ret);
@@ -372,7 +448,7 @@ int dissect_image(
 
                 /* If udev support is enabled, then let's wait for the device to be initialized before we doing anything. */
 
-                r = device_wait_for_initialization(d, "block", USEC_INFINITY, &initialized);
+                r = device_wait_for_initialization(d, "block", DEVICE_TIMEOUT_USEC, &initialized);
                 if (r < 0)
                         return r;
 
@@ -476,48 +552,51 @@ int dissect_image(
         if (!is_gpt && ((flags & DISSECT_IMAGE_GPT_ONLY) || !is_mbr))
                 return -ENOPKG;
 
+        /* Safety check: refuse block devices that carry a partition table but for which the kernel doesn't
+         * do partition scanning. */
+        r = blockdev_partscan_enabled(fd);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EPROTONOSUPPORT;
+
         errno = 0;
         pl = blkid_probe_get_partitions(b);
         if (!pl)
                 return errno_or_else(ENOMEM);
 
-        r = loop_wait_for_partitions_to_appear(fd, d, blkid_partlist_numof_partitions(pl), flags, &e);
-        if (r < 0)
-                return r;
+        errno = 0;
+        n_partitions = blkid_partlist_numof_partitions(pl);
+        if (n_partitions < 0)
+                return errno_or_else(EIO);
 
-        FOREACH_DEVICE(e, q) {
+        deadline = usec_add(now(CLOCK_MONOTONIC), DEVICE_TIMEOUT_USEC);
+        for (int i = 0; i < n_partitions; i++) {
+                _cleanup_(sd_device_unrefp) sd_device *q = NULL;
                 unsigned long long pflags;
                 blkid_partition pp;
                 const char *node;
-                dev_t qn;
                 int nr;
 
-                r = sd_device_get_devnum(q, &qn);
+                errno = 0;
+                pp = blkid_partlist_get_partition(pl, i);
+                if (!pp)
+                        return errno_or_else(EIO);
+
+                r = wait_for_partition_device(d, pp, deadline, &q);
                 if (r < 0)
-                        continue;
-
-                if (st.st_rdev == qn)
-                        continue;
-
-                if (!device_is_block(q))
-                        continue;
-
-                if (device_is_mmc_special_partition(q))
-                        continue;
+                        return r;
 
                 r = sd_device_get_devname(q, &node);
                 if (r < 0)
-                        continue;
-
-                pp = blkid_partlist_devno_to_partition(pl, qn);
-                if (!pp)
-                        continue;
+                        return r;
 
                 pflags = blkid_partition_get_flags(pp);
 
+                errno = 0;
                 nr = blkid_partition_get_partno(pp);
                 if (nr < 0)
-                        continue;
+                        return errno_or_else(EIO);
 
                 if (is_gpt) {
                         PartitionDesignator designator = _PARTITION_DESIGNATOR_INVALID;

From 021bf175289537a67345b0ca9d5d6f163a1eaf7c Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 25 Sep 2020 17:12:34 +0200
Subject: [PATCH 5/9] loop-util: if a loopback device we want to use still has
 partitions, do something about it

On current kernels (5.8 for example) under some conditions I don't fully
grok it might happen that a detached loopback block device still has
partition block devices around. Accessing these partition block devices
results in EIO errors (that also fill up dmesg). These devices cannot be
claned up with LOOP_CLR_FD (since the main device already is officially
detached), nor with LOOP_CTL_DELETE (returns EBUSY as long as the
partitions still exist). This is a kernel bug. But it appears to apply
to all recent kernels. I cannot really pin down what triggers this,
suffice to say our heavy-duty test can trigger it.

Either way, let's do something about it: when we notice this state we'll
attach an empty file to it, which is guaranteed to have to part table.
This makes the partitions go away. After closing/reoping the device we
hence are good to go again. ugly workaround, but I think OK enough to
use.

The net result is: with this commit, we'll guarantee that by the time we
attach a file to the loopback device we have zero kernel partitions
associated with it. Thus if we then wait for the kernel partitions we
need to appear we should have entirely reliable behaviour even if
loopback devices by the name are heavily recycled and udev events reach
us very late.

Fixes: #16858
---
 src/shared/loop-util.c | 183 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 166 insertions(+), 17 deletions(-)

diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c
index e4b44ab89b..091a304755 100644
--- a/src/shared/loop-util.c
+++ b/src/shared/loop-util.c
@@ -13,8 +13,11 @@
 #include <sys/ioctl.h>
 #include <unistd.h>
 
+#include "sd-device.h"
+
 #include "alloc-util.h"
 #include "blockdev-util.h"
+#include "device-util.h"
 #include "errno-util.h"
 #include "fd-util.h"
 #include "fileio.h"
@@ -24,6 +27,7 @@
 #include "stat-util.h"
 #include "stdio-util.h"
 #include "string-util.h"
+#include "tmpfile-util.h"
 
 static void cleanup_clear_loop_close(int *fd) {
         if (*fd < 0)
@@ -33,18 +37,134 @@ static void cleanup_clear_loop_close(int *fd) {
         (void) safe_close(*fd);
 }
 
+static int loop_is_bound(int fd) {
+        struct loop_info64 info;
+
+        assert(fd >= 0);
+
+        if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0) {
+                if (errno == ENXIO)
+                        return false; /* not bound! */
+
+                return -errno;
+        }
+
+        return true; /* bound! */
+}
+
+static int device_has_block_children(sd_device *d) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+        const char *main_sn, *main_ss;
+        sd_device *q;
+        int r;
+
+        assert(d);
+
+        /* Checks if the specified device currently has block device children (i.e. partition block
+         * devices). */
+
+        r = sd_device_get_sysname(d, &main_sn);
+        if (r < 0)
+                return r;
+
+        r = sd_device_get_subsystem(d, &main_ss);
+        if (r < 0)
+                return r;
+
+        if (!streq(main_ss, "block"))
+                return -EINVAL;
+
+        r = sd_device_enumerator_new(&e);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_allow_uninitialized(e);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_parent(e, d);
+        if (r < 0)
+                return r;
+
+        FOREACH_DEVICE(e, q) {
+                const char *ss, *sn;
+
+                r = sd_device_get_subsystem(q, &ss);
+                if (r < 0)
+                        continue;
+
+                if (!streq(ss, "block"))
+                        continue;
+
+                r = sd_device_get_sysname(q, &sn);
+                if (r < 0)
+                        continue;
+
+                if (streq(sn, main_sn))
+                        continue;
+
+                return 1; /* we have block device children */
+        }
+
+        return 0;
+}
+
 static int loop_configure(
                 int fd,
+                int nr,
                 const struct loop_config *c,
                 bool *try_loop_configure) {
 
+        _cleanup_(sd_device_unrefp) sd_device *d = NULL;
+        _cleanup_free_ char *sysname = NULL;
         _cleanup_close_ int lock_fd = -1;
         int r;
 
         assert(fd >= 0);
+        assert(nr >= 0);
         assert(c);
         assert(try_loop_configure);
 
+        if (asprintf(&sysname, "loop%i", nr) < 0)
+                return -ENOMEM;
+
+        r = sd_device_new_from_subsystem_sysname(&d, "block", sysname);
+        if (r < 0)
+                return r;
+
+        /* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened
+         * fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on
+         * block devices to reprobe them, hence by having a separate fd we will later close() we can ensure
+         * we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a
+         * long time udev would possibly never run on it again, even though the fd is unlocked, simply
+         * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to
+         * automatically release the lock, after we are done. */
+        lock_fd = fd_reopen(fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (lock_fd < 0)
+                return lock_fd;
+        if (flock(lock_fd, LOCK_EX) < 0)
+                return -errno;
+
+        /* Let's see if the device is really detached, i.e. currently has no associated partition block
+         * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that
+         * superficially is detached but still has partition block devices associated for it. They only go
+         * away when the device is reattached. (Yes, LOOP_CLR_FD doesn't work then, because officially
+         * nothing is attached and LOOP_CTL_REMOVE doesn't either, since it doesn't care about partition
+         * block devices. */
+        r = device_has_block_children(d);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                r = loop_is_bound(fd);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        return -EBUSY;
+
+                return -EUCLEAN; /* Bound but children? Tell caller to reattach something so that the
+                                  * partition block devices are gone too. */
+        }
+
         if (*try_loop_configure) {
                 if (ioctl(fd, LOOP_CONFIGURE, c) < 0) {
                         /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other
@@ -115,21 +235,7 @@ static int loop_configure(
          * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on
          * EAGAIN and retry. The former should be an efficient mechanism to avoid we have to wait 50ms
          * needlessly if we are just racing against udev. The latter is protection against all other cases,
-         * i.e. peers that do not take the BSD lock.
-         *
-         * We take the BSD lock on a second, separately opened fd for the device. udev after all watches for
-         * close() events (specifically IN_CLOSE_WRITE) on block devices to reprobe them, hence by having a
-         * separate fd we will later close() we can ensure we trigger udev after everything is done. If we'd
-         * lock our own fd instead and keep it open for a long time udev would possibly never run on it
-         * again, even though the fd is unlocked, simply because we never close() it. It also has the nice
-         * benefit we can use the _cleanup_close_ logic to automatically release the lock, after we are
-         * done. */
-        lock_fd = fd_reopen(fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
-        if (lock_fd < 0)
-                return lock_fd;
-
-        if (flock(lock_fd, LOCK_EX) < 0)
-                return -errno;
+         * i.e. peers that do not take the BSD lock. */
 
         if (ioctl(fd, LOOP_SET_FD, c->fd) < 0)
                 return -errno;
@@ -152,6 +258,44 @@ fail:
         return r;
 }
 
+static int attach_empty_file(int loop, int nr) {
+        _cleanup_close_ int fd = -1;
+
+        /* So here's the thing: on various kernels (5.8 at least) loop block devices might enter a state
+         * where they are detached but nonetheless have partitions, when used heavily. Accessing these
+         * partitions results in immediatey IO errors. There's no pretty way to get rid of them
+         * again. Neither LOOP_CLR_FD nor LOOP_CTL_REMOVE suffice (see above). What does work is to
+         * reassociate them with a new fd however. This is what we do here hence: we associate the devices
+         * with an empty file (i.e. an image that definitely has no partitons). We then immediately clear it
+         * again. This suffices to make the partitions go away. Ugly but appears to work. */
+
+        log_debug("Found unattached loopback block device /dev/loop%i with partitions. Attaching empty file to remove them.", nr);
+
+        fd = open_tmpfile_unlinkable(NULL, O_RDONLY);
+        if (fd < 0)
+                return fd;
+
+        if (flock(loop, LOCK_EX) < 0)
+                return -errno;
+
+        if (ioctl(loop, LOOP_SET_FD, fd) < 0)
+                return -errno;
+
+        if (ioctl(loop, LOOP_SET_STATUS64, &(struct loop_info64) {
+                                .lo_flags = LO_FLAGS_READ_ONLY|
+                                            LO_FLAGS_AUTOCLEAR|
+                                            LO_FLAGS_PARTSCAN, /* enable partscan, so that the partitions really go away */
+                        }) < 0)
+                return -errno;
+
+        if (ioctl(loop, LOOP_CLR_FD) < 0)
+                return -errno;
+
+        /* The caller is expected to immediately close the loopback device after this, so that the BSD lock
+         * is released, and udev sees the changes. */
+        return 0;
+}
+
 int loop_device_make(
                 int fd,
                 int open_flags,
@@ -252,12 +396,17 @@ int loop_device_make(
                         if (!IN_SET(errno, ENOENT, ENXIO))
                                 return -errno;
                 } else {
-                        r = loop_configure(loop, &config, &try_loop_configure);
+                        r = loop_configure(loop, nr, &config, &try_loop_configure);
                         if (r >= 0) {
                                 loop_with_fd = TAKE_FD(loop);
                                 break;
                         }
-                        if (r != -EBUSY)
+                        if (r == -EUCLEAN) {
+                                /* Make left-over partition disappear hack (see above) */
+                                r = attach_empty_file(loop, nr);
+                                if (r < 0 && r != -EBUSY)
+                                        return r;
+                        } else if (r != -EBUSY)
                                 return r;
                 }
 

From 9e3d90671edbdb69393818d2b755597f84518975 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 12 Oct 2020 18:18:33 +0200
Subject: [PATCH 6/9] udev-util: use absolute rather than relative timeout when
 waiting for devices

This makes it easier to accurately wait for a overall deadline.
---
 src/shared/dissect-image.c |  2 +-
 src/shared/udev-util.c     | 17 +++++++++--------
 src/shared/udev-util.h     |  4 ++--
 src/udev/udevadm-info.c    |  6 +++++-
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c
index 7f93229f59..2721718381 100644
--- a/src/shared/dissect-image.c
+++ b/src/shared/dissect-image.c
@@ -1726,7 +1726,7 @@ static int verity_partition(
                         if (r == 0) {
                                 /* devmapper might say that the device exists, but the devlink might not yet have been
                                  * created. Check and wait for the udev event in that case. */
-                                r = device_wait_for_devlink(node, "block", 100 * USEC_PER_MSEC, NULL);
+                                r = device_wait_for_devlink(node, "block", usec_add(now(CLOCK_MONOTONIC), 100 * USEC_PER_MSEC), NULL);
                                 /* Fallback to activation with a unique device if it's taking too long */
                                 if (r == -ETIMEDOUT)
                                         break;
diff --git a/src/shared/udev-util.c b/src/shared/udev-util.c
index 98bbfb2ae3..33272de4bc 100644
--- a/src/shared/udev-util.c
+++ b/src/shared/udev-util.c
@@ -195,8 +195,9 @@ static int device_wait_for_initialization_internal(
                 sd_device *_device,
                 const char *devlink,
                 const char *subsystem,
-                usec_t timeout,
+                usec_t deadline,
                 sd_device **ret) {
+
         _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
         _cleanup_(sd_event_source_unrefp) sd_event_source *timeout_source = NULL;
         _cleanup_(sd_event_unrefp) sd_event *event = NULL;
@@ -256,10 +257,10 @@ static int device_wait_for_initialization_internal(
         if (r < 0)
                 return log_error_errno(r, "Failed to start device monitor: %m");
 
-        if (timeout != USEC_INFINITY) {
-                r = sd_event_add_time_relative(
+        if (deadline != USEC_INFINITY) {
+                r = sd_event_add_time(
                                 event, &timeout_source,
-                                CLOCK_MONOTONIC, timeout, 0,
+                                CLOCK_MONOTONIC, deadline, 0,
                                 NULL, INT_TO_PTR(-ETIMEDOUT));
                 if (r < 0)
                         return log_error_errno(r, "Failed to add timeout event source: %m");
@@ -287,12 +288,12 @@ static int device_wait_for_initialization_internal(
         return 0;
 }
 
-int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout, sd_device **ret) {
-        return device_wait_for_initialization_internal(device, NULL, subsystem, timeout, ret);
+int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t deadline, sd_device **ret) {
+        return device_wait_for_initialization_internal(device, NULL, subsystem, deadline, ret);
 }
 
-int device_wait_for_devlink(const char *devlink, const char *subsystem, usec_t timeout, sd_device **ret) {
-        return device_wait_for_initialization_internal(NULL, devlink, subsystem, timeout, ret);
+int device_wait_for_devlink(const char *devlink, const char *subsystem, usec_t deadline, sd_device **ret) {
+        return device_wait_for_initialization_internal(NULL, devlink, subsystem, deadline, ret);
 }
 
 int device_is_renaming(sd_device *dev) {
diff --git a/src/shared/udev-util.h b/src/shared/udev-util.h
index 04c7ce5520..58803ea522 100644
--- a/src/shared/udev-util.h
+++ b/src/shared/udev-util.h
@@ -28,7 +28,7 @@ static inline int udev_parse_config(void) {
         return udev_parse_config_full(NULL, NULL, NULL, NULL, NULL);
 }
 
-int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout, sd_device **ret);
-int device_wait_for_devlink(const char *path, const char *subsystem, usec_t timeout, sd_device **ret);
+int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t deadline, sd_device **ret);
+int device_wait_for_devlink(const char *path, const char *subsystem, usec_t deadline, sd_device **ret);
 int device_is_renaming(sd_device *dev);
 bool device_for_action(sd_device *dev, DeviceAction action);
diff --git a/src/udev/udevadm-info.c b/src/udev/udevadm-info.c
index ae6d8caf54..27e9a226cb 100644
--- a/src/udev/udevadm-info.c
+++ b/src/udev/udevadm-info.c
@@ -493,7 +493,11 @@ int info_main(int argc, char *argv[], void *userdata) {
                 if (arg_wait_for_initialization_timeout > 0) {
                         sd_device *d;
 
-                        r = device_wait_for_initialization(device, NULL, arg_wait_for_initialization_timeout, &d);
+                        r = device_wait_for_initialization(
+                                        device,
+                                        NULL,
+                                        usec_add(now(CLOCK_MONOTONIC), arg_wait_for_initialization_timeout),
+                                        &d);
                         if (r < 0)
                                 return r;
 

From 786e3a52a24d0317d20d3c8839db59b7bad6d0b7 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Wed, 14 Oct 2020 10:52:05 +0200
Subject: [PATCH 7/9] dissect: retrigger devices if we missed uevents

On systems that have a udev before
a7fdc6cbd399acdb1a975a7f72b9be4504a38c7c uevents would sometimes be
eaten because of device node collisions that caused the ruleset to fail.
Let's add an (ugly) work-around for this, so that we can even work with
such an older udev.
---
 src/shared/dissect-image.c | 86 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c
index 2721718381..1820b81e11 100644
--- a/src/shared/dissect-image.c
+++ b/src/shared/dissect-image.c
@@ -368,6 +368,86 @@ static void check_partition_flags(
         }
 }
 
+static int device_wait_for_initialization_harder(
+                sd_device *device,
+                const char *subsystem,
+                usec_t deadline,
+                sd_device **ret) {
+
+        _cleanup_free_ char *uevent = NULL;
+        usec_t start, left, retrigger_timeout;
+        int r;
+
+        start = now(CLOCK_MONOTONIC);
+        left = usec_sub_unsigned(deadline, start);
+
+        if (DEBUG_LOGGING) {
+                char buf[FORMAT_TIMESPAN_MAX];
+                const char *sn = NULL;
+
+                (void) sd_device_get_sysname(device, &sn);
+                log_debug("Waiting for device '%s' to initialize for %s.", strna(sn), format_timespan(buf, sizeof(buf), left, 0));
+        }
+
+        if (left != USEC_INFINITY)
+                retrigger_timeout = CLAMP(left / 4, 1 * USEC_PER_SEC, 5 * USEC_PER_SEC); /* A fourth of the total timeout, but let's clamp to 1s…5s range */
+        else
+                retrigger_timeout = 2 * USEC_PER_SEC;
+
+        for (;;) {
+                usec_t local_deadline, n;
+                bool last_try;
+
+                n = now(CLOCK_MONOTONIC);
+                assert(n >= start);
+
+                /* Find next deadline, when we'll retrigger */
+                local_deadline = start +
+                        DIV_ROUND_UP(n - start, retrigger_timeout) * retrigger_timeout;
+
+                if (deadline != USEC_INFINITY && deadline <= local_deadline) {
+                        local_deadline = deadline;
+                        last_try = true;
+                } else
+                        last_try = false;
+
+                r = device_wait_for_initialization(device, subsystem, local_deadline, ret);
+                if (r >= 0 && DEBUG_LOGGING) {
+                        char buf[FORMAT_TIMESPAN_MAX];
+                        const char *sn = NULL;
+
+                        (void) sd_device_get_sysname(device, &sn);
+                        log_debug("Successfully waited for device '%s' to initialize for %s.", strna(sn), format_timespan(buf, sizeof(buf), usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0));
+
+                }
+                if (r != -ETIMEDOUT || last_try)
+                        return r;
+
+                if (!uevent) {
+                        const char *syspath;
+
+                        r = sd_device_get_syspath(device, &syspath);
+                        if (r < 0)
+                                return r;
+
+                        uevent = path_join(syspath, "uevent");
+                        if (!uevent)
+                                return -ENOMEM;
+                }
+
+                if (DEBUG_LOGGING) {
+                        char buf[FORMAT_TIMESPAN_MAX];
+
+                        log_debug("Device didn't initialize within %s, assuming lost event. Retriggering device through %s.",
+                                  format_timespan(buf, sizeof(buf), usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0),
+                                  uevent);
+                }
+
+                r = write_string_file(uevent, "change", WRITE_STRING_FILE_DISABLE_BUFFER);
+                if (r < 0)
+                        return r;
+        }
+}
 #endif
 
 #define DEVICE_TIMEOUT_USEC (45 * USEC_PER_SEC)
@@ -448,7 +528,11 @@ int dissect_image(
 
                 /* If udev support is enabled, then let's wait for the device to be initialized before we doing anything. */
 
-                r = device_wait_for_initialization(d, "block", DEVICE_TIMEOUT_USEC, &initialized);
+                r = device_wait_for_initialization_harder(
+                                d,
+                                "block",
+                                usec_add(now(CLOCK_MONOTONIC), DEVICE_TIMEOUT_USEC),
+                                &initialized);
                 if (r < 0)
                         return r;
 

From b202ec20688cd0abe3f4b2e35cf6002c8bc45189 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Tue, 13 Oct 2020 14:37:39 +0200
Subject: [PATCH 8/9] loop-util: wait a random time before trying again

Let's try to make collisions when multiple clients want to use the same
device less likely, by sleeping a random time on collision.

The loop device allocation protocol is inherently collision prone:
first, a program asks which is the next free loop device, then it tries
to acquire it, in a separate, unsynchronized setp. If many peers do this
all at the same time, they'll likely all collide when trying to
acquire the device, so that they need to ask for a free device again and
again.

Let's make this a little less prone to collisions, reducing the number
of failing attempts: whenever we notice a collision we'll now wait
short and randomized time, making it more likely another peer succeeds.

(This also adds a similar logic when retrying LOOP_SET_STATUS64, but
with a slightly altered calculation, since there we definitely want to
wait a bit, under all cases)
---
 src/shared/loop-util.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c
index 091a304755..1c92fdbf5f 100644
--- a/src/shared/loop-util.c
+++ b/src/shared/loop-util.c
@@ -24,6 +24,7 @@
 #include "loop-util.h"
 #include "missing_loop.h"
 #include "parse-util.h"
+#include "random-util.h"
 #include "stat-util.h"
 #include "stdio-util.h"
 #include "string-util.h"
@@ -248,7 +249,10 @@ static int loop_configure(
                         goto fail;
                 }
 
-                (void) usleep(50 * USEC_PER_MSEC);
+                /* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more
+                 * failed attempts we see */
+                (void) usleep(UINT64_C(10) * USEC_PER_MSEC +
+                              random_u64() % (UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
         }
 
         return 0;
@@ -414,6 +418,11 @@ int loop_device_make(
                         return -EBUSY;
 
                 loopdev = mfree(loopdev);
+
+                /* Wait some random time, to make collision less likely. Let's pick a random time in the
+                 * range 0ms…250ms, linearly scaled by the number of failed attempts. */
+                (void) usleep(random_u64() % (UINT64_C(10) * USEC_PER_MSEC +
+                                              UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
         }
 
         d = new(LoopDevice, 1);

From f93ba375301e43900f1fe5a93a2b33b1efcc73e0 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 25 Sep 2020 18:26:53 +0200
Subject: [PATCH 9/9] test: add heavy load loopback block device test

---
 meson.build                |   1 +
 src/shared/mount-util.h    |   3 +-
 src/test/meson.build       |  11 ++
 src/test/test-loop-block.c | 250 +++++++++++++++++++++++++++++++++++++
 4 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 src/test/test-loop-block.c

diff --git a/meson.build b/meson.build
index 307d1bd5f7..9c915323c4 100644
--- a/meson.build
+++ b/meson.build
@@ -3338,6 +3338,7 @@ foreach tuple : tests
         type = tuple.length() >= 5 ? tuple[4] : ''
         defs = tuple.length() >= 6 ? tuple[5] : []
         incs = tuple.length() >= 7 ? tuple[6] : includes
+        parallel = tuple.length() >= 8 ? tuple[7] : true
         timeout = 30
 
         name = sources[0].split('/')[-1].split('.')[0]
diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h
index ba5d6280d2..63de2c7dff 100644
--- a/src/shared/mount-util.h
+++ b/src/shared/mount-util.h
@@ -89,10 +89,11 @@ int mount_option_mangle(
 int mode_to_inaccessible_node(const char *runtime_dir, mode_t mode, char **dest);
 
 /* Useful for usage with _cleanup_(), unmounts, removes a directory and frees the pointer */
-static inline void umount_and_rmdir_and_free(char *p) {
+static inline char* umount_and_rmdir_and_free(char *p) {
         PROTECT_ERRNO;
         (void) umount_recursive(p, 0);
         (void) rmdir(p);
         free(p);
+        return NULL;
 }
 DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_rmdir_and_free);
diff --git a/src/test/meson.build b/src/test/meson.build
index ac1182e37a..daf62eb539 100644
--- a/src/test/meson.build
+++ b/src/test/meson.build
@@ -433,6 +433,17 @@ tests += [
          [],
          []],
 
+        [['src/test/test-loop-block.c'],
+         [libcore,
+          libshared],
+         [threads,
+          libblkid],
+         '',
+         '',
+         [],
+         includes,
+         false],
+
         [['src/test/test-selinux.c'],
          [],
          []],
diff --git a/src/test/test-loop-block.c b/src/test/test-loop-block.c
new file mode 100644
index 0000000000..b9533fc16a
--- /dev/null
+++ b/src/test/test-loop-block.c
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include <fcntl.h>
+#include <linux/loop.h>
+#include <pthread.h>
+
+#include "alloc-util.h"
+#include "dissect-image.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "gpt.h"
+#include "missing_loop.h"
+#include "mkfs-util.h"
+#include "mount-util.h"
+#include "namespace-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tests.h"
+#include "tmpfile-util.h"
+#include "user-util.h"
+#include "virt.h"
+
+#define N_THREADS 5
+#define N_ITERATIONS 3
+
+static usec_t end = 0;
+
+static void* thread_func(void *ptr) {
+        int fd = PTR_TO_FD(ptr);
+        int r;
+
+        for (unsigned i = 0; i < N_ITERATIONS; i++) {
+                _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
+                _cleanup_(umount_and_rmdir_and_freep) char *mounted = NULL;
+                _cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL;
+
+                if (now(CLOCK_MONOTONIC) >= end) {
+                        log_notice("Time's up, exiting thread's loop");
+                        break;
+                }
+
+                log_notice("> Thread iteration #%u.", i);
+
+                assert_se(mkdtemp_malloc(NULL, &mounted) >= 0);
+
+                r = loop_device_make(fd, O_RDONLY, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop);
+                if (r < 0)
+                        log_error_errno(r, "Failed to allocate loopback device: %m");
+                assert_se(r >= 0);
+
+                log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted);
+
+                r = dissect_image(loop->fd, NULL, NULL, DISSECT_IMAGE_READ_ONLY, &dissected);
+                if (r < 0)
+                        log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node);
+                assert_se(r >= 0);
+
+                log_info("Dissected loop device %s", loop->node);
+
+                for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) {
+                        if (!dissected->partitions[d].found)
+                                continue;
+
+                        log_notice("Found node %s fstype %s designator %s",
+                                   dissected->partitions[d].node,
+                                   dissected->partitions[d].fstype,
+                                   partition_designator_to_string(d));
+                }
+
+                assert_se(dissected->partitions[PARTITION_ESP].found);
+                assert_se(dissected->partitions[PARTITION_ESP].node);
+                assert_se(dissected->partitions[PARTITION_XBOOTLDR].found);
+                assert_se(dissected->partitions[PARTITION_XBOOTLDR].node);
+                assert_se(dissected->partitions[PARTITION_ROOT].found);
+                assert_se(dissected->partitions[PARTITION_ROOT].node);
+                assert_se(dissected->partitions[PARTITION_HOME].found);
+                assert_se(dissected->partitions[PARTITION_HOME].node);
+
+                r = dissected_image_mount(dissected, mounted, UID_INVALID, DISSECT_IMAGE_READ_ONLY);
+                log_notice_errno(r, "Mounted %s → %s: %m", loop->node, mounted);
+                assert_se(r >= 0);
+
+                log_notice("Unmounting %s", mounted);
+                mounted = umount_and_rmdir_and_free(mounted);
+
+                log_notice("Unmounted.");
+
+                dissected = dissected_image_unref(dissected);
+
+                log_notice("Detaching loop device %s", loop->node);
+                loop = loop_device_unref(loop);
+                log_notice("Detached loop device.");
+        }
+
+        log_notice("Leaving thread");
+
+        return NULL;
+}
+
+static bool have_root_gpt_type(void) {
+#ifdef GPT_ROOT_NATIVE
+        return true;
+#else
+        return false;
+#endif
+}
+
+int main(int argc, char *argv[]) {
+        _cleanup_free_ char *p = NULL, *cmd = NULL;
+        _cleanup_(pclosep) FILE *sfdisk = NULL;
+        _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
+        _cleanup_close_ int fd = -1;
+        _cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL;
+        _cleanup_(umount_and_rmdir_and_freep) char *mounted = NULL;
+        pthread_t threads[N_THREADS];
+        const char *fs;
+        sd_id128_t id;
+        int r;
+
+        test_setup_logging(LOG_DEBUG);
+        log_show_tid(true);
+        log_show_time(true);
+
+        if (!have_root_gpt_type()) {
+                log_tests_skipped("No root partition GPT defined for this architecture, exiting.");
+                return EXIT_TEST_SKIP;
+        }
+
+        if (detect_container() > 0) {
+                log_tests_skipped("Test not supported in a container, requires udev/uevent notifications.");
+                return EXIT_TEST_SKIP;
+        }
+
+        if (strstr_ptr(ci_environment(), "autopkgtest")) {
+                // FIXME: we should reenable this one day
+                log_tests_skipped("Skipping test on Ubuntu autopkgtest CI, test too slow and installed udev too flakey.");
+                return EXIT_TEST_SKIP;
+        }
+
+        /* This is a test for the loopback block device setup code and it's use by the image dissection
+         * logic: since the kernel APIs are hard use and prone to races, let's test this in a heavy duty
+         * test: we open a bunch of threads and repeatedly allocate and deallocate loopback block devices in
+         * them in parallel, with an image file with a number of partitions. */
+
+        r = detach_mount_namespace();
+        if (ERRNO_IS_PRIVILEGE(r)) {
+                log_tests_skipped("Lacking privileges");
+                return EXIT_TEST_SKIP;
+        }
+
+        FOREACH_STRING(fs, "vfat", "ext4") {
+                r = mkfs_exists(fs);
+                assert_se(r >= 0);
+                if (!r) {
+                        log_tests_skipped("mkfs.{vfat|ext4} not installed");
+                        return EXIT_TEST_SKIP;
+                }
+        }
+
+        assert_se(r >= 0);
+
+        assert_se(tempfn_random_child("/var/tmp", "sfdisk", &p) >= 0);
+        fd = open(p, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC|O_NOFOLLOW, 0666);
+        assert_se(fd >= 0);
+        assert_se(ftruncate(fd, 256*1024*1024) >= 0);
+
+        assert_se(cmd = strjoin("sfdisk ", p));
+        assert_se(sfdisk = popen(cmd, "we"));
+
+        /* A reasonably complex partition table that fits on a 64K disk */
+        fputs("label: gpt\n"
+              "size=32M, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B\n"
+              "size=32M, type=BC13C2FF-59E6-4262-A352-B275FD6F7172\n"
+              "size=32M, type=0657FD6D-A4AB-43C4-84E5-0933C84B4F4F\n"
+              "size=32M, type=", sfdisk);
+
+#ifdef GPT_ROOT_NATIVE
+        fprintf(sfdisk, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(GPT_ROOT_NATIVE));
+#else
+        fprintf(sfdisk, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(GPT_ROOT_X86_64));
+#endif
+
+        fputs("\n"
+              "size=32M, type=933AC7E1-2EB4-4F13-B844-0E14E2AEF915\n", sfdisk);
+
+        assert_se(pclose(sfdisk) == 0);
+        sfdisk = NULL;
+
+        assert_se(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop) >= 0);
+        assert_se(dissect_image(loop->fd, NULL, NULL, 0, &dissected) >= 0);
+
+        assert_se(dissected->partitions[PARTITION_ESP].found);
+        assert_se(dissected->partitions[PARTITION_ESP].node);
+        assert_se(dissected->partitions[PARTITION_XBOOTLDR].found);
+        assert_se(dissected->partitions[PARTITION_XBOOTLDR].node);
+        assert_se(dissected->partitions[PARTITION_ROOT].found);
+        assert_se(dissected->partitions[PARTITION_ROOT].node);
+        assert_se(dissected->partitions[PARTITION_HOME].found);
+        assert_se(dissected->partitions[PARTITION_HOME].node);
+
+        assert_se(sd_id128_randomize(&id) >= 0);
+        assert_se(make_filesystem(dissected->partitions[PARTITION_ESP].node, "vfat", "EFI", id, true) >= 0);
+
+        assert_se(sd_id128_randomize(&id) >= 0);
+        assert_se(make_filesystem(dissected->partitions[PARTITION_XBOOTLDR].node, "vfat", "xbootldr", id, true) >= 0);
+
+        assert_se(sd_id128_randomize(&id) >= 0);
+        assert_se(make_filesystem(dissected->partitions[PARTITION_ROOT].node, "ext4", "root", id, true) >= 0);
+
+        assert_se(sd_id128_randomize(&id) >= 0);
+        assert_se(make_filesystem(dissected->partitions[PARTITION_HOME].node, "ext4", "home", id, true) >= 0);
+
+        dissected = dissected_image_unref(dissected);
+        assert_se(dissect_image(loop->fd, NULL, NULL, 0, &dissected) >= 0);
+
+        assert_se(mkdtemp_malloc(NULL, &mounted) >= 0);
+
+        /* This first (writable) mount will initialize the mount point dirs, so that the subsequent read-only ones can work */
+        assert_se(dissected_image_mount(dissected, mounted, UID_INVALID, 0) >= 0);
+
+        assert_se(umount_recursive(mounted, 0) >= 0);
+        loop = loop_device_unref(loop);
+
+        log_notice("Threads are being started now");
+
+        /* Let's make sure we run for 10s on slow systems at max */
+        end = usec_add(now(CLOCK_MONOTONIC),
+                       slow_tests_enabled() ? 5 * USEC_PER_SEC :
+                       1 * USEC_PER_SEC);
+
+        for (unsigned i = 0; i < N_THREADS; i++)
+                assert_se(pthread_create(threads + i, NULL, thread_func, FD_TO_PTR(fd)) == 0);
+
+        log_notice("All threads started now.");
+
+        for (unsigned i = 0; i < N_THREADS; i++) {
+                log_notice("Joining thread #%u.", i);
+
+                void *k;
+                assert_se(pthread_join(threads[i], &k) == 0);
+                assert_se(k == NULL);
+
+                log_notice("Joined thread #%u.", i);
+        }
+
+        log_notice("Threads are all terminated now.");
+
+        return 0;
+}