From 79a93bb7f52ce71b9d5027e640f59c0f08fbbbcc Mon Sep 17 00:00:00 2001 From: Frantisek Sumsal Date: Wed, 5 Nov 2025 18:12:39 +0100 Subject: [PATCH 1/2] test: terminate the test containers cleanly on SIGTERM So they exit with 0 instead of 143 when we call `machinectl terminate` on them. --- test/units/TEST-13-NSPAWN.machined.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/units/TEST-13-NSPAWN.machined.sh b/test/units/TEST-13-NSPAWN.machined.sh index add5428828..8c7087c55d 100755 --- a/test/units/TEST-13-NSPAWN.machined.sh +++ b/test/units/TEST-13-NSPAWN.machined.sh @@ -48,6 +48,7 @@ trap 'touch /terminate; kill 0' RTMIN+3 trap 'touch /poweroff' RTMIN+4 trap 'touch /reboot' INT trap 'touch /trap' TRAP +trap 'exit 0' TERM trap 'kill $PID' EXIT # We need to wait for the sleep process asynchronously in order to allow @@ -325,6 +326,7 @@ ip address add 192.0.2.1/24 dev hoge PID=0 trap 'kill 0' RTMIN+3 +trap 'exit 0' TERM trap 'kill $PID' EXIT # We need to wait for the sleep process asynchronously in order to allow From ed4903660c90e862c7834ea39772b887b88f8982 Mon Sep 17 00:00:00 2001 From: Frantisek Sumsal Date: Wed, 5 Nov 2025 18:13:58 +0100 Subject: [PATCH 2/2] test: wait until the nspawn process is completely dead Before calling io.systemd.MachineImage.List. The systemd-nspawn process takes a lock in the run() function in nspawn.c and holds it for the entire runtime of that function. If we call `machinectl terminate`, the machine gets unregistered _before_ we release the lock, so the original `machinectl status` check would return early, allowing for a race where we call io.systemd.MachineImage.List over Varlink when systemd-nspawn still holds the lock because the process is still running: [ 41.691826] TEST-13-NSPAWN.sh[1102]: + machinectl terminate long-running [ 41.695009] systemd-nspawn[2171]: Trying to halt container by sending TERM to container PID 1. Send SIGTERM again to trigger immediate termination. 
[ 41.698235] systemd-machined[1192]: Machine long-running terminated. [ 41.709520] TEST-13-NSPAWN.sh[1102]: + systemctl kill --signal=KILL systemd-nspawn@long-running.service [ 41.709169] systemd-nspawn[2171]: Failed to unregister machine: No machine 'long-running' known [ 41.720869] TEST-13-NSPAWN.sh[2346]: + varlinkctl --more call /run/systemd/machine/io.systemd.MachineImage io.systemd.MachineImage.List '{}' [ 41.723359] TEST-13-NSPAWN.sh[2347]: + grep long-running ... [ 41.735453] TEST-13-NSPAWN.sh[2352]: + varlinkctl call /run/systemd/machine/io.systemd.MachineImage io.systemd.MachineImage.List '{"name":"long-running", "acquireMetadata": "yes"}' [ 41.736222] TEST-13-NSPAWN.sh[2353]: + grep OSRelease [ 41.739500] TEST-13-NSPAWN.sh[2352]: Method call io.systemd.MachineImage.List() failed: Device or resource busy [ 41.740641] systemd[1]: Received SIGCHLD. [ 41.740670] systemd[1]: Child 2171 (systemd-nspawn) died (code=killed, status=9/KILL) [ 41.740725] systemd[1]: systemd-nspawn@long-running.service: Child 2171 belongs to systemd-nspawn@long-running.service. [ 41.740748] systemd[1]: systemd-nspawn@long-running.service: Main process exited, code=killed, status=9/KILL [ 41.740755] systemd[1]: systemd-nspawn@long-running.service: Will spawn child (service_enter_stop_post): systemd-nspawn [ 41.740872] systemd[1]: systemd-nspawn@long-running.service: About to execute: systemd-nspawn --cleanup --machine=long-running ... Let's mitigate this by waiting until the corresponding systemd-nspawn@.service instance enters the 'inactive' state where the lock should be properly released. 
Resolves: #39547 --- test/units/TEST-13-NSPAWN.machined.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/units/TEST-13-NSPAWN.machined.sh b/test/units/TEST-13-NSPAWN.machined.sh index 8c7087c55d..a9e3817688 100755 --- a/test/units/TEST-13-NSPAWN.machined.sh +++ b/test/units/TEST-13-NSPAWN.machined.sh @@ -441,9 +441,14 @@ varlinkctl call /run/systemd/machine/io.systemd.Machine io.systemd.Machine.OpenR # Terminating machine, otherwise acquiring image metadata by io.systemd.MachineImage.List may fail in the below. machinectl terminate long-running -# wait for the container being stopped, otherwise acquiring image metadata by io.systemd.MachineImage.List may fail in the below. -timeout 30 bash -c "while machinectl status long-running &>/dev/null; do sleep .5; done" -systemctl kill --signal=KILL systemd-nspawn@long-running.service || : +# Wait for the container to stop, otherwise acquiring image metadata by io.systemd.MachineImage.List below +# may fail. +# +# We need to wait until the systemd-nspawn process is completely stopped, as the lock is held for almost the +# entire life of the process (see the run() function in nspawn.c). This means that the machine gets +# unregistered _before_ this lock is lifted which makes `machinectl status` return non-zero EC earlier than +# we need. +timeout 30 bash -xec 'until [[ "$(systemctl show -P ActiveState systemd-nspawn@long-running.service)" == inactive ]]; do sleep .5; done' # test io.systemd.MachineImage.List varlinkctl --more call /run/systemd/machine/io.systemd.MachineImage io.systemd.MachineImage.List '{}' | grep 'long-running'