test: write file from systemd service in transient unit

This integration test demonstrates that a containerized systemd instance can
write to a bind-mounted file that is observable from the host. Specifically,
the bash script uses systemd-run to start a systemd instance as a transient
unit container. This systemd-run command bind mounts a directory that the
container shares with the host, and runs an internal service which creates and
writes to a file from the container's view of this directory. When finished
writing, the service runs the exit target, terminating the internal systemd
instance, and ending the lifetime of the container.

The script waits for the container to finish running, then verifies that the
expected file contents were written on the host side of the filesystem mount.

This test employs a workaround, creating an unmasked procfs mount on the host
which enables the privileged guest to create its own mounts internally. This
may indicate a systemd bug, as the privileged container should not rely on
the existence of an unmasked procfs on the host in order to mount its own
filesystems internally.
This commit is contained in:
Tommy Unger
2025-04-18 18:57:26 -07:00
committed by Daan De Meyer
parent 62e5b90a49
commit ef3a0478bb

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
# shellcheck disable=SC2016
# Strict mode, spelled out with long option names:
#   errtrace (-E)  ERR traps are inherited by functions and subshells
#   errexit  (-e)  abort on an unhandled non-zero exit status
#   nounset  (-u)  expanding an unset variable is an error
#   pipefail       a pipeline fails if any of its stages fails
set -o errtrace -o errexit -o nounset -o pipefail
# Trace every command as it is executed (-x).
set -o xtrace
# -----------------------------------------------------------------------------
#
# Test: PID-1 Transient Unit Container
#
# Verifies that a minimal systemd PID 1 inside a tmpfs root can:
# • Boot
# • Bind mount the host's /usr directory read-only
# • Bind mount a shared writable directory with the host
# • Run a one-shot service in the container to create and
# write to a host file in that directory
# • Exit cleanly with systemd-run --wait propagating status
#
# -----------------------------------------------------------------------------
# Helpers: load test-control.sh and util.sh from the directory that contains
# this script.
# shellcheck source=test/units/test-control.sh
source "$(dirname "$0")/test-control.sh"
# shellcheck source=test/units/util.sh
source "$(dirname "$0")/util.sh"
# Mount points and directories to tear down during cleanup. Both lists start
# empty and are appended to as resources are created.
declare -a CLEANUP_MOUNTS=()
declare -a CLEANUP_PATHS=()

# Common Config:
readonly TEST_NAME="TEST-07-PID1.transient-unit-container"
readonly OUTPUT_FILE="test-service-output"
readonly EXPECTED_OUTPUT="Test service is running"

# Host FS Directories
# mktemp is run in dry-run mode (-u): it only picks collision-resistant names
# under the temp dir, nothing is created here. The assignment is kept separate
# from 'readonly' so a mktemp failure is not masked.
CONTAINER_ROOT_FS="$(mktemp -u -d --tmpdir "${TEST_NAME}-root-XXXX")"
readonly CONTAINER_ROOT_FS
HOST_OUT_DIR="$(mktemp -u -d --tmpdir test-dir-XXXX)"
readonly HOST_OUT_DIR

# Container FS Directories
readonly CONTAINER_MOUNT_DIR="/${TEST_NAME}"
# Mount a dummy /proc FS which will not be passed to the container. It
# circumvents a permissions error when attempting to mount a FS within the
# container. This seems like a systemd bug.
#
# Globals:  CLEANUP_PATHS, CLEANUP_MOUNTS (appended to)
# Returns:  non-zero if the helper directory cannot be created
temporary_mount_hack() {
    # IMPORTANT: This is modeled after a workaround in
    # TEST-07-PID1.private-pids.sh with a key difference. In private-pids, it's
    # explained there must be at least 1 unmasked procfs mount on the host in
    # order for /proc/ to be mounted by an UNPRIVILEGED user within the container
    # namespace. Note the host mount is not actually passed through to the
    # container.
    #
    # The key difference here is that, here, systemd-run is NOT launched with
    # --user, it is a PRIVILEGED environment and should not hit a permissions
    # error when attempting to mount /proc. Unfortunately, that's exactly what
    # happens if you launch the container without first mounting a dummy
    # unmasked /proc on the host.
    #
    # It was pointed out to me that this may indicate a significant bug. A
    # change masking the host's /proc could prevent the startup of privileged
    # containers. If this were addressed, this function could be removed.

    # Declare and assign separately: 'local var=$(cmd)' always succeeds and
    # would hide a mktemp failure, leaving an empty path to be registered for
    # cleanup and handed to mount.
    local helper_proc
    helper_proc=$(mktemp -d --tmpdir helper-proc-XXXX) || return 1

    # Register the directory before mounting so it is removed even if the
    # mount fails; register the mount only once it actually exists.
    CLEANUP_PATHS+=("$helper_proc")
    mount -t proc proc "$helper_proc"
    CLEANUP_MOUNTS+=("$helper_proc")
}
# Mount 1) a writable directory for output; 2) a dummy procfs as a workaround so
# the container can mount /proc; 3) a tmpfs to serve as the container's root
# FS; 4) the host's /usr directory read only.
#
# Globals:  HOST_OUT_DIR, CONTAINER_ROOT_FS (read)
#           CLEANUP_PATHS, CLEANUP_MOUNTS (updated as resources are created)
make_mounts() {
    # Host bind mount for the output file. Systemd will make the container's version.
    mkdir -p "$HOST_OUT_DIR"
    CLEANUP_PATHS+=("$HOST_OUT_DIR")

    temporary_mount_hack

    # Container root tmpfs mount
    mkdir -p "$CONTAINER_ROOT_FS"
    CLEANUP_PATHS+=("$CONTAINER_ROOT_FS")
    mount -t tmpfs tmpfs "$CONTAINER_ROOT_FS"
    CLEANUP_MOUNTS+=("$CONTAINER_ROOT_FS")

    # Container's /usr will be a read-only bind mount of the host's /usr. Tried
    # using -p BindReadOnlyPaths=/usr instead of this, but that didn't work.
    # Debugging that got hairy, so I'm going with this for now.
    mkdir -p "${CONTAINER_ROOT_FS}/usr"
    mount --bind /usr "${CONTAINER_ROOT_FS}/usr"

    # Register the bind mount as soon as it exists, BEFORE the read-only
    # remount. If the remount fails, a writable bind of the host's /usr is
    # still mounted under the container root; it must be known to cleanup so
    # it is unmounted before the root directory is removed.
    #
    # It is prepended so that /root/usr is unmounted before /root.
    # Don't add to CLEANUP_PATHS because it will be removed when /root is.
    CLEANUP_MOUNTS=("${CONTAINER_ROOT_FS}/usr" "${CLEANUP_MOUNTS[@]}")

    mount -o remount,bind,ro "${CONTAINER_ROOT_FS}/usr"
}
# Populate the container root with what its systemd instance needs for this
# test: a machine-id, and an enabled one-shot unit that writes
# EXPECTED_OUTPUT into the shared directory and then asks the container's
# systemd to exit.
#
# Globals (read): CONTAINER_ROOT_FS, CONTAINER_MOUNT_DIR, OUTPUT_FILE,
#                 EXPECTED_OUTPUT
config_container_service() {
    local -r unit_dir="${CONTAINER_ROOT_FS}/etc/systemd/system"
    local -r unit_file="${unit_dir}/test-service.service"
    # Path of the output file as seen from inside the container.
    local -r output_in_guest="${CONTAINER_MOUNT_DIR}/${OUTPUT_FILE}"

    mkdir -p "$unit_dir"

    # Phony machine-id for the container: a random UUID with the dashes
    # removed and any uppercase letters folded to lowercase.
    uuidgen -r \
        | tr -d '-' \
        | tr '[:upper:]' '[:lower:]' \
        >"${CONTAINER_ROOT_FS}/etc/machine-id"

    # The here-doc delimiter is unquoted on purpose: EXPECTED_OUTPUT and the
    # guest output path are expanded now, while the unit file is generated.
    cat >"$unit_file" <<EOF
[Unit]
Description=Test Service for Internal Systemd
After=basic.target
[Service]
Type=oneshot
ExecStart=/bin/sh -c 'echo "$EXPECTED_OUTPUT" > "${output_in_guest}"'
ExecStartPost=/usr/bin/systemctl --no-block exit 0
TimeoutStopSec=15s
[Install]
WantedBy=multi-user.target
EOF

    systemctl --root="$CONTAINER_ROOT_FS" enable test-service.service
}
# The testcase. Configs cleanup trap, makes mounts, configs internal service
# unit, kicks off container as a transient unit, waits for it to finish and
# checks output.
#
# Globals (read): TEST_NAME, HOST_OUT_DIR, CONTAINER_MOUNT_DIR,
#                 CONTAINER_ROOT_FS, OUTPUT_FILE, EXPECTED_OUTPUT
testcase_transient_unit_container_file_write() {
    # Cleanup on exit. Test cases seem to run in a subshell, and only a single
    # testcase is expected in this file. So we tie cleanup to the lifetime of
    # this subshell, not the global context, allowing for appending to
    # CLEANUP_PATHS and CLEANUP_MOUNTS
    trap file_write_cleanup EXIT
    # Do not install the cleanup function directly as a signal handler: when
    # a trap handler returns, the shell resumes the interrupted script, so the
    # testcase would keep running after everything had been torn down. Turn
    # INT/TERM into an exit (128 + signal number) instead; the EXIT trap above
    # then performs the cleanup and nothing else runs afterwards. Command
    # failures need no trap of their own: under errexit they terminate the
    # shell, which also fires the EXIT trap.
    trap 'exit 130' INT
    trap 'exit 143' TERM

    make_mounts
    config_container_service

    # Run the container as a transient unit and wait for it to finish
    local -r bind_mount_arg="${HOST_OUT_DIR}:${CONTAINER_MOUNT_DIR}"
    local -r service_unit_name="${TEST_NAME}.service"
    SYSTEMD_LOG_LEVEL=debug SYSTEMD_LOG_TARGET=console \
        systemd-run \
        --unit "$service_unit_name" \
        --wait \
        -p RootDirectory="$CONTAINER_ROOT_FS" \
        -p PrivatePIDs=yes \
        -p PrivateUsersEx=full \
        -p ProtectHostnameEx=private \
        -p ProtectControlGroupsEx=private \
        -p PrivateMounts=yes \
        -p PrivateNetwork=yes \
        -p PrivateDevices=yes \
        -p PrivateIPC=yes \
        -p BindLogSockets=no \
        -p "Environment=container=transient-unit" \
        -p "CapabilityBoundingSet=~CAP_SYS_TIME CAP_SYS_BOOT CAP_AUDIT_READ" \
        -p Type=exec \
        -p Delegate=true \
        -p DelegateSubgroup=init.scope \
        -p DelegateNamespaces=yes \
        -p BindPaths="$bind_mount_arg" \
        /usr/lib/systemd/systemd multi-user.target

    # If our service ran, we should be able to read its output here
    local -r host_output="${HOST_OUT_DIR}/${OUTPUT_FILE}"
    assert_eq "$(cat "${host_output}")" "$EXPECTED_OUTPUT"
}
# Set to 1 once file_write_cleanup has run, so repeated invocations are no-ops.
CLEANUP_DONE=0

# Best-effort teardown: unmount everything in CLEANUP_MOUNTS (in array order),
# then remove everything in CLEANUP_PATHS.
#
# Globals:  CLEANUP_DONE (read/written), CLEANUP_MOUNTS, CLEANUP_PATHS (read)
# Returns:  always 0
file_write_cleanup() {
    # Avoid re-running this function. E.g. At both SIGINT and EXIT.
    if (( CLEANUP_DONE )); then
        return 0
    fi
    CLEANUP_DONE=1

    # Failures are tolerated per command with '|| :' instead of 'set +e'.
    # 'set +e' would switch errexit off for the whole calling shell, so if
    # this function ran from a trap the script could carry on after a failure
    # instead of exiting.
    local mount_point
    for mount_point in "${CLEANUP_MOUNTS[@]}"; do
        umount "$mount_point" || :
    done

    # Remove all the directories we created. --one-file-system keeps rm from
    # descending into anything that is still mounted below these paths (e.g.
    # if an umount above failed), so the contents of a leftover mount are
    # never deleted.
    if (( ${#CLEANUP_PATHS[@]} > 0 )); then
        rm -rf --one-file-system -- "${CLEANUP_PATHS[@]}" || :
    fi
}
# Entry point. run_testcases is not defined in this file; presumably it comes
# from the sourced test-control.sh and runs the testcase_* function defined
# above (the testcase's own comment suggests each one runs in a subshell) --
# confirm against that helper.
run_testcases