diff --git a/test/units/TEST-07-PID1.transient-unit-container.sh b/test/units/TEST-07-PID1.transient-unit-container.sh
new file mode 100755
index 0000000000..f18e9ff112
--- /dev/null
+++ b/test/units/TEST-07-PID1.transient-unit-container.sh
@@ -0,0 +1,223 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2016
+set -Eeuo pipefail
+set -x
+
+# -----------------------------------------------------------------------------
+#
+# Test: PID-1 Transient Unit Container
+#
+# Verifies that a minimal systemd PID 1 inside a tmpfs root can:
+#   • Boot
+#   • Bind mount the host's /usr directory read-only
+#   • Bind mount a shared writable directory with the host
+#   • Run a one-shot service in the container to create and
+#     write to a host file in that directory
+#   • Exit cleanly with systemd-run --wait propagating status
+#
+# -----------------------------------------------------------------------------
+
+# Helpers
+# shellcheck source=test/units/test-control.sh
+. "$(dirname "$0")"/test-control.sh
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
+# Mount points and directories registered for teardown. Entries are appended
+# in creation order; file_write_cleanup() unmounts them in reverse order.
+CLEANUP_MOUNTS=()
+CLEANUP_PATHS=()
+
+# Common Config:
+TEST_NAME="TEST-07-PID1.transient-unit-container"
+OUTPUT_FILE="test-service-output"
+EXPECTED_OUTPUT="Test service is running"
+readonly TEST_NAME OUTPUT_FILE EXPECTED_OUTPUT
+
+# Host FS Directories
+# mktemp in dry-run mode (-u) only picks collision-resistant names here; the
+# directories themselves are created later by make_mounts().
+CONTAINER_ROOT_FS=$(mktemp -u -d --tmpdir "${TEST_NAME}-root-XXXX")
+HOST_OUT_DIR=$(mktemp -u -d --tmpdir test-dir-XXXX)
+readonly CONTAINER_ROOT_FS HOST_OUT_DIR
+
+# Container FS Directories
+CONTAINER_MOUNT_DIR="/${TEST_NAME}"
+readonly CONTAINER_MOUNT_DIR
+
+# Mount a dummy /proc FS which will not be passed to the container. It
+# circumvents a permissions error when attempting to mount a FS within the
+# container. This seems like a systemd bug.
+temporary_mount_hack() {
+    # IMPORTANT: This is modeled after a workaround in
+    # TEST-07-PID1.private-pids.sh with a key difference. In private-pids, it's
+    # explained there must be at least 1 unmasked procfs mount on the host in
+    # order for /proc/ to be mounted by an UNPRIVILEGED user within the container
+    # namespace. Note the host mount is not actually passed through to the
+    # container.
+    #
+    # The key difference here is that, here, systemd-run is NOT launched with
+    # --user, it is a PRIVILEGED environment and should not hit a permissions
+    # error when attempting to mount /proc. Unfortunately, that's exactly what
+    # happens if you launch the container without first mounting a dummy
+    # unmasked /proc on the host.
+    #
+    # It was pointed out to me that this may indicate a significant bug. A
+    # change masking the host's /proc could prevent the startup of privileged
+    # containers. If this were addressed, this function could be removed.
+
+    # Declare and assign separately so that a mktemp failure is not masked by
+    # the exit status of 'local' and still aborts the test under 'set -e'.
+    local helper_proc
+    helper_proc=$(mktemp -d --tmpdir helper-proc-XXXX)
+    CLEANUP_PATHS+=("$helper_proc")
+
+    mount -t proc proc "$helper_proc"
+    CLEANUP_MOUNTS+=("$helper_proc")
+}
+
+# Mount 1) a writable directory for output; 2) a dummy procfs as a workaround so
+# the container can mount /proc; 3) a tmpfs to serve as the container's root
+# FS; 4) the host's /usr directory read only.
+make_mounts() {
+    # Host bind mount for the output file. Systemd will make the container's version.
+    mkdir -p "$HOST_OUT_DIR"
+    CLEANUP_PATHS+=("$HOST_OUT_DIR")
+
+    temporary_mount_hack
+
+    # Container root tmpfs mount
+    mkdir -p "$CONTAINER_ROOT_FS"
+    CLEANUP_PATHS+=("$CONTAINER_ROOT_FS")
+
+    mount -t tmpfs tmpfs "$CONTAINER_ROOT_FS"
+    CLEANUP_MOUNTS+=("$CONTAINER_ROOT_FS")
+
+    # Container's /usr will be a read-only bind mount of the host's /usr. Tried
+    # using -p BindReadOnlyPaths=/usr instead of this, but that didn't work.
+    # Debugging that got hairy, so I'm going with this for now.
+    mkdir -p "${CONTAINER_ROOT_FS}/usr"
+
+    mount --bind /usr "${CONTAINER_ROOT_FS}/usr"
+    # Register the bind mount before remounting it read-only: if the remount
+    # fails, cleanup must still unmount it instead of running rm -rf through a
+    # writable view of the host's /usr. Registered after the root tmpfs, so the
+    # reverse-order teardown unmounts it first. Not added to CLEANUP_PATHS
+    # because the directory is removed together with the root.
+    CLEANUP_MOUNTS+=("${CONTAINER_ROOT_FS}/usr")
+    mount -o remount,bind,ro "${CONTAINER_ROOT_FS}/usr"
+}
+
+# Create a test-service unit file that will run via the container's systemd and
+# write the output file.
+config_container_service() {
+    local -r container_systemd_dir="${CONTAINER_ROOT_FS}/etc/systemd/system"
+    local -r guest_output="${CONTAINER_MOUNT_DIR}/${OUTPUT_FILE}"
+    local -r internal_test_service="${container_systemd_dir}/test-service.service"
+
+    mkdir -p "$container_systemd_dir"
+
+    # Generate a phony random machine-id for the container
+    uuidgen -r | tr -d '-' | tr '[:upper:]' '[:lower:]' > "${CONTAINER_ROOT_FS}/etc/machine-id"
+
+    # The heredoc delimiter is unquoted on purpose: $EXPECTED_OUTPUT and
+    # $guest_output are expanded now, so the unit file holds their values.
+    cat <<EOF >"$internal_test_service"
+[Unit]
+Description=Test Service for Internal Systemd
+After=basic.target
+
+[Service]
+Type=oneshot
+ExecStart=/bin/sh -c 'echo "$EXPECTED_OUTPUT" > "$guest_output"'
+ExecStartPost=/usr/bin/systemctl --no-block exit 0
+TimeoutStopSec=15s
+
+[Install]
+WantedBy=multi-user.target
+EOF
+    systemctl --root="$CONTAINER_ROOT_FS" enable test-service.service
+}
+
+# The testcase. Configs cleanup trap, makes mounts, configs internal service
+# unit, kicks off container as a transient unit, waits for it to finish and
+# checks output.
+testcase_transient_unit_container_file_write() {
+    # Cleanup on exit. Test cases seem to run in a subshell, and only a single
+    # testcase is expected in this file. So we tie cleanup to the lifetime of
+    # this subshell, not the global context, allowing for appending to
+    # CLEANUP_PATHS and CLEANUP_MOUNTS.
+    #
+    # Only EXIT runs the cleanup: under 'set -e' a failing command exits the
+    # subshell, and INT/TERM are turned into an exit below. Invoking the handler
+    # straight from ERR/INT/TERM would tear down the mounts, switch errexit off
+    # (the handler runs 'set +e') and then let the test carry on.
+    trap file_write_cleanup EXIT
+    trap 'exit 130' INT
+    trap 'exit 143' TERM
+
+    make_mounts
+
+    config_container_service
+
+    # Run the container as a transient unit and wait for it to finish
+    local -r bind_mount_arg="${HOST_OUT_DIR}:${CONTAINER_MOUNT_DIR}"
+    local -r service_unit_name="${TEST_NAME}.service"
+
+    SYSTEMD_LOG_LEVEL=debug SYSTEMD_LOG_TARGET=console \
+        systemd-run \
+        --unit "$service_unit_name" \
+        --wait \
+        -p RootDirectory="$CONTAINER_ROOT_FS" \
+        -p PrivatePIDs=yes \
+        -p PrivateUsersEx=full \
+        -p ProtectHostnameEx=private \
+        -p ProtectControlGroupsEx=private \
+        -p PrivateMounts=yes \
+        -p PrivateNetwork=yes \
+        -p PrivateDevices=yes \
+        -p PrivateIPC=yes \
+        -p BindLogSockets=no \
+        -p "Environment=container=transient-unit" \
+        -p "CapabilityBoundingSet=~CAP_SYS_TIME CAP_SYS_BOOT CAP_AUDIT_READ" \
+        -p Type=exec \
+        -p Delegate=true \
+        -p DelegateSubgroup=init.scope \
+        -p DelegateNamespaces=yes \
+        -p BindPaths="$bind_mount_arg" \
+        /usr/lib/systemd/systemd multi-user.target
+
+    # If our service ran, we should be able to read its output here
+    local -r host_output="${HOST_OUT_DIR}/${OUTPUT_FILE}"
+    assert_eq "$(cat "${host_output}")" "$EXPECTED_OUTPUT"
+}
+
+# Tear down everything registered in CLEANUP_MOUNTS and CLEANUP_PATHS. Runs at
+# most once per shell (guarded by CLEANUP_DONE) and is best-effort.
+CLEANUP_DONE=0
+file_write_cleanup() {
+    # Avoid re-running this function, e.g. if it is triggered more than once.
+    if (( CLEANUP_DONE )); then
+        return
+    fi
+    CLEANUP_DONE=1
+    set +e  # best effort: keep going when an individual step fails
+
+    # Remove all the mounts and directories we created. These variables reset
+    # to empty arrays when the subprocess concludes.
+    # Unmount in reverse registration order so nested mounts go first.
+    local i
+    for (( i = ${#CLEANUP_MOUNTS[@]} - 1; i >= 0; i-- )); do
+        umount "${CLEANUP_MOUNTS[i]}"
+    done
+
+    # Skip rm entirely when nothing was registered. --one-file-system keeps rm
+    # from descending into anything that is still mounted below these paths
+    # (e.g. the host's /usr) should an umount above have failed.
+    if (( ${#CLEANUP_PATHS[@]} > 0 )); then
+        rm -rf --one-file-system -- "${CLEANUP_PATHS[@]}"
+    fi
+}
+
+run_testcases