coredump: AccessContainer= bunch of followups (#34333)

Fixes #34130
This commit is contained in:
Luca Boccassi
2024-10-30 14:37:44 +00:00
committed by GitHub
6 changed files with 151 additions and 86 deletions

View File

@@ -110,14 +110,14 @@
</varlistentry>
<varlistentry>
<term><varname>AccessContainer=</varname></term>
<term><varname>EnterNamespace=</varname></term>
<listitem><para>Controls whether <command>systemd-coredump</command> will attempt to use the mount tree of
a process that crashed within a container. Access to the container's filesystem might be necessary to generate
a process that crashed in PID namespace. Access to the namespace's mount tree might be necessary to generate
a fully symbolized backtrace. If set to <literal>yes</literal>, then <command>systemd-coredump</command> will
obtain the mount tree from corresponding mount namespace and will try to generate the stack trace using the
binary and libraries from the mount namespace. Note that the coredump of the containerized process might
still be saved in <filename>/var/lib/systemd/coredump/</filename> even if <varname>AccessContainer=</varname>
binary and libraries from the mount namespace. Note that the coredump of the namespaced process might
still be saved in <filename>/var/lib/systemd/coredump/</filename> even if <varname>EnterNamespace=</varname>
is set to <literal>no</literal>. Defaults to <literal>no</literal>.</para>
<xi:include href="version-info.xml" xpointer="v257"/>

View File

@@ -4,6 +4,7 @@
#include "analyze.h"
#include "analyze-inspect-elf.h"
#include "chase.h"
#include "elf-util.h"
#include "errno-util.h"
#include "fd-util.h"
@@ -19,23 +20,13 @@ static int analyze_elf(char **filenames, sd_json_format_flags_t json_flags) {
STRV_FOREACH(filename, filenames) {
_cleanup_(sd_json_variant_unrefp) sd_json_variant *package_metadata = NULL;
_cleanup_(table_unrefp) Table *t = NULL;
_cleanup_free_ char *abspath = NULL, *path = NULL, *stacktrace = NULL;
_cleanup_free_ char *abspath = NULL, *stacktrace = NULL;
_cleanup_close_ int fd = -EBADF;
bool coredump = false;
r = path_make_absolute_cwd(*filename, &abspath);
if (r < 0)
return log_error_errno(r, "Could not make an absolute path out of \"%s\": %m", *filename);
path = path_join(empty_to_root(arg_root), abspath);
if (!path)
return log_oom();
path_simplify(path);
fd = RET_NERRNO(open(path, O_RDONLY|O_CLOEXEC));
fd = chase_and_open(*filename, arg_root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC, &abspath);
if (fd < 0)
return log_error_errno(fd, "Could not open \"%s\": %m", path);
return log_error_errno(fd, "Could not open \"%s\": %m", *filename);
r = parse_elf_object(fd, abspath, arg_root, /* fork_disable_dump= */false, &stacktrace, &package_metadata);
if (r < 0)
@@ -65,7 +56,7 @@ static int analyze_elf(char **filenames, sd_json_format_flags_t json_flags) {
* metadata is parsed recursively in core files, so there might be
* multiple modules. */
if (STR_IN_SET(module_name, "elfType", "elfArchitecture")) {
if (streq(module_name, "elfType") && streq("coredump", sd_json_variant_string(module_json)))
if (streq(module_name, "elfType") && streq(sd_json_variant_string(module_json), "coredump"))
coredump = true;
r = table_add_many(

View File

@@ -2,11 +2,15 @@
#include <errno.h>
#include <stdio.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/statvfs.h>
#include <sys/auxv.h>
#include <sys/xattr.h>
#include <unistd.h>
#if WANT_LINUX_FS_H
# include <linux/fs.h>
#endif
#include "sd-daemon.h"
#include "sd-journal.h"
@@ -86,6 +90,8 @@
* size. See DATA_SIZE_MAX in journal-importer.h. */
assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
#define MOUNT_TREE_ROOT "/run/systemd/mount-rootfs"
enum {
/* We use these as array indexes for our process metadata cache.
*
@@ -167,7 +173,9 @@ static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;
static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
static uint64_t arg_keep_free = UINT64_MAX;
static uint64_t arg_max_use = UINT64_MAX;
static bool arg_access_container = false;
#if HAVE_DWFL_SET_SYSROOT
static bool arg_enter_namespace = false;
#endif
static int parse_config(void) {
static const ConfigTableItem items[] = {
@@ -179,9 +187,9 @@ static int parse_config(void) {
{ "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free },
{ "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use },
#if HAVE_DWFL_SET_SYSROOT
{ "Coredump", "AccessContainer", config_parse_bool, 0, &arg_access_container },
{ "Coredump", "EnterNamespace", config_parse_bool, 0, &arg_enter_namespace },
#else
{ "Coredump", "AccessContainer", config_parse_warn_compat, DISABLED_CONFIGURATION, 0 },
{ "Coredump", "EnterNamespace", config_parse_warn_compat, DISABLED_CONFIGURATION, 0 },
#endif
{}
};
@@ -782,30 +790,31 @@ static int change_uid_gid(const Context *context) {
return drop_privileges(uid, gid, 0);
}
static int setup_container_mount_tree(int mount_tree_fd, char **container_root) {
_cleanup_free_ char *root = NULL;
static int attach_mount_tree(int mount_tree_fd) {
int r;
assert(mount_tree_fd >= 0);
assert(container_root);
r = unshare(CLONE_NEWNS);
r = detach_mount_namespace();
if (r < 0)
return log_warning_errno(errno, "Failed to unshare mount namespace: %m");
return log_warning_errno(r, "Failed to detach mount namespace: %m");
r = mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL);
r = mkdir_p_label(MOUNT_TREE_ROOT, 0555);
if (r < 0)
return log_warning_errno(errno, "Failed to disable mount propagation: %m");
return log_warning_errno(r, "Failed to create directory: %m");
r = mkdtemp_malloc("/tmp/systemd-coredump-root-XXXXXX", &root);
r = mount_setattr(mount_tree_fd, "", AT_EMPTY_PATH,
&(struct mount_attr) {
.attr_set = MOUNT_ATTR_RDONLY|MOUNT_ATTR_NOSUID|MOUNT_ATTR_NODEV|MOUNT_ATTR_NOEXEC,
.propagation = MS_SLAVE,
}, sizeof(struct mount_attr));
if (r < 0)
return log_warning_errno(r, "Failed to create temporary directory: %m");
return log_warning_errno(errno, "Failed to change properties of mount tree: %m");
r = move_mount(mount_tree_fd, "", -EBADF, root, MOVE_MOUNT_F_EMPTY_PATH);
r = move_mount(mount_tree_fd, "", -EBADF, MOUNT_TREE_ROOT, MOVE_MOUNT_F_EMPTY_PATH);
if (r < 0)
return log_warning_errno(errno, "Failed to move mount tree: %m");
return log_warning_errno(errno, "Failed to attach mount tree: %m");
*container_root = TAKE_PTR(root);
return 0;
}
@@ -817,10 +826,8 @@ static int submit_coredump(
_cleanup_(sd_json_variant_unrefp) sd_json_variant *json_metadata = NULL;
_cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
_cleanup_free_ char *filename = NULL, *coredump_data = NULL;
_cleanup_free_ char *stacktrace = NULL;
_cleanup_free_ char *root = NULL;
const char *module_name;
_cleanup_free_ char *filename = NULL, *coredump_data = NULL, *stacktrace = NULL;
const char *module_name, *root = NULL;
uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
bool truncated = false, written = false;
sd_json_variant *module_json;
@@ -856,11 +863,8 @@ static int submit_coredump(
(void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
}
if (mount_tree_fd >= 0 && arg_access_container) {
r = setup_container_mount_tree(mount_tree_fd, &root);
if (r < 0)
log_warning_errno(r, "Failed to setup container mount tree, ignoring: %m");
}
if (mount_tree_fd >= 0 && attach_mount_tree(mount_tree_fd) >= 0)
root = MOUNT_TREE_ROOT;
/* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the
* coredump memory under the user's uid. This also ensures that the credentials journald will see are
@@ -869,6 +873,7 @@ static int submit_coredump(
r = change_uid_gid(context);
if (r < 0)
return log_error_errno(r, "Failed to drop privileges: %m");
if (written) {
/* Try to get a stack trace if we can */
if (coredump_size > arg_process_size_max)
@@ -1087,11 +1092,34 @@ static int process_socket(int fd) {
goto finish;
}
/* The final zero-length datagram carries the file descriptor and tells us
/* The final zero-length datagram carries the file descriptors and tells us
* that we're done. */
if (n == 0) {
struct cmsghdr *found;
if (first) {
found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
if (found) {
/* This is the first message that carries file descriptors. Maybe
* there will be one more that actually contains array of two
* descriptors. */
assert(input_fd < 0);
input_fd = *CMSG_TYPED_DATA(found, int);
first = false;
continue;
}
/* This is zero length message but it either doesn't carry a single descriptor,
* or it has more than one. This is a protocol violation so let's bail out. */
cmsg_close_all(&mh);
r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
"Received zero length message with zero or more than one file descriptor(s).");
goto finish;
}
/* Second zero length message might carry two file descriptors, coredump fd and mount tree fd. */
found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int) * 2));
if (found) {
int fds[2] = EBADF_PAIR;
@@ -1099,28 +1127,30 @@ static int process_socket(int fd) {
memcpy(fds, CMSG_TYPED_DATA(found, int), sizeof(int) * 2);
assert(mount_tree_fd < 0);
assert(input_fd >= 0);
/* Maybe we already got coredump FD in previous iteration? */
/* Let's close input fd we got in the previous iteration. */
safe_close(input_fd);
input_fd = fds[0];
mount_tree_fd = fds[1];
/* We have all FDs we need let's take a shortcut here. */
/* We have all FDs we need so let's take a shortcut here. */
break;
} else {
found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
if (found)
input_fd = *CMSG_TYPED_DATA(found, int);
}
/* This is the first message that carries file descriptors, maybe there will be one more that actually contains array of descriptors. */
if (first) {
first = false;
continue;
}
/* This is second iteration and we didn't find array of two FDs. */
found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1);
cmsg_close_all(&mh);
if (!found)
/* Hence we either have no FDs which is OK and we can break. */
break;
/* Or we have some other number of FDs and somebody is playing games with us. */
r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Received unexpected file descriptors.");
goto finish;
break;
} else
cmsg_close_all(&mh);
@@ -1628,42 +1658,60 @@ static int forward_coredump_to_container(Context *context) {
return 0;
}
static int gather_pid_mount_tree_fd(const Context *context) {
_cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF;
_cleanup_close_pair_ int pair[2] = EBADF_PAIR;
int fd = -EBADF, r;
pid_t child;
assert(context);
static int gather_pid_mount_tree_fd(const Context *context, int *ret_fd) {
/* Don't bother preparing environment if we can't pass it to libdwfl. */
#if !HAVE_DWFL_SET_SYSROOT
return -EBADF;
#endif
*ret_fd = -EOPNOTSUPP;
log_debug("dwfl_set_sysroot() is not supported.");
#else
_cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, fd = -EBADF;
_cleanup_close_pair_ int pair[2] = EBADF_PAIR;
int r;
if (!arg_access_container)
return -EBADF;
assert(context);
assert(ret_fd);
if (!arg_enter_namespace) {
*ret_fd = -EHOSTDOWN;
log_debug("EnterNamespace=no so we won't use mount tree of the crashed process for generating backtrace.");
return 0;
}
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair) < 0)
return log_error_errno(errno, "Failed to create socket pair: %m");
r = namespace_open(context->pid, NULL, &mntns_fd, NULL, NULL, &root_fd);
r = namespace_open(context->pid,
/* ret_pidns_fd= */ NULL,
&mntns_fd,
/* ret_netns_fd= */ NULL,
/* ret_userns_fd= */ NULL,
&root_fd);
if (r < 0)
return log_error_errno(r, "Failed to open mount namespace of crashing process: %m");
r = namespace_fork("(sd-mount-tree-ns)", "(sd-mount-tree)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, -1, mntns_fd, -1, -1, root_fd, &child);
r = namespace_fork("(sd-mount-tree-ns)",
"(sd-mount-tree)",
/* except_fds= */ NULL,
/* n_except_fds= */ 0,
FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT,
/* pidns_fd= */ -EBADF,
mntns_fd,
/* netns_fd= */ -EBADF,
/* userns_fd= */ -EBADF,
root_fd,
NULL);
if (r < 0)
return log_error_errno(r, "Failed to fork(): %m");
return r;
if (r == 0) {
pair[0] = safe_close(pair[0]);
r = open_tree(-EBADF, "/", AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE);
if (r < 0) {
fd = open_tree(-EBADF, "/", AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE);
if (fd < 0) {
log_error_errno(errno, "Failed to clone mount tree: %m");
_exit(EXIT_FAILURE);
}
r = send_one_fd(pair[1], r, 0);
r = send_one_fd(pair[1], fd, 0);
if (r < 0) {
log_error_errno(r, "Failed to send mount tree to parent: %m");
_exit(EXIT_FAILURE);
@@ -1674,17 +1722,13 @@ static int gather_pid_mount_tree_fd(const Context *context) {
pair[1] = safe_close(pair[1]);
r = wait_for_terminate_and_check("(sd-mount-tree-ns)", child, 0);
if (r < 0)
return log_error_errno(r, "Failed to wait for child: %m");
if (r != EXIT_SUCCESS)
return log_error_errno(SYNTHETIC_ERRNO(ECHILD), "Child died abnormally.");
fd = receive_one_fd(pair[0], MSG_DONTWAIT);
if (fd < 0)
return log_error_errno(fd, "Failed to receive mount tree: %m");
return fd;
*ret_fd = TAKE_FD(fd);
#endif
return 0;
}
static int process_kernel(int argc, char* argv[]) {
@@ -1736,11 +1780,9 @@ static int process_kernel(int argc, char* argv[]) {
if (r >= 0)
return 0;
r = gather_pid_mount_tree_fd(&context);
if (r < 0 && r != -EBADF)
r = gather_pid_mount_tree_fd(&context, &mount_tree_fd);
if (r < 0)
log_warning_errno(r, "Failed to access the mount tree of a container, ignoring: %m");
else
mount_tree_fd = r;
}
/* If this is PID 1 disable coredump collection, we'll unlikely be able to process

View File

@@ -25,4 +25,4 @@
#JournalSizeMax=767M
#MaxUse=
#KeepFree=
#AccessContainer=no
#EnterNamespace=no

View File

@@ -36,6 +36,7 @@ test_append_files() {
instmods vmw_vsock_virtio_transport
instmods vsock_loopback
instmods vmw_vsock_vmci_transport
inst_binary gcc
generate_module_dependencies
}

View File

@@ -8,13 +8,15 @@ set -o pipefail
# Make sure the binary name fits into 15 characters
CORE_TEST_BIN="/tmp/test-dump"
CORE_STACKTRACE_TEST_BIN="/tmp/test-stacktrace-dump"
MAKE_STACKTRACE_DUMP="/tmp/make-stacktrace-dump"
CORE_TEST_UNPRIV_BIN="/tmp/test-usr-dump"
MAKE_DUMP_SCRIPT="/tmp/make-dump"
# Unset $PAGER so we don't have to use --no-pager everywhere
export PAGER=
at_exit() {
rm -fv -- "$CORE_TEST_BIN" "$CORE_TEST_UNPRIV_BIN" "$MAKE_DUMP_SCRIPT"
rm -fv -- "$CORE_TEST_BIN" "$CORE_TEST_UNPRIV_BIN" "$MAKE_DUMP_SCRIPT" "$MAKE_STACKTRACE_DUMP"
}
trap at_exit EXIT
@@ -225,3 +227,32 @@ systemd-run -t --property CoredumpFilter=default ls /tmp
(! coredumpctl dump --output=/dev/null --output=/dev/null "$CORE_TEST_BIN")
(! coredumpctl debug --debugger=/bin/false)
(! coredumpctl debug --debugger=/bin/true --debugger-arguments='"')
# Test for EnterNamespace= feature
if pkgconf --atleast-version 0.192 libdw ; then
# dwfl_set_sysroot() is supported only in libdw-0.192 or newer.
cat >"$MAKE_STACKTRACE_DUMP" <<END
#!/bin/bash
mount -t tmpfs tmpfs /tmp
gcc -xc -O0 -g -o $CORE_STACKTRACE_TEST_BIN - <<EOF
void baz(void) { int *x = 0; *x = 42; }
void bar(void) { baz(); }
void foo(void) { bar(); }
int main(void) { foo(); return 0;}
EOF
$CORE_STACKTRACE_TEST_BIN
END
chmod +x "$MAKE_STACKTRACE_DUMP"
mkdir -p /run/systemd/coredump.conf.d/
printf '[Coredump]\nEnterNamespace=no' >/run/systemd/coredump.conf.d/99-enter-namespace.conf
unshare --pid --fork --mount-proc --mount --uts --ipc --net /bin/bash -c "$MAKE_STACKTRACE_DUMP" || :
timeout 30 bash -c "until coredumpctl -1 info $CORE_STACKTRACE_TEST_BIN | grep -zvqE 'baz.*bar.*foo'; do sleep .2; done"
printf '[Coredump]\nEnterNamespace=yes' >/run/systemd/coredump.conf.d/99-enter-namespace.conf
unshare --pid --fork --mount-proc --mount --uts --ipc --net /bin/bash -c "$MAKE_STACKTRACE_DUMP" || :
timeout 30 bash -c "until coredumpctl -1 info $CORE_STACKTRACE_TEST_BIN | grep -zqE 'baz.*bar.*foo'; do sleep .2; done"
else
echo "libdw doesn't not support setting sysroot, skipping EnterNamespace= test"
fi