diff --git a/man/systemd-coredump.xml b/man/systemd-coredump.xml
index 9972ba02e4..d499600dc1 100644
--- a/man/systemd-coredump.xml
+++ b/man/systemd-coredump.xml
@@ -229,6 +229,17 @@ COREDUMP_FILENAME=/var/lib/systemd/coredump/core.Web….552351.….zst
+
+ COREDUMP_BY_PIDFD=
+ If the crashed process was analyzed using a PIDFD provided by the kernel (requires
+ kernel v6.16) then this field will be present and set to 1. If this field is
+ not set, then the crashed process was analyzed via a PID, which is known to be subject to race
+ conditions.
+
+
+
+
+
COREDUMP_TIMESTAMP=
The time of the crash as reported by the kernel (in μs since the epoch).
@@ -329,6 +340,18 @@ COREDUMP_FILENAME=/var/lib/systemd/coredump/core.Web….552351.….zst
+
+ COREDUMP_DUMPABLE=
+
+ The PR_GET_DUMPABLE field as reported by the kernel, see
+ prctl2.
+
+
+
+
+
+
COREDUMP_OPEN_FDS=
diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c
index 07f4da2c7a..55a0b704e3 100644
--- a/src/coredump/coredump.c
+++ b/src/coredump/coredump.c
@@ -3,7 +3,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -87,7 +86,7 @@ assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
#define MOUNT_TREE_ROOT "/run/systemd/mount-rootfs"
-enum {
+typedef enum {
/* We use these as array indexes for our process metadata cache.
*
* The first indices of the cache stores the same metadata as the ones passed by the kernel via
@@ -103,9 +102,11 @@ enum {
_META_ARGV_REQUIRED,
/* The fields below were added to kernel/core_pattern at later points, so they might be missing. */
META_ARGV_HOSTNAME = _META_ARGV_REQUIRED, /* %h: hostname */
- _META_ARGV_MAX,
+ META_ARGV_DUMPABLE, /* %d: as set by the kernel */
+ META_ARGV_PIDFD, /* %F: pidfd of the process, since v6.16 */
/* If new fields are added, they should be added here, to maintain compatibility
* with callers which don't know about the new fields. */
+ _META_ARGV_MAX,
/* The following indexes are cached for a couple of special fields we use (and
* thereby need to be retrieved quickly) for naming coredump files, and attaching
@@ -113,16 +114,15 @@ enum {
* environment. */
META_COMM = _META_ARGV_MAX,
- _META_MANDATORY_MAX,
/* The rest are similar to the previous ones except that we won't fail if one of
* them is missing in a message sent over the socket. */
- META_EXE = _META_MANDATORY_MAX,
+ META_EXE,
META_UNIT,
META_PROC_AUXV,
_META_MAX
-};
+} meta_argv_t;
static const char * const meta_field_names[_META_MAX] = {
[META_ARGV_PID] = "COREDUMP_PID=",
@@ -132,6 +132,8 @@ static const char * const meta_field_names[_META_MAX] = {
[META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=",
[META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=",
[META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=",
+ [META_ARGV_DUMPABLE] = "COREDUMP_DUMPABLE=",
+ [META_ARGV_PIDFD] = "COREDUMP_BY_PIDFD=",
[META_COMM] = "COREDUMP_COMM=",
[META_EXE] = "COREDUMP_EXE=",
[META_UNIT] = "COREDUMP_UNIT=",
@@ -142,10 +144,12 @@ typedef struct Context {
PidRef pidref;
uid_t uid;
gid_t gid;
+ unsigned dumpable;
int signo;
uint64_t rlimit;
bool is_pid1;
bool is_journald;
+ bool got_pidfd;
int mount_tree_fd;
/* These point into external memory, are not owned by this object */
@@ -438,14 +442,16 @@ static int grant_user_access(int core_fd, const Context *context) {
if (r < 0)
return r;
- /* We allow access if we got all the data and at_secure is not set and
- * the uid/gid matches euid/egid. */
+ /* We allow access if %d/dumpable on the command line was exactly 1, we got all the data,
+ * at_secure is not set, and the uid/gid match euid/egid. */
bool ret =
+ context->dumpable == SUID_DUMP_USER &&
at_secure == 0 &&
uid != UID_INVALID && euid != UID_INVALID && uid == euid &&
gid != GID_INVALID && egid != GID_INVALID && gid == egid;
- log_debug("Will %s access (uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
+ log_debug("Will %s access (dumpable=%u uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
ret ? "permit" : "restrict",
+ context->dumpable,
uid, euid, gid, egid, yes_no(at_secure));
return ret;
}
@@ -1027,7 +1033,6 @@ static int context_parse_iovw(Context *context, struct iovec_wrapper *iovw) {
assert(context);
assert(iovw);
- assert(iovw->count >= _META_ARGV_MAX);
/* Converts the data in the iovec array iovw into separate fields. Fills in context->meta[] (for
* which no memory is allocated, it just contains direct pointers into the iovec array memory). */
@@ -1088,6 +1093,16 @@ static int context_parse_iovw(Context *context, struct iovec_wrapper *iovw) {
if (r < 0)
log_warning_errno(r, "Failed to parse resource limit \"%s\", ignoring: %m", context->meta[META_ARGV_RLIMIT]);
+ /* The value is set to contents of /proc/sys/fs/suid_dumpable, which we set to SUID_DUMP_SAFE (2),
+ * if the process is marked as not dumpable, see PR_SET_DUMPABLE(2const). */
+ if (context->meta[META_ARGV_DUMPABLE]) {
+ r = safe_atou(context->meta[META_ARGV_DUMPABLE], &context->dumpable);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse dumpable field \"%s\": %m", context->meta[META_ARGV_DUMPABLE]);
+ if (context->dumpable > SUID_DUMP_SAFE)
+ log_notice("Got unexpected %%d/dumpable value %u.", context->dumpable);
+ }
+
unit = context->meta[META_UNIT];
context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
@@ -1228,10 +1243,24 @@ static int process_socket(int fd) {
if (r < 0)
return r;
- /* Make sure we received at least all fields we need. */
- for (int i = 0; i < _META_MANDATORY_MAX; i++)
+ /* Make sure we received all the expected fields. We support being called by an *older*
+ * systemd-coredump from the outside, so we require only the basic set of fields that
+ * was being sent when the support for sending to containers over a socket was added
+ * in a108c43e36d3ceb6e34efe37c014fc2cda856000. */
+ meta_argv_t i;
+ FOREACH_ARGUMENT(i,
+ META_ARGV_PID,
+ META_ARGV_UID,
+ META_ARGV_GID,
+ META_ARGV_SIGNAL,
+ META_ARGV_TIMESTAMP,
+ META_ARGV_RLIMIT,
+ META_ARGV_HOSTNAME,
+ META_COMM)
if (!context.meta[i])
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "A mandatory argument (%i) has not been sent, aborting.", i);
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Mandatory argument %s not received on socket, aborting.",
+ meta_field_names[i]);
return submit_coredump(&context, &iovw, input_fd);
}
@@ -1318,7 +1347,8 @@ static int gather_pid_metadata_from_argv(
Context *context,
int argc, char **argv) {
- int r;
+ _cleanup_(pidref_done) PidRef local_pidref = PIDREF_NULL;
+ int r, kernel_fd = -EBADF;
assert(iovw);
assert(context);
@@ -1350,6 +1380,49 @@ static int gather_pid_metadata_from_argv(
t = buf;
}
+ if (i == META_ARGV_PID) {
+ /* Store this so that we can check whether the core will be forwarded to a container
+ * even when the kernel doesn't provide a pidfd. Can be dropped once baseline is
+ * >= v6.16. */
+ r = pidref_set_pidstr(&local_pidref, t);
+ if (r < 0)
+ return log_error_errno(r, "Failed to initialize pidref from pid %s: %m", t);
+ }
+
+ if (i == META_ARGV_PIDFD) {
+ /* If the current kernel doesn't support the %F specifier (which resolves to a
+ * pidfd), but we included it in the core_pattern expression, we'll receive an empty
+ * string here. Deal with that gracefully. */
+ if (isempty(t))
+ continue;
+
+ assert(!pidref_is_set(&context->pidref));
+ assert(kernel_fd < 0);
+
+ kernel_fd = parse_fd(t);
+ if (kernel_fd < 0)
+ return log_error_errno(kernel_fd, "Failed to parse pidfd \"%s\": %m", t);
+
+ r = pidref_set_pidfd(&context->pidref, kernel_fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to initialize pidref from pidfd %d: %m", kernel_fd);
+
+ context->got_pidfd = 1;
+
+ /* If there are containers involved with different versions of the code they might
+ * not be using pidfds, so it would be wrong to set the metadata, skip it. */
+ r = pidref_in_same_namespace(/* pid1 = */ NULL, &context->pidref, NAMESPACE_PID);
+ if (r < 0)
+ log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
+ if (r <= 0)
+ continue;
+
+ /* We don't print the fd number in the journal as it's meaningless, but we still
+ * record that the parsing was done with a kernel-provided fd as it means it's safe
+ * from races, which is valuable information to provide in the journal record. */
+ t = "1";
+ }
+
r = iovw_put_string_field(iovw, meta_field_names[i], t);
if (r < 0)
return r;
@@ -1357,7 +1430,19 @@ static int gather_pid_metadata_from_argv(
/* Cache some of the process metadata we collected so far and that we'll need to
* access soon. */
- return context_parse_iovw(context, iovw);
+ r = context_parse_iovw(context, iovw);
+ if (r < 0)
+ return r;
+
+ /* If the kernel didn't give us a PIDFD, then use the one derived from the
+ * PID immediately, given we have it. */
+ if (!pidref_is_set(&context->pidref))
+ context->pidref = TAKE_PIDREF(local_pidref);
+
+ /* Close the kernel-provided FD as the last thing after everything else succeeded. */
+ kernel_fd = safe_close(kernel_fd);
+
+ return 0;
}
static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) {
@@ -1536,13 +1621,21 @@ static int receive_ucred(int transport_fd, struct ucred *ret_ucred) {
return 0;
}
-static int can_forward_coredump(const PidRef *pid) {
+static int can_forward_coredump(Context *context, const PidRef *pid) {
_cleanup_free_ char *cgroup = NULL, *path = NULL, *unit = NULL;
int r;
+ assert(context);
assert(pidref_is_set(pid));
assert(!pidref_is_remote(pid));
+ /* We need to avoid a situation where the attacker crashes a SUID process or a root daemon and
+ * quickly replaces it with a namespaced process and we forward the coredump to the attacker, into
+ * the namespace. With %F/pidfd we can reliably check the namespace of the original process, hence we
+ * can allow forwarding. */
+ if (!context->got_pidfd && context->dumpable != SUID_DUMP_USER)
+ return false;
+
r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
if (r < 0)
return r;
@@ -1587,7 +1680,7 @@ static int forward_coredump_to_container(Context *context) {
if (r < 0)
return log_debug_errno(r, "Failed to get namespace leader: %m");
- r = can_forward_coredump(&leader_pid);
+ r = can_forward_coredump(context, &leader_pid);
if (r < 0)
return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m");
if (r == 0)
@@ -1930,7 +2023,7 @@ static int run(int argc, char *argv[]) {
log_set_target_and_open(LOG_TARGET_KMSG);
/* Make sure we never enter a loop */
- (void) prctl(PR_SET_DUMPABLE, 0);
+ (void) set_dumpable(SUID_DUMP_DISABLE);
/* Ignore all parse errors */
(void) parse_config();
diff --git a/src/shared/coredump-util.c b/src/shared/coredump-util.c
index 9d18cf1301..37dfb2c91a 100644
--- a/src/shared/coredump-util.c
+++ b/src/shared/coredump-util.c
@@ -1,9 +1,11 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include
+#include
#include "alloc-util.h"
#include "coredump-util.h"
+#include "errno-util.h"
#include "extract-word.h"
#include "fileio.h"
#include "log.h"
@@ -14,6 +16,11 @@
#include "unaligned.h"
#include "virt.h"
+int set_dumpable(SuidDumpMode mode) {
+ /* Cast mode explicitly to long, because prctl wants longs but is varargs. */
+ return RET_NERRNO(prctl(PR_SET_DUMPABLE, (long) mode));
+}
+
static const char *const coredump_filter_table[_COREDUMP_FILTER_MAX] = {
[COREDUMP_FILTER_PRIVATE_ANONYMOUS] = "private-anonymous",
[COREDUMP_FILTER_SHARED_ANONYMOUS] = "shared-anonymous",
diff --git a/src/shared/coredump-util.h b/src/shared/coredump-util.h
index cbf5678ed2..a39c9286eb 100644
--- a/src/shared/coredump-util.h
+++ b/src/shared/coredump-util.h
@@ -25,6 +25,15 @@ typedef enum CoredumpFilter {
/* The kernel doesn't like UINT64_MAX and returns ERANGE, use UINT32_MAX to support future new flags */
#define COREDUMP_FILTER_MASK_ALL UINT32_MAX
+typedef enum SuidDumpMode {
+ SUID_DUMP_DISABLE = 0, /* PR_SET_DUMPABLE(2const) */
+ SUID_DUMP_USER = 1, /* PR_SET_DUMPABLE(2const) */
+ SUID_DUMP_SAFE = 2, /* https://www.kernel.org/doc/html/latest/admin-guide/sysctl/fs.html#suid-dumpable */
+ _SUID_DUMP_MODE_MAX,
+} SuidDumpMode;
+
+int set_dumpable(SuidDumpMode mode);
+
const char* coredump_filter_to_string(CoredumpFilter i) _const_;
CoredumpFilter coredump_filter_from_string(const char *s) _pure_;
int coredump_filter_mask_from_string(const char *s, uint64_t *ret);
diff --git a/src/shared/elf-util.c b/src/shared/elf-util.c
index 73a910575d..a21167980d 100644
--- a/src/shared/elf-util.c
+++ b/src/shared/elf-util.c
@@ -6,12 +6,12 @@
#include
#include
#endif
-#include
#include
#include "sd-json.h"
#include "alloc-util.h"
+#include "coredump-util.h"
#include "dlfcn-util.h"
#include "elf-util.h"
#include "errno-util.h"
@@ -829,7 +829,7 @@ int parse_elf_object(int fd, const char *executable, const char *root, bool fork
if (r == 0) {
/* We want to avoid loops, given this can be called from systemd-coredump */
if (fork_disable_dump) {
- r = RET_NERRNO(prctl(PR_SET_DUMPABLE, 0));
+ r = set_dumpable(SUID_DUMP_DISABLE);
if (r < 0)
report_errno_and_exit(error_pipe[1], r);
}
diff --git a/src/shared/tests.c b/src/shared/tests.c
index bc80d153ac..5820c0d13f 100644
--- a/src/shared/tests.c
+++ b/src/shared/tests.c
@@ -17,6 +17,7 @@
#include "bus-wait-for-jobs.h"
#include "cgroup-setup.h"
#include "cgroup-util.h"
+#include "coredump-util.h"
#include "env-file.h"
#include "env-util.h"
#include "errno-util.h"
@@ -436,7 +437,8 @@ int assert_signal_internal(void) {
if (r == 0) {
/* Speed things up by never even attempting to generate a coredump */
- (void) prctl(PR_SET_DUMPABLE, 0);
+ (void) set_dumpable(SUID_DUMP_DISABLE);
+
/* But still set an rlimit just in case */
(void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(0));
return 0;
diff --git a/sysctl.d/50-coredump.conf.in b/sysctl.d/50-coredump.conf.in
index 90c080bdfe..fe8f7670b0 100644
--- a/sysctl.d/50-coredump.conf.in
+++ b/sysctl.d/50-coredump.conf.in
@@ -13,7 +13,7 @@
# the core dump.
#
# See systemd-coredump(8) and core(5).
-kernel.core_pattern=|{{LIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h
+kernel.core_pattern=|{{LIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h %d %F
# Allow 16 coredumps to be dispatched in parallel by the kernel.
# We collect metadata from /proc/%P/, and thus need to make sure the crashed
diff --git a/test/units/TEST-87-AUX-UTILS-VM.coredump.sh b/test/units/TEST-87-AUX-UTILS-VM.coredump.sh
index 6ad7e29e28..ce4a2c9ed1 100755
--- a/test/units/TEST-87-AUX-UTILS-VM.coredump.sh
+++ b/test/units/TEST-87-AUX-UTILS-VM.coredump.sh
@@ -198,12 +198,17 @@ journalctl -b -n 1 --output=export --output-fields=MESSAGE,COREDUMP COREDUMP_EXE
/usr/lib/systemd/systemd-coredump --backtrace $$ 0 0 6 1679509900 12345
journalctl -b -n 1 --output=export --output-fields=MESSAGE,COREDUMP COREDUMP_EXE="/usr/bin/test-dump" |
/usr/lib/systemd/systemd-coredump --backtrace $$ 0 0 6 1679509901 12345 mymachine
+journalctl -b -n 1 --output=export --output-fields=MESSAGE,COREDUMP COREDUMP_EXE="/usr/bin/test-dump" |
+ /usr/lib/systemd/systemd-coredump --backtrace $$ 0 0 6 1679509902 12345 youmachine 1
# Wait a bit for the coredumps to get processed
timeout 30 bash -c "while [[ \$(coredumpctl list -q --no-legend $$ | wc -l) -lt 2 ]]; do sleep 1; done"
coredumpctl info $$
coredumpctl info COREDUMP_TIMESTAMP=1679509900000000
coredumpctl info COREDUMP_TIMESTAMP=1679509901000000
coredumpctl info COREDUMP_HOSTNAME="mymachine"
+coredumpctl info COREDUMP_TIMESTAMP=1679509902000000
+coredumpctl info COREDUMP_HOSTNAME="youmachine"
+coredumpctl info COREDUMP_DUMPABLE="1"
# This used to cause a stack overflow
systemd-run -t --property CoredumpFilter=all ls /tmp