diff --git a/man/systemd-coredump.xml b/man/systemd-coredump.xml index 9972ba02e4..d499600dc1 100644 --- a/man/systemd-coredump.xml +++ b/man/systemd-coredump.xml @@ -229,6 +229,17 @@ COREDUMP_FILENAME=/var/lib/systemd/coredump/core.Web….552351.….zst + + COREDUMP_BY_PIDFD= + If the crashed process was analyzed using a PIDFD provided by the kernel (requires + kernel v6.16) then this field will be present and set to 1. If this field is + not set, then the crashed process was analyzed via a PID, which is known to be subject to race + conditions. + + + + + COREDUMP_TIMESTAMP= The time of the crash as reported by the kernel (in μs since the epoch). @@ -329,6 +340,18 @@ COREDUMP_FILENAME=/var/lib/systemd/coredump/core.Web….552351.….zst + + COREDUMP_DUMPABLE= + + The PR_GET_DUMPABLE field as reported by the kernel, see + prctl2. + + + + + + COREDUMP_OPEN_FDS= diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c index 07f4da2c7a..55a0b704e3 100644 --- a/src/coredump/coredump.c +++ b/src/coredump/coredump.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -87,7 +86,7 @@ assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX); #define MOUNT_TREE_ROOT "/run/systemd/mount-rootfs" -enum { +typedef enum { /* We use these as array indexes for our process metadata cache. * * The first indices of the cache stores the same metadata as the ones passed by the kernel via @@ -103,9 +102,11 @@ enum { _META_ARGV_REQUIRED, /* The fields below were added to kernel/core_pattern at later points, so they might be missing. */ META_ARGV_HOSTNAME = _META_ARGV_REQUIRED, /* %h: hostname */ - _META_ARGV_MAX, + META_ARGV_DUMPABLE, /* %d: as set by the kernel */ + META_ARGV_PIDFD, /* %F: pidfd of the process, since v6.16 */ /* If new fields are added, they should be added here, to maintain compatibility * with callers which don't know about the new fields. */ + _META_ARGV_MAX, /* The following indexes are cached for a couple of special fields we use (and * thereby need to be retrieved quickly) for naming coredump files, and attaching @@ -113,16 +114,15 @@ enum { * environment. */ META_COMM = _META_ARGV_MAX, - _META_MANDATORY_MAX, /* The rest are similar to the previous ones except that we won't fail if one of * them is missing in a message sent over the socket. */ - META_EXE = _META_MANDATORY_MAX, + META_EXE, META_UNIT, META_PROC_AUXV, _META_MAX -}; +} meta_argv_t; static const char * const meta_field_names[_META_MAX] = { [META_ARGV_PID] = "COREDUMP_PID=", @@ -132,6 +132,8 @@ static const char * const meta_field_names[_META_MAX] = { [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=", [META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=", [META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=", + [META_ARGV_DUMPABLE] = "COREDUMP_DUMPABLE=", + [META_ARGV_PIDFD] = "COREDUMP_BY_PIDFD=", [META_COMM] = "COREDUMP_COMM=", [META_EXE] = "COREDUMP_EXE=", [META_UNIT] = "COREDUMP_UNIT=", @@ -142,10 +144,12 @@ typedef struct Context { PidRef pidref; uid_t uid; gid_t gid; + unsigned dumpable; int signo; uint64_t rlimit; bool is_pid1; bool is_journald; + bool got_pidfd; int mount_tree_fd; /* These point into external memory, are not owned by this object */ @@ -438,14 +442,16 @@ static int grant_user_access(int core_fd, const Context *context) { if (r < 0) return r; - /* We allow access if we got all the data and at_secure is not set and - * the uid/gid matches euid/egid. */ + /* We allow access if %d/dumpable on the command line was exactly 1, we got all the data, + * at_secure is not set, and the uid/gid match euid/egid. */ bool ret = + context->dumpable == SUID_DUMP_USER && at_secure == 0 && uid != UID_INVALID && euid != UID_INVALID && uid == euid && gid != GID_INVALID && egid != GID_INVALID && gid == egid; - log_debug("Will %s access (uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)", + log_debug("Will %s access (dumpable=%u uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)", ret ? "permit" : "restrict", + context->dumpable, uid, euid, gid, egid, yes_no(at_secure)); return ret; } @@ -1027,7 +1033,6 @@ static int context_parse_iovw(Context *context, struct iovec_wrapper *iovw) { assert(context); assert(iovw); - assert(iovw->count >= _META_ARGV_MAX); /* Converts the data in the iovec array iovw into separate fields. Fills in context->meta[] (for * which no memory is allocated, it just contains direct pointers into the iovec array memory). */ @@ -1088,6 +1093,16 @@ static int context_parse_iovw(Context *context, struct iovec_wrapper *iovw) { if (r < 0) log_warning_errno(r, "Failed to parse resource limit \"%s\", ignoring: %m", context->meta[META_ARGV_RLIMIT]); + /* The value is set to contents of /proc/sys/fs/suid_dumpable, which we set to SUID_DUMP_SAFE (2), + * if the process is marked as not dumpable, see PR_SET_DUMPABLE(2const). */ + if (context->meta[META_ARGV_DUMPABLE]) { + r = safe_atou(context->meta[META_ARGV_DUMPABLE], &context->dumpable); + if (r < 0) + return log_error_errno(r, "Failed to parse dumpable field \"%s\": %m", context->meta[META_ARGV_DUMPABLE]); + if (context->dumpable > SUID_DUMP_SAFE) + log_notice("Got unexpected %%d/dumpable value %u.", context->dumpable); + } + unit = context->meta[META_UNIT]; context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE); context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE); @@ -1228,10 +1243,24 @@ static int process_socket(int fd) { if (r < 0) return r; - /* Make sure we received at least all fields we need. */ - for (int i = 0; i < _META_MANDATORY_MAX; i++) + /* Make sure we received all the expected fields. We support being called by an *older* + * systemd-coredump from the outside, so we require only the basic set of fields that + * was being sent when the support for sending to containers over a socket was added + * in a108c43e36d3ceb6e34efe37c014fc2cda856000. */ + meta_argv_t i; + FOREACH_ARGUMENT(i, + META_ARGV_PID, + META_ARGV_UID, + META_ARGV_GID, + META_ARGV_SIGNAL, + META_ARGV_TIMESTAMP, + META_ARGV_RLIMIT, + META_ARGV_HOSTNAME, + META_COMM) if (!context.meta[i]) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "A mandatory argument (%i) has not been sent, aborting.", i); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Mandatory argument %s not received on socket, aborting.", + meta_field_names[i]); return submit_coredump(&context, &iovw, input_fd); } @@ -1318,7 +1347,8 @@ static int gather_pid_metadata_from_argv( Context *context, int argc, char **argv) { - int r; + _cleanup_(pidref_done) PidRef local_pidref = PIDREF_NULL; + int r, kernel_fd = -EBADF; assert(iovw); assert(context); @@ -1350,6 +1380,49 @@ static int gather_pid_metadata_from_argv( t = buf; } + if (i == META_ARGV_PID) { + /* Store this so that we can check whether the core will be forwarded to a container + * even when the kernel doesn't provide a pidfd. Can be dropped once baseline is + * >= v6.16. */ + r = pidref_set_pidstr(&local_pidref, t); + if (r < 0) + return log_error_errno(r, "Failed to initialize pidref from pid %s: %m", t); + } + + if (i == META_ARGV_PIDFD) { + /* If the current kernel doesn't support the %F specifier (which resolves to a + * pidfd), but we included it in the core_pattern expression, we'll receive an empty + * string here. Deal with that gracefully. */ + if (isempty(t)) + continue; + + assert(!pidref_is_set(&context->pidref)); + assert(kernel_fd < 0); + + kernel_fd = parse_fd(t); + if (kernel_fd < 0) + return log_error_errno(kernel_fd, "Failed to parse pidfd \"%s\": %m", t); + + r = pidref_set_pidfd(&context->pidref, kernel_fd); + if (r < 0) + return log_error_errno(r, "Failed to initialize pidref from pidfd %d: %m", kernel_fd); + + context->got_pidfd = 1; + + /* If there are containers involved with different versions of the code they might + * not be using pidfds, so it would be wrong to set the metadata, skip it. */ + r = pidref_in_same_namespace(/* pid1 = */ NULL, &context->pidref, NAMESPACE_PID); + if (r < 0) + log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m"); + if (r <= 0) + continue; + + /* We don't print the fd number in the journal as it's meaningless, but we still + * record that the parsing was done with a kernel-provided fd as it means it's safe + * from races, which is valuable information to provide in the journal record. */ + t = "1"; + } + r = iovw_put_string_field(iovw, meta_field_names[i], t); if (r < 0) return r; @@ -1357,7 +1430,19 @@ static int gather_pid_metadata_from_argv( /* Cache some of the process metadata we collected so far and that we'll need to * access soon. */ - return context_parse_iovw(context, iovw); + r = context_parse_iovw(context, iovw); + if (r < 0) + return r; + + /* If the kernel didn't give us a PIDFD, then use the one derived from the + * PID immediately, given we have it. */ + if (!pidref_is_set(&context->pidref)) + context->pidref = TAKE_PIDREF(local_pidref); + + /* Close the kernel-provided FD as the last thing after everything else succeeded. */ + kernel_fd = safe_close(kernel_fd); + + return 0; } static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) { @@ -1536,13 +1621,21 @@ static int receive_ucred(int transport_fd, struct ucred *ret_ucred) { return 0; } -static int can_forward_coredump(const PidRef *pid) { +static int can_forward_coredump(Context *context, const PidRef *pid) { _cleanup_free_ char *cgroup = NULL, *path = NULL, *unit = NULL; int r; + assert(context); assert(pidref_is_set(pid)); assert(!pidref_is_remote(pid)); + /* We need to avoid a situation where the attacker crashes a SUID process or a root daemon and + * quickly replaces it with a namespaced process and we forward the coredump to the attacker, into + * the namespace. With %F/pidfd we can reliably check the namespace of the original process, hence we + * can allow forwarding. */ + if (!context->got_pidfd && context->dumpable != SUID_DUMP_USER) + return false; + r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); if (r < 0) return r; @@ -1587,7 +1680,7 @@ static int forward_coredump_to_container(Context *context) { if (r < 0) return log_debug_errno(r, "Failed to get namespace leader: %m"); - r = can_forward_coredump(&leader_pid); + r = can_forward_coredump(context, &leader_pid); if (r < 0) return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m"); if (r == 0) @@ -1930,7 +2023,7 @@ static int run(int argc, char *argv[]) { log_set_target_and_open(LOG_TARGET_KMSG); /* Make sure we never enter a loop */ - (void) prctl(PR_SET_DUMPABLE, 0); + (void) set_dumpable(SUID_DUMP_DISABLE); /* Ignore all parse errors */ (void) parse_config(); diff --git a/src/shared/coredump-util.c b/src/shared/coredump-util.c index 9d18cf1301..37dfb2c91a 100644 --- a/src/shared/coredump-util.c +++ b/src/shared/coredump-util.c @@ -1,9 +1,11 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #include +#include #include "alloc-util.h" #include "coredump-util.h" +#include "errno-util.h" #include "extract-word.h" #include "fileio.h" #include "log.h" @@ -14,6 +16,11 @@ #include "unaligned.h" #include "virt.h" +int set_dumpable(SuidDumpMode mode) { + /* Cast mode explicitly to long, because prctl wants longs but is varargs. */ + return RET_NERRNO(prctl(PR_SET_DUMPABLE, (long) mode)); +} + static const char *const coredump_filter_table[_COREDUMP_FILTER_MAX] = { [COREDUMP_FILTER_PRIVATE_ANONYMOUS] = "private-anonymous", [COREDUMP_FILTER_SHARED_ANONYMOUS] = "shared-anonymous", diff --git a/src/shared/coredump-util.h b/src/shared/coredump-util.h index cbf5678ed2..a39c9286eb 100644 --- a/src/shared/coredump-util.h +++ b/src/shared/coredump-util.h @@ -25,6 +25,15 @@ typedef enum CoredumpFilter { /* The kernel doesn't like UINT64_MAX and returns ERANGE, use UINT32_MAX to support future new flags */ #define COREDUMP_FILTER_MASK_ALL UINT32_MAX +typedef enum SuidDumpMode { + SUID_DUMP_DISABLE = 0, /* PR_SET_DUMPABLE(2const) */ + SUID_DUMP_USER = 1, /* PR_SET_DUMPABLE(2const) */ + SUID_DUMP_SAFE = 2, /* https://www.kernel.org/doc/html/latest/admin-guide/sysctl/fs.html#suid-dumpable */ + _SUID_DUMP_MODE_MAX, +} SuidDumpMode; + +int set_dumpable(SuidDumpMode mode); + const char* coredump_filter_to_string(CoredumpFilter i) _const_; CoredumpFilter coredump_filter_from_string(const char *s) _pure_; int coredump_filter_mask_from_string(const char *s, uint64_t *ret); diff --git a/src/shared/elf-util.c b/src/shared/elf-util.c index 73a910575d..a21167980d 100644 --- a/src/shared/elf-util.c +++ b/src/shared/elf-util.c @@ -6,12 +6,12 @@ #include #include #endif -#include #include #include "sd-json.h" #include "alloc-util.h" +#include "coredump-util.h" #include "dlfcn-util.h" #include "elf-util.h" #include "errno-util.h" @@ -829,7 +829,7 @@ int parse_elf_object(int fd, const char *executable, const char *root, bool fork if (r == 0) { /* We want to avoid loops, given this can be called from systemd-coredump */ if (fork_disable_dump) { - r = RET_NERRNO(prctl(PR_SET_DUMPABLE, 0)); + r = set_dumpable(SUID_DUMP_DISABLE); if (r < 0) report_errno_and_exit(error_pipe[1], r); } diff --git a/src/shared/tests.c b/src/shared/tests.c index bc80d153ac..5820c0d13f 100644 --- a/src/shared/tests.c +++ b/src/shared/tests.c @@ -17,6 +17,7 @@ #include "bus-wait-for-jobs.h" #include "cgroup-setup.h" #include "cgroup-util.h" +#include "coredump-util.h" #include "env-file.h" #include "env-util.h" #include "errno-util.h" @@ -436,7 +437,8 @@ int assert_signal_internal(void) { if (r == 0) { /* Speed things up by never even attempting to generate a coredump */ - (void) prctl(PR_SET_DUMPABLE, 0); + (void) set_dumpable(SUID_DUMP_DISABLE); + /* But still set an rlimit just in case */ (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(0)); return 0; diff --git a/sysctl.d/50-coredump.conf.in b/sysctl.d/50-coredump.conf.in index 90c080bdfe..fe8f7670b0 100644 --- a/sysctl.d/50-coredump.conf.in +++ b/sysctl.d/50-coredump.conf.in @@ -13,7 +13,7 @@ # the core dump. # # See systemd-coredump(8) and core(5). -kernel.core_pattern=|{{LIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h +kernel.core_pattern=|{{LIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h %d %F # Allow 16 coredumps to be dispatched in parallel by the kernel. # We collect metadata from /proc/%P/, and thus need to make sure the crashed diff --git a/test/units/TEST-87-AUX-UTILS-VM.coredump.sh b/test/units/TEST-87-AUX-UTILS-VM.coredump.sh index 6ad7e29e28..ce4a2c9ed1 100755 --- a/test/units/TEST-87-AUX-UTILS-VM.coredump.sh +++ b/test/units/TEST-87-AUX-UTILS-VM.coredump.sh @@ -198,12 +198,17 @@ journalctl -b -n 1 --output=export --output-fields=MESSAGE,COREDUMP COREDUMP_EXE /usr/lib/systemd/systemd-coredump --backtrace $$ 0 0 6 1679509900 12345 journalctl -b -n 1 --output=export --output-fields=MESSAGE,COREDUMP COREDUMP_EXE="/usr/bin/test-dump" | /usr/lib/systemd/systemd-coredump --backtrace $$ 0 0 6 1679509901 12345 mymachine +journalctl -b -n 1 --output=export --output-fields=MESSAGE,COREDUMP COREDUMP_EXE="/usr/bin/test-dump" | + /usr/lib/systemd/systemd-coredump --backtrace $$ 0 0 6 1679509902 12345 youmachine 1 # Wait a bit for the coredumps to get processed timeout 30 bash -c "while [[ \$(coredumpctl list -q --no-legend $$ | wc -l) -lt 2 ]]; do sleep 1; done" coredumpctl info $$ coredumpctl info COREDUMP_TIMESTAMP=1679509900000000 coredumpctl info COREDUMP_TIMESTAMP=1679509901000000 coredumpctl info COREDUMP_HOSTNAME="mymachine" +coredumpctl info COREDUMP_TIMESTAMP=1679509902000000 +coredumpctl info COREDUMP_HOSTNAME="youmachine" +coredumpctl info COREDUMP_DUMPABLE="1" # This used to cause a stack overflow systemd-run -t --property CoredumpFilter=all ls /tmp