From cbe83389d56bfcf0e7ba4b63312a64755fe27e9c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 18 Mar 2019 20:21:11 +0100 Subject: [PATCH 1/9] core: rearrange cgroup empty events a bit So far the priorities for cgroup empty event handling were pretty weird. The raw events (on cgroupsv2 from inotify, on cgroupsv1 from the agent dgram socket) where scheduled at a lower priority than the cgroup empty queue dispatcher. Let's swap that and ensure that we can coalesce events more agressively: let's process the raw events at higher priority than the cgroup empty event (which remains at the same prio). --- src/core/cgroup.c | 10 +++++++--- src/core/manager.c | 9 ++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index ad67ba0438..5bac54f36d 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2606,6 +2606,9 @@ int manager_setup_cgroup(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to create cgroup empty event source: %m"); + /* Schedule cgroup empty checks early, but after having processed service notification messages or + * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of + * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */ r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5); if (r < 0) return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m"); @@ -2632,9 +2635,10 @@ int manager_setup_cgroup(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to watch control group inotify object: %m"); - /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also - * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ - r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4); + /* Process cgroup empty notifications early. Note that when this event is dispatched it'll + * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see + * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9); if (r < 0) return log_error_errno(r, "Failed to set priority of inotify event source: %m"); diff --git a/src/core/manager.c b/src/core/manager.c index 12ae911a38..a81c09dc6d 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -992,11 +992,10 @@ static int manager_setup_cgroups_agent(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to allocate cgroups agent event source: %m"); - /* Process cgroups notifications early, but after having processed service notification messages or - * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification, - * and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of - * cgroup inotify for the unified cgroup stuff. */ - r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-4); + /* Process cgroups notifications early. Note that when the agent notification is received + * we'll just enqueue the unit in the cgroup empty queue, hence pick a high priority than + * that. Also see handling of cgroup inotify for the unified cgroup stuff. */ + r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9); if (r < 0) return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); From 5210387ea66655dfcc3264b3c44f95b373c0f719 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Mar 2019 13:01:12 +0100 Subject: [PATCH 2/9] core: check for redundant operation before doing allocation --- src/core/cgroup.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 5bac54f36d..01d72a3908 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1568,15 +1568,14 @@ int unit_set_cgroup_path(Unit *u, const char *path) { assert(u); + if (streq_ptr(u->cgroup_path, path)) + return 0; + if (path) { p = strdup(path); if (!p) return -ENOMEM; - } else - p = NULL; - - if (streq_ptr(u->cgroup_path, p)) - return 0; + } if (p) { r = hashmap_put(u->manager->cgroup_unit, p, u); @@ -1585,7 +1584,6 @@ int unit_set_cgroup_path(Unit *u, const char *path) { } unit_release_cgroup(u); - u->cgroup_path = TAKE_PTR(p); return 1; From 0bb814c2c21350eafaf267c2003f4af530ca7242 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Mar 2019 17:17:31 +0100 Subject: [PATCH 3/9] =?UTF-8?q?core:=20rename=20cgroup=5Finotify=5Fwd=20?= =?UTF-8?q?=E2=86=92=20cgroup=5Fcontrol=5Finotify=5Fwd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's rename the .cgroup_inotify_wd field of the Unit object to .cgroup_control_inotify_wd. Let's similarly rename the hashmap .cgroup_inotify_wd_unit of the Manager object to .cgroup_control_inotify_wd_unit. Why? As preparation for a later commit that allows us to watch the "memory.events" cgroup attribute file in addition to the "cgroup.events" file we already watch with the fields above. In that later commit we'll add new fields "cgroup_memory_inotify_wd" to Unit and "cgroup_memory_inotify_wd_unit" to Manager, that are used to watch these other events file. No change in behaviour. Just some renaming. --- src/core/cgroup.c | 38 ++++++++++++++++++++------------------ src/core/manager.h | 2 +- src/core/unit.c | 2 +- src/core/unit.h | 2 +- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 01d72a3908..8d27d26505 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1595,10 +1595,13 @@ int unit_watch_cgroup(Unit *u) { assert(u); + /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if + * cgroupv2 is available. */ + if (!u->cgroup_path) return 0; - if (u->cgroup_inotify_wd >= 0) + if (u->cgroup_control_inotify_wd >= 0) return 0; /* Only applies to the unified hierarchy */ @@ -1608,11 +1611,11 @@ int unit_watch_cgroup(Unit *u) { if (r == 0) return 0; - /* Don't watch the root slice, it's pointless. */ + /* No point in watch the top-level slice, it's never going to run empty. */ if (unit_has_name(u, SPECIAL_ROOT_SLICE)) return 0; - r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops); + r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops); if (r < 0) return log_oom(); @@ -1620,20 +1623,19 @@ int unit_watch_cgroup(Unit *u) { if (r < 0) return log_oom(); - u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); - if (u->cgroup_inotify_wd < 0) { + u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (u->cgroup_control_inotify_wd < 0) { - if (errno == ENOENT) /* If the directory is already - * gone we don't need to track - * it, so this is not an error */ + if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this + * is not an error */ return 0; - return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path); + return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", u->cgroup_path); } - r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u); + r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u); if (r < 0) - return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m"); + return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor to hash map: %m"); return 0; } @@ -2223,12 +2225,12 @@ void unit_release_cgroup(Unit *u) { u->cgroup_path = mfree(u->cgroup_path); } - if (u->cgroup_inotify_wd >= 0) { - if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0) - log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id); + if (u->cgroup_control_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id); - (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd)); - u->cgroup_inotify_wd = -1; + (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd)); + u->cgroup_control_inotify_wd = -1; } } @@ -2508,7 +2510,7 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, /* The watch was just removed */ continue; - u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd)); + u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd)); if (!u) /* Not that inotify might deliver * events for a watch even after it * was removed, because it was queued @@ -2706,7 +2708,7 @@ void manager_shutdown_cgroup(Manager *m, bool delete) { m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source); - m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit); + m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit); m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd); diff --git a/src/core/manager.h b/src/core/manager.h index cdd4882a6a..ea442596ae 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -268,7 +268,7 @@ struct Manager { /* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */ int cgroup_inotify_fd; sd_event_source *cgroup_inotify_event_source; - Hashmap *cgroup_inotify_wd_unit; + Hashmap *cgroup_control_inotify_wd_unit; /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */ sd_event_source *cgroup_empty_event_source; diff --git a/src/core/unit.c b/src/core/unit.c index 2cde494a7e..c8586d16cf 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -91,7 +91,7 @@ Unit *unit_new(Manager *m, size_t size) { u->unit_file_state = _UNIT_FILE_STATE_INVALID; u->unit_file_preset = -1; u->on_failure_job_mode = JOB_REPLACE; - u->cgroup_inotify_wd = -1; + u->cgroup_control_inotify_wd = -1; u->job_timeout = USEC_INFINITY; u->job_running_timeout = USEC_INFINITY; u->ref_uid = UID_INVALID; diff --git a/src/core/unit.h b/src/core/unit.h index 28878b5f4f..dc2278c32d 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -252,7 +252,7 @@ typedef struct Unit { CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */ CGroupMask cgroup_invalidated_mask; /* A mask specifiying controllers which shall be considered invalidated, and require re-realization */ CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ - int cgroup_inotify_wd; + int cgroup_control_inotify_wd; /* Device Controller BPF program */ BPFProgram *bpf_device_control_installed; From a5b5aece01f677ba59ca43001c85b722388fd67f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Mar 2019 17:28:02 +0100 Subject: [PATCH 4/9] service: beautify debug log message a bit --- src/core/service.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/service.c b/src/core/service.c index 02899901f7..a6f6a7383f 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -3148,7 +3148,7 @@ static void service_notify_cgroup_empty_event(Unit *u) { assert(u); - log_unit_debug(u, "cgroup is empty"); + log_unit_debug(u, "Control group is empty."); switch (s->state) { From afcfaa695cd00b713e7d57e1829da90b692ac6f8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Mar 2019 19:05:19 +0100 Subject: [PATCH 5/9] core: implement OOMPolicy= and watch cgroups for OOM killings This adds a new per-service OOMPolicy= (along with a global DefaultOOMPolicy=) that controls what to do if a process of the service is killed by the kernel's OOM killer. It has three different values: "continue" (old behaviour), "stop" (terminate the service), "kill" (let the kernel kill all the service's processes). On top of that, track OOM killer events per unit: generate a per-unit structured, recognizable log message when we see an OOM killer event, and put the service in a failure state if an OOM killer event was seen and the selected policy was not "continue". A new "result" is defined for this case: "oom-kill". All of this relies on new cgroupv2 kernel functionality: the "memory.events" notification interface and the "memory.oom.group" attribute (which makes the kernel kill all cgroup processes automatically). --- src/core/cgroup.c | 194 ++++++++++++++++++++++++-- src/core/cgroup.h | 4 + src/core/dbus-manager.c | 3 + src/core/dbus-manager.h | 2 + src/core/dbus-service.c | 6 + src/core/load-fragment-gperf.gperf.m4 | 1 + src/core/load-fragment.c | 1 + src/core/load-fragment.h | 1 + src/core/main.c | 3 + src/core/manager.c | 10 ++ src/core/manager.h | 21 +++ src/core/service.c | 70 +++++++++- src/core/service.h | 3 + src/core/unit.c | 13 ++ src/core/unit.h | 16 ++- src/shared/bus-unit-util.c | 2 +- src/systemd/sd-messages.h | 3 + 17 files changed, 340 insertions(+), 13 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 8d27d26505..6a477adb71 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -3,6 +3,8 @@ #include #include +#include "sd-messages.h" + #include "alloc-util.h" #include "blockdev-util.h" #include "bpf-devices.h" @@ -1141,6 +1143,8 @@ static void cgroup_context_apply( cgroup_apply_unified_memory_limit(u, "memory.max", max); cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max); + (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group)); + } else { char buf[DECIMAL_STR_MAX(uint64_t) + 1]; uint64_t val; @@ -1640,6 +1644,69 @@ int unit_watch_cgroup(Unit *u) { return 0; } +int unit_watch_cgroup_memory(Unit *u) { + _cleanup_free_ char *events = NULL; + CGroupContext *c; + int r; + + assert(u); + + /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if + * cgroupv2 is available. */ + + if (!u->cgroup_path) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie + * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after + * all. */ + if (!c->memory_accounting) + return 0; + + /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and + * we also don't want to generate a log message for each parent cgroup of a process. */ + if (u->type == UNIT_SLICE) + return 0; + + if (u->cgroup_memory_inotify_wd >= 0) + return 0; + + /* Only applies to the unified hierarchy */ + r = cg_all_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m"); + if (r == 0) + return 0; + + r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops); + if (r < 0) + return log_oom(); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events); + if (r < 0) + return log_oom(); + + u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (u->cgroup_memory_inotify_wd < 0) { + + if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this + * is not an error */ + return 0; + + return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path); + } + + r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor to hash map: %m"); + + return 0; +} + int unit_pick_cgroup_path(Unit *u) { _cleanup_free_ char *path = NULL; int r; @@ -1692,6 +1759,7 @@ static int unit_create_cgroup( /* Start watching it */ (void) unit_watch_cgroup(u); + (void) unit_watch_cgroup_memory(u); /* Preserve enabled controllers in delegated units, adjust others. */ if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) { @@ -2232,6 +2300,14 @@ void unit_release_cgroup(Unit *u) { (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd)); u->cgroup_control_inotify_wd = -1; } + + if (u->cgroup_memory_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id); + + (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd)); + u->cgroup_memory_inotify_wd = -1; + } } void unit_prune_cgroup(Unit *u) { @@ -2479,6 +2555,106 @@ void unit_add_to_cgroup_empty_queue(Unit *u) { log_debug_errno(r, "Failed to enable cgroup empty event source: %m"); } +static int unit_check_oom(Unit *u) { + _cleanup_free_ char *oom_kill = NULL; + bool increased; + uint64_t c; + int r; + + if (!u->cgroup_path) + return 0; + + r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m"); + + r = safe_atou64(oom_kill, &c); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m"); + + increased = c > u->oom_kill_last; + u->oom_kill_last = c; + + if (!increased) + return 0; + + log_struct(LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer.")); + + if (UNIT_VTABLE(u)->notify_cgroup_oom) + UNIT_VTABLE(u)->notify_cgroup_oom(u); + + return 1; +} + +static int on_cgroup_oom_event(sd_event_source *s, void *userdata) { + Manager *m = userdata; + Unit *u; + int r; + + assert(s); + assert(m); + + u = m->cgroup_oom_queue; + if (!u) + return 0; + + assert(u->in_cgroup_oom_queue); + u->in_cgroup_oom_queue = false; + LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u); + + if (m->cgroup_oom_queue) { + /* More stuff queued, let's make sure we remain enabled */ + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m"); + } + + (void) unit_check_oom(u); + return 0; +} + +static void unit_add_to_cgroup_oom_queue(Unit *u) { + int r; + + assert(u); + + if (u->in_cgroup_oom_queue) + return; + if (!u->cgroup_path) + return; + + LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u); + u->in_cgroup_oom_queue = true; + + /* Trigger the defer event */ + if (!u->manager->cgroup_oom_event_source) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + + r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager); + if (r < 0) { + log_error_errno(r, "Failed to create cgroup oom event source: %m"); + return; + } + + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8); + if (r < 0) { + log_error_errno(r, "Failed to set priority of cgroup oom event source: %m"); + return; + } + + (void) sd_event_source_set_description(s, "cgroup-oom"); + u->manager->cgroup_oom_event_source = TAKE_PTR(s); + } + + r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT); + if (r < 0) + log_error_errno(r, "Failed to enable cgroup oom event source: %m"); +} + static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { Manager *m = userdata; @@ -2510,15 +2686,16 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, /* The watch was just removed */ continue; - u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd)); - if (!u) /* Not that inotify might deliver - * events for a watch even after it - * was removed, because it was queued - * before the removal. Let's ignore - * this here safely. */ - continue; + /* Note that inotify might deliver events for a watch even after it was removed, + * because it was queued before the removal. Let's ignore this here safely. */ - unit_add_to_cgroup_empty_queue(u); + u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd)); + if (u) + unit_add_to_cgroup_empty_queue(u); + + u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd)); + if (u) + unit_add_to_cgroup_oom_queue(u); } } } @@ -2709,6 +2886,7 @@ void manager_shutdown_cgroup(Manager *m, bool delete) { m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source); m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit); + m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit); m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 51e7c96d60..050b963579 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -79,6 +79,9 @@ struct CGroupContext { bool tasks_accounting; bool ip_accounting; + /* Configures the memory.oom.group attribute (on unified) */ + bool memory_oom_group; + bool delegate; CGroupMask delegate_controllers; CGroupMask disable_controllers; @@ -174,6 +177,7 @@ int unit_realize_cgroup(Unit *u); void unit_release_cgroup(Unit *u); void unit_prune_cgroup(Unit *u); int unit_watch_cgroup(Unit *u); +int unit_watch_cgroup_memory(Unit *u); void unit_add_to_cgroup_empty_queue(Unit *u); diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index de997c3305..9c0b994cc3 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -43,6 +43,8 @@ static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) { (force ? UNIT_FILE_FORCE : 0); } +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_oom_policy, oom_policy, OOMPolicy); + static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION); static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", SYSTEMD_FEATURES); static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture())); @@ -2452,6 +2454,7 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultTasksMax", "t", NULL, offsetof(Manager, default_tasks_max), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, default_oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_METHOD("GetUnit", "s", "o", method_get_unit, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("GetUnitByPID", "u", "o", method_get_unit_by_pid, SD_BUS_VTABLE_UNPRIVILEGED), diff --git a/src/core/dbus-manager.h b/src/core/dbus-manager.h index d0306adc70..10aa2eccee 100644 --- a/src/core/dbus-manager.h +++ b/src/core/dbus-manager.h @@ -12,3 +12,5 @@ void bus_manager_send_reloading(Manager *m, bool active); void bus_manager_send_change_signal(Manager *m); int verify_run_space_and_log(const char *message); + +int bus_property_get_oom_policy(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c index 48c0fb74a7..0f563a6625 100644 --- a/src/core/dbus-service.c +++ b/src/core/dbus-service.c @@ -10,6 +10,7 @@ #include "dbus-cgroup.h" #include "dbus-execute.h" #include "dbus-kill.h" +#include "dbus-manager.h" #include "dbus-service.h" #include "dbus-util.h" #include "exit-status.h" @@ -127,6 +128,7 @@ const sd_bus_vtable bus_service_vtable[] = { SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Service, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), @@ -257,6 +259,7 @@ static int bus_set_transient_std_fd( static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string); static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, service_name_is_valid); static int bus_service_set_transient_property( @@ -291,6 +294,9 @@ static int bus_service_set_transient_property( if (streq(name, "Type")) return bus_set_transient_service_type(u, name, &s->type, message, flags, error); + if (streq(name, "OOMPolicy")) + return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error); + if (streq(name, "RestartUSec")) return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error); diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 7fb1e6cc8b..9cbce4f11f 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -333,6 +333,7 @@ Service.Sockets, config_parse_service_sockets, 0, Service.BusPolicy, config_parse_warn_compat, DISABLED_LEGACY, 0 Service.USBFunctionDescriptors, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_descriptors) Service.USBFunctionStrings, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_strings) +Service.OOMPolicy, config_parse_oom_policy, 0, offsetof(Service, oom_policy) EXEC_CONTEXT_CONFIG_ITEMS(Service)m4_dnl CGROUP_CONTEXT_CONFIG_ITEMS(Service)m4_dnl KILL_CONTEXT_CONFIG_ITEMS(Service)m4_dnl diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 9b7c8cba67..66e5c6d231 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -86,6 +86,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_runtime_preserve_mode, exec_preserve_mode, DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type"); DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier"); DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy"); DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value"); DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight"); DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight"); diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index e0d3b4ec3b..95d06320c0 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -106,6 +106,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode); CONFIG_PARSER_PROTOTYPE(config_parse_pid_file); CONFIG_PARSER_PROTOTYPE(config_parse_exit_status); CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers); +CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy); /* gperf prototypes */ const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); diff --git a/src/core/main.c b/src/core/main.c index 46db47126c..3c4b5202df 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -135,6 +135,7 @@ static bool arg_default_tasks_accounting = true; static uint64_t arg_default_tasks_max = UINT64_MAX; static sd_id128_t arg_machine_id = {}; static EmergencyAction arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE; +static OOMPolicy arg_default_oom_policy = OOM_STOP; _noreturn_ static void freeze_or_exit_or_reboot(void) { @@ -725,6 +726,7 @@ static int parse_config_file(void) { { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting }, { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max }, { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action }, + { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy }, {} }; @@ -780,6 +782,7 @@ static void set_manager_defaults(Manager *m) { m->default_memory_accounting = arg_default_memory_accounting; m->default_tasks_accounting = arg_default_tasks_accounting; m->default_tasks_max = arg_default_tasks_max; + m->default_oom_policy = arg_default_oom_policy; (void) manager_set_default_rlimits(m, arg_default_rlimit); diff --git a/src/core/manager.c b/src/core/manager.c index a81c09dc6d..4154aec12f 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -764,6 +764,8 @@ int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager .have_ask_password = -EINVAL, /* we don't know */ .first_boot = -1, .test_run_flags = test_run_flags, + + .default_oom_policy = OOM_STOP, }; #if ENABLE_EFI @@ -4714,3 +4716,11 @@ static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = { }; DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp); + +static const char* const oom_policy_table[_OOM_POLICY_MAX] = { + [OOM_CONTINUE] = "continue", + [OOM_STOP] = "stop", + [OOM_KILL] = "kill", +}; + +DEFINE_STRING_TABLE_LOOKUP(oom_policy, OOMPolicy); diff --git a/src/core/manager.h b/src/core/manager.h index ea442596ae..b5def59ed0 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -56,6 +56,14 @@ typedef enum StatusType { STATUS_TYPE_EMERGENCY, } StatusType; +typedef enum OOMPolicy { + OOM_CONTINUE, /* The kernel kills the process it wants to kill, and that's it */ + OOM_STOP, /* The kernel kills the process it wants to kill, and we stop the unit */ + OOM_KILL, /* The kernel kills the process it wants to kill, and all others in the unit, and we stop the unit */ + _OOM_POLICY_MAX, + _OOM_POLICY_INVALID = -1 +} OOMPolicy; + /* Notes: * 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD, * TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when @@ -159,6 +167,9 @@ struct Manager { /* Units whose cgroup ran empty */ LIST_HEAD(Unit, cgroup_empty_queue); + /* Units whose memory.event fired */ + LIST_HEAD(Unit, cgroup_oom_queue); + /* Target units whose default target dependencies haven't been set yet */ LIST_HEAD(Unit, target_deps_queue); @@ -268,10 +279,15 @@ struct Manager { /* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */ int cgroup_inotify_fd; sd_event_source *cgroup_inotify_event_source; + + /* Maps for finding the unit for each inotify watch descriptor for the cgroup.events and + * memory.events cgroupv2 attributes. */ Hashmap *cgroup_control_inotify_wd_unit; + Hashmap *cgroup_memory_inotify_wd_unit; /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */ sd_event_source *cgroup_empty_event_source; + sd_event_source *cgroup_oom_event_source; /* Make sure the user cannot accidentally unmount our cgroup * file system */ @@ -328,6 +344,8 @@ struct Manager { uint64_t default_tasks_max; usec_t default_timer_accuracy_usec; + OOMPolicy default_oom_policy; + int original_log_level; LogTarget original_log_target; bool log_level_overridden:1; @@ -519,3 +537,6 @@ void manager_disable_confirm_spawn(void); const char *manager_timestamp_to_string(ManagerTimestamp m) _const_; ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_; ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s); + +const char* oom_policy_to_string(OOMPolicy i) _const_; +OOMPolicy oom_policy_from_string(const char *s) _pure_; diff --git a/src/core/service.c b/src/core/service.c index a6f6a7383f..53cead772a 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -112,6 +112,8 @@ static void service_init(Unit *u) { EXEC_KEYRING_PRIVATE : EXEC_KEYRING_INHERIT; s->watchdog_original_usec = USEC_INFINITY; + + s->oom_policy = _OOM_POLICY_INVALID; } static void service_unwatch_control_pid(Service *s) { @@ -731,6 +733,15 @@ static int service_add_extras(Service *s) { (s->type == SERVICE_NOTIFY || s->watchdog_usec > 0 || s->n_fd_store_max > 0)) s->notify_access = NOTIFY_MAIN; + /* If no OOM policy was explicitly set, then default to the configure default OOM policy. Except when + * delegation is on, in that case it we assume the payload knows better what to do and can process + * things in a more focussed way. */ + if (s->oom_policy < 0) + s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->default_oom_policy; + + /* Let the kernel do the killing if that's requested. */ + s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL; + r = service_add_default_dependencies(s); if (r < 0) return r; @@ -799,7 +810,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) { "%sType: %s\n" "%sRestart: %s\n" "%sNotifyAccess: %s\n" - "%sNotifyState: %s\n", + "%sNotifyState: %s\n" + "%sOOMPolicy: %s\n", prefix, service_state_to_string(s->state), prefix, service_result_to_string(s->result), prefix, service_result_to_string(s->reload_result), @@ -810,7 +822,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) { prefix, service_type_to_string(s->type), prefix, service_restart_to_string(s->restart), prefix, notify_access_to_string(s->notify_access), - prefix, notify_state_to_string(s->notify_state)); + prefix, notify_state_to_string(s->notify_state), + prefix, oom_policy_to_string(s->oom_policy)); if (s->control_pid > 0) fprintf(f, @@ -3211,6 +3224,57 @@ static void service_notify_cgroup_empty_event(Unit *u) { } } +static void service_notify_cgroup_oom_event(Unit *u) { + Service *s = SERVICE(u); + + log_unit_debug(u, "Process of control group was killed by the OOM killer."); + + if (s->oom_policy == OOM_CONTINUE) + return; + + switch (s->state) { + + case SERVICE_START_PRE: + case SERVICE_START: + case SERVICE_START_POST: + case SERVICE_STOP: + if (s->oom_policy == OOM_STOP) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_OOM_KILL); + else if (s->oom_policy == OOM_KILL) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + + break; + + case SERVICE_EXITED: + case SERVICE_RUNNING: + if (s->oom_policy == OOM_STOP) + service_enter_stop(s, SERVICE_FAILURE_OOM_KILL); + else if (s->oom_policy == OOM_KILL) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + break; + + case SERVICE_STOP_SIGKILL: + case SERVICE_FINAL_SIGKILL: + if (s->result == SERVICE_SUCCESS) + s->result = SERVICE_FAILURE_OOM_KILL; + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_OOM_KILL); + break; + + default: + ; + } +} + static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { bool notify_dbus = true; Service *s = SERVICE(u); @@ -4116,6 +4180,7 @@ static const char* const service_result_table[_SERVICE_RESULT_MAX] = { [SERVICE_FAILURE_CORE_DUMP] = "core-dump", [SERVICE_FAILURE_WATCHDOG] = "watchdog", [SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [SERVICE_FAILURE_OOM_KILL] = "oom-kill", }; DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult); @@ -4169,6 +4234,7 @@ const UnitVTable service_vtable = { .reset_failed = service_reset_failed, .notify_cgroup_empty = service_notify_cgroup_empty_event, + .notify_cgroup_oom = service_notify_cgroup_oom_event, .notify_message = service_notify_message, .main_pid = service_main_pid, diff --git a/src/core/service.h b/src/core/service.h index 57eefbb34d..5cc946acaf 100644 --- a/src/core/service.h +++ b/src/core/service.h @@ -67,6 +67,7 @@ typedef enum ServiceResult { SERVICE_FAILURE_CORE_DUMP, SERVICE_FAILURE_WATCHDOG, SERVICE_FAILURE_START_LIMIT_HIT, + SERVICE_FAILURE_OOM_KILL, _SERVICE_RESULT_MAX, _SERVICE_RESULT_INVALID = -1 } ServiceResult; @@ -184,6 +185,8 @@ struct Service { unsigned n_restarts; bool flush_n_restarts; + + OOMPolicy oom_policy; }; extern const UnitVTable service_vtable; diff --git a/src/core/unit.c b/src/core/unit.c index c8586d16cf..608a33530c 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -92,6 +92,7 @@ Unit *unit_new(Manager *m, size_t size) { u->unit_file_preset = -1; u->on_failure_job_mode = JOB_REPLACE; u->cgroup_control_inotify_wd = -1; + u->cgroup_memory_inotify_wd = -1; u->job_timeout = USEC_INFINITY; u->job_running_timeout = USEC_INFINITY; u->ref_uid = UID_INVALID; @@ -3245,6 +3246,9 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { if (u->cpu_usage_last != NSEC_INFINITY) (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last); + if (u->oom_kill_last > 0) + (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last); + if (u->cgroup_path) (void) serialize_item(f, "cgroup", u->cgroup_path); @@ -3478,6 +3482,14 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { continue; + } else if (streq(l, "oom-kill-last")) { + + r = safe_atou64(v, &u->oom_kill_last); + if (r < 0) + log_unit_debug(u, "Failed to read OOM kill last %s, ignoring.", v); + + continue; + } else if (streq(l, "cgroup")) { r = unit_set_cgroup_path(u, v); @@ -3485,6 +3497,7 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v); (void) unit_watch_cgroup(u); + (void) unit_watch_cgroup_memory(u); continue; } else if (streq(l, "cgroup-realized")) { diff --git a/src/core/unit.h b/src/core/unit.h index dc2278c32d..f730f6eee4 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -200,6 +200,9 @@ typedef struct Unit { /* cgroup empty queue */ LIST_FIELDS(Unit, cgroup_empty_queue); + /* cgroup OOM queue */ + LIST_FIELDS(Unit, cgroup_oom_queue); + /* Target dependencies queue */ LIST_FIELDS(Unit, target_deps_queue); @@ -246,13 +249,19 @@ typedef struct Unit { nsec_t cpu_usage_base; nsec_t cpu_usage_last; /* the most recently read value */ + /* The current counter of the oom_kill field in the memory.events cgroup attribute */ + uint64_t oom_kill_last; + /* Counterparts in the cgroup filesystem */ char *cgroup_path; CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */ CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */ CGroupMask cgroup_invalidated_mask; /* A mask specifiying controllers which shall be considered invalidated, and require re-realization */ CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ + + /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */ int cgroup_control_inotify_wd; + int cgroup_memory_inotify_wd; /* Device Controller BPF program */ BPFProgram *bpf_device_control_installed; @@ -320,6 +329,7 @@ typedef struct Unit { bool in_gc_queue:1; bool in_cgroup_realize_queue:1; bool in_cgroup_empty_queue:1; + bool in_cgroup_oom_queue:1; bool in_target_deps_queue:1; bool in_stop_when_unneeded_queue:1; @@ -494,10 +504,12 @@ typedef struct UnitVTable { /* Reset failed state if we are in failed state */ void (*reset_failed)(Unit *u); - /* Called whenever any of the cgroups this unit watches for - * ran empty */ + /* Called whenever any of the cgroups this unit watches for ran empty */ void (*notify_cgroup_empty)(Unit *u); + /* Called whenever an OOM kill event on this unit was seen */ + void (*notify_cgroup_oom)(Unit *u); + /* Called whenever a process of this unit sends us a message */ void (*notify_message)(Unit *u, const struct ucred *ucred, char **tags, FDSet *fds); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index b42a9e5c5b..8d99955ca7 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -1314,7 +1314,7 @@ static int bus_append_service_property(sd_bus_message *m, const char *field, con if (STR_IN_SET(field, "PIDFile", "Type", "Restart", "BusName", "NotifyAccess", - "USBFunctionDescriptors", "USBFunctionStrings")) + "USBFunctionDescriptors", "USBFunctionStrings", "OOMPolicy")) return bus_append_string(m, field, eq); diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h index 293db8e8e1..c243642889 100644 --- a/src/systemd/sd-messages.h +++ b/src/systemd/sd-messages.h @@ -125,6 +125,9 @@ _SD_BEGIN_DECLARATIONS; #define SD_MESSAGE_OVERMOUNTING SD_ID128_MAKE(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7) #define SD_MESSAGE_OVERMOUNTING_STR SD_ID128_MAKE_STR(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7) +#define SD_MESSAGE_UNIT_OUT_OF_MEMORY SD_ID128_MAKE(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef) +#define SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR SD_ID128_MAKE_STR(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef) + #define SD_MESSAGE_LID_OPENED SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,6f) #define SD_MESSAGE_LID_OPENED_STR SD_ID128_MAKE_STR(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,6f) #define SD_MESSAGE_LID_CLOSED SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,70) From 34e86947e96a5b0c23097c40856c8ccac3bbed43 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Mar 2019 19:14:53 +0100 Subject: [PATCH 6/9] catalog: add a new catalog entry explaining the new OOM killer event log msg --- catalog/systemd.catalog.in | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/catalog/systemd.catalog.in b/catalog/systemd.catalog.in index 5c6799ee5d..d39a2251b0 100644 --- a/catalog/systemd.catalog.in +++ b/catalog/systemd.catalog.in @@ -391,3 +391,16 @@ The following "tags" are possible: - "overflowgid-not-65534" — the kernel group ID used for "unknown" users (with NFS or user namespaces) is not 65534 Current system is tagged as @TAINT@. + +-- fe6faa94e7774663a0da52717891d8ef +Subject: A process of @UNIT@ unit has been killed by the OOM killer. +Defined-By: systemd +Support: %SUPPORT_URL% + +A process of unit @UNIT has been killed by the Linux kernel out-of-memory (OOM) +killer logic. This usually indicates that the system is low on memory and that +memory needed to be freed. A process associated with @UNIT@ has been determined +as the best process to terminate and has been forcibly terminated by the +kernel. + +Note that the memory pressure might or might not have been caused by @UNIT@. From 8e74bf7f9c9f0e46066590603c3a7d1e04dbe6c4 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Mar 2019 19:54:08 +0100 Subject: [PATCH 7/9] man: document new OOMPolicy= setting --- man/systemd-system.conf.xml | 11 +++++++++++ man/systemd.exec.xml | 16 +++++++++++----- man/systemd.service.xml | 23 +++++++++++++++++++++++ 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index d23b3fb45d..5d1e4d1b97 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -364,6 +364,17 @@ limits are only defaults for units, they are not applied to PID 1 itself. + + + DefaultOOMPolicy= + + Configure the default policy for reacting to processes being killed by the Linux + Out-Of-Memory (OOM) killer. This may be used to pick a global default for the per-unit + OOMPolicy= setting. See + systemd.service5 + for details. Note that this default is not used for services that have Delegate= + turned on. + diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 688147ea32..df1e1e8681 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -651,11 +651,17 @@ CapabilityBoundingSet=~CAP_B CAP_C OOMScoreAdjust= - Sets the adjustment level for the Out-Of-Memory killer for executed processes. Takes an integer - between -1000 (to disable OOM killing for this process) and 1000 (to make killing of this process under memory - pressure very likely). See proc.txt for - details. + Sets the adjustment value for the Linux kernel's Out-Of-Memory (OOM) killer score for + executed processes. Takes an integer between -1000 (to disable OOM killing of processes of this unit) + and 1000 (to make killing of processes of this unit under memory pressure very likely). See proc.txt for details. If + not specified defaults to the OOM score adjustment level of the service manager itself, which is + normally at 0. + + Use the OOMPolicy= setting of service units to configure how the service + manager shall react to the kernel OOM killer terminating a process of the service. See + systemd.service5 + for details. diff --git a/man/systemd.service.xml b/man/systemd.service.xml index 5b88417530..1f40c2ff37 100644 --- a/man/systemd.service.xml +++ b/man/systemd.service.xml @@ -963,6 +963,29 @@ above. + + OOMPolicy= + + Configure the Out-Of-Memory (OOM) killer policy. On Linux, when memory becomes scarce + the kernel might decide to kill a running process in order to free up memory and reduce memory + pressure. This setting takes one of continue, stop or + kill. If set to continue and a process of the service is + killed by the kernel's OOM killer this is logged but the service continues running. If set to + stop the event is logged but the service is terminated cleanly by the service + manager. If set to kill and one of the service's processes is killed by the OOM + killer the kernel is instructed to kill all remaining processes of the service, too. Defaults to the + setting DefaultOOMPolicy= in + system.conf5 is + set to, except for services where Delegate= is turned on, where it defaults to + continue. + + Use the OOMScoreAdjust= setting to configure whether processes of the unit + shall be considered preferred or less preferred candidates for process termination by the Linux OOM + killer logic. See + systemd.exec5 for + details. + + Check From 36869f3381e6c07f2ea53c2b53df81bf22cbd849 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Mar 2019 19:54:46 +0100 Subject: [PATCH 8/9] test: add test case for new OOM logic --- test/TEST-31-OOMPOLICY/Makefile | 1 + test/TEST-31-OOMPOLICY/test.sh | 52 +++++++++++++++++++++++++++++ test/TEST-31-OOMPOLICY/testsuite.sh | 39 ++++++++++++++++++++++ 3 files changed, 92 insertions(+) create mode 120000 test/TEST-31-OOMPOLICY/Makefile create mode 100755 test/TEST-31-OOMPOLICY/test.sh create mode 100755 test/TEST-31-OOMPOLICY/testsuite.sh diff --git a/test/TEST-31-OOMPOLICY/Makefile b/test/TEST-31-OOMPOLICY/Makefile new file mode 120000 index 0000000000..e9f93b1104 --- /dev/null +++ b/test/TEST-31-OOMPOLICY/Makefile @@ -0,0 +1 @@ +../TEST-01-BASIC/Makefile \ No newline at end of file diff --git a/test/TEST-31-OOMPOLICY/test.sh b/test/TEST-31-OOMPOLICY/test.sh new file mode 100755 index 0000000000..55752e6a70 --- /dev/null +++ b/test/TEST-31-OOMPOLICY/test.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*- +# ex: ts=8 sw=4 sts=4 et filetype=sh +set -e +TEST_DESCRIPTION="test OOM killer logic" +TEST_NO_NSPAWN=1 + +. $TEST_BASE_DIR/test-functions + +UNIFIED_CGROUP_HIERARCHY=yes + +test_setup() { + create_empty_image + mkdir -p $TESTDIR/root + mount ${LOOPDEV}p1 $TESTDIR/root + + ( + LOG_LEVEL=5 + eval $(udevadm info --export --query=env --name=${LOOPDEV}p2) + + setup_basic_environment + + # mask some services that we do not want to run in these tests + ln -fs /dev/null $initdir/etc/systemd/system/systemd-hwdb-update.service + ln -fs /dev/null $initdir/etc/systemd/system/systemd-journal-catalog-update.service + ln -fs /dev/null $initdir/etc/systemd/system/systemd-networkd.service + ln -fs /dev/null $initdir/etc/systemd/system/systemd-networkd.socket + ln -fs /dev/null $initdir/etc/systemd/system/systemd-resolved.service + ln -fs /dev/null $initdir/etc/systemd/system/systemd-machined.service + + # setup the testsuite service + cat >$initdir/etc/systemd/system/testsuite.service < /proc/sys/kernel/sysrq + echo f > /proc/sysrq-trigger + + while : ; do + STATE=`systemctl show -p ActiveState --value oomtest.service` + [ "$STATE" = "failed" ] && break + sleep .5 + done + + RESULT=`systemctl show -p Result --value oomtest.service` + test "$RESULT" = "oom-kill" + + systemd-analyze log-level info +fi + +echo OK > /testok + +exit 0 From 7d1d177b8d06fc70bc796b701a4368ff5f9eaeb9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Mar 2019 20:16:29 +0100 Subject: [PATCH 9/9] update TODO --- TODO | 4 ---- 1 file changed, 4 deletions(-) diff --git a/TODO b/TODO index 663e4fedd8..b8304c466c 100644 --- a/TODO +++ b/TODO @@ -62,8 +62,6 @@ Features: * bootctl,sd-boot: actually honour the "architecture" key -* set memory.oom.group in cgroup v2 for all leaf cgroups (kernel v4.19+) - * add a new syscall group "@esoteric" for more esoteric stuff such as bpf() and usefaultd() and make systemd-analyze check for it. @@ -71,8 +69,6 @@ Features: first. i.e. look for all places we use string_erase()/string_free_erase() and augment them with mlock(). Also use MADV_DONTDUMP -* whenever oom_kill memory.event event is triggered print a nice log message - * Move RestrictAddressFamily= to the new cgroup create socket * support the bind/connect/sendmsg cgroup stuff for sandboxing, and possibly