oomd: check if a cgroup can be killed before attempting to kill it

On an OOM event, oomd tries to kill a cgroup until it succeeds.
The kill can fail with EPERM if a PID cannot be killed, which leaves
the cgroup with only some of its processes killed.
This is unlikely but theoretically possible in a user namespace,
where systemd runs as root inside the container and tries to kill a
cgroup with some PID from the host namespace.

To address this, send signal 0 (the null signal, which performs only the
existence and permission checks) to all the processes to check that we
have the privileges to kill them.
This commit is contained in:
Matteo Croce
2025-11-17 17:30:34 +01:00
committed by Zbigniew Jędrzejewski-Szmek
parent f295cfa1a7
commit 38e9d40c80

View File

@@ -337,6 +337,12 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char
if (c->pgscan == 0 && c->current_memory_usage == 0)
continue;
/* First try killing recursively to ensure all child cgroups can be killed. */
r = cg_kill_recursive(c->path, /* sig= */ 0, CGROUP_IGNORE_SELF, /* killed_pids= */ NULL,
/* log_kill= */ NULL, /* userdata= */ NULL);
if (r < 0)
continue;
r = oomd_cgroup_kill(c->path, /* recurse= */ true, /* dry_run= */ dry_run);
if (r == -ENOMEM)
return r; /* Treat oom as a hard error */
@@ -381,6 +387,12 @@ int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run,
if (c->swap_usage <= threshold_usage)
continue;
/* First try killing recursively to ensure all child cgroups can be killed. */
r = cg_kill_recursive(c->path, /* sig= */ 0, CGROUP_IGNORE_SELF, /* killed_pids= */ NULL,
/* log_kill= */ NULL, /* userdata= */ NULL);
if (r < 0)
continue;
r = oomd_cgroup_kill(c->path, /* recurse= */ true, /* dry_run= */ dry_run);
if (r == -ENOMEM)
return r; /* Treat oom as a hard error */