core: record transactions that have seen ordering cycles and expose them via IPC (#39210)

Closes #3829.
This commit is contained in:
Yu Watanabe
2025-11-14 00:41:46 +09:00
committed by GitHub
12 changed files with 141 additions and 36 deletions

View File

@@ -938,11 +938,12 @@ Defined-By: systemd
Support: %SUPPORT_URL%
Documentation: man:systemd(1)
A unit transaction was initiated that contains an ordering cycle, i.e. some
unit that was requested to be started (either directly, or indirectly due to a
requirement dependency such as Wants= or Requires=) is ordered before some
other unit (via After=/Before=), but that latter unit is also ordered before
the former by some dependency (either directly or indirectly).
A unit transaction (with ID @TRANSACTION_ID@) was initiated that contains
an ordering cycle, i.e. some unit that was requested to be started
(either directly, or indirectly due to a requirement dependency such as
Wants= or Requires=) is ordered before some other unit (via After=/Before=),
but that latter unit is also ordered before the former by some dependency
(either directly or indirectly).
Ordering cycles consist of at least two units, but might involve many
more. They generally indicate a bug in the unit definitions, as a unit

View File

@@ -413,6 +413,8 @@ node /org/freedesktop/systemd1 {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u NFailedJobs = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly at TransactionsWithOrderingCycle = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly d Progress = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as Environment = ['...', ...];
@@ -1103,6 +1105,8 @@ node /org/freedesktop/systemd1 {
<variablelist class="dbus-property" generated="True" extra-ref="NFailedJobs"/>
<variablelist class="dbus-property" generated="True" extra-ref="TransactionsWithOrderingCycle"/>
<variablelist class="dbus-property" generated="True" extra-ref="Progress"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
@@ -1809,6 +1813,9 @@ node /org/freedesktop/systemd1 {
<para><varname>NFailedJobs</varname> encodes how many jobs have ever failed in total.</para>
<para><varname>TransactionsWithOrderingCycle</varname> encodes IDs of transactions that encountered
an ordering cycle.</para>
<para><varname>Progress</varname> encodes boot progress as a floating point value between 0.0 and
1.0. This value begins at 0.0 at early-boot and ends at 1.0 when boot is finished and is based on the
number of executed and queued jobs. After startup, this field is always 1.0 indicating a finished
@@ -12465,6 +12472,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<para><varname>DefaultRestrictSUIDSGID</varname>,
<function>RemoveSubgroupFromUnit()</function>, and
<function>KillUnitSubgroup()</function> were added in version 258.</para>
<para><varname>TransactionsWithOrderingCycle</varname> was added in version 259.</para>
</refsect2>
<refsect2>
<title>Unit Objects</title>

View File

@@ -225,8 +225,8 @@ Sun 2017-02-26 20:57:49 EST 2h 3min left Sun 2017-02-26 11:56:36 EST 6h ago
<listitem>
<para>Check whether any of the specified units is in the "failed" state. If no unit is specified,
check whether there are any failed units, which corresponds to the <literal>degraded</literal> state
returned by <command>is-system-running</command>. Returns an exit code <constant>0</constant>
check whether there are any failed units or ordering cycles, which corresponds to the <literal>degraded</literal>
state returned by <command>is-system-running</command>. Returns an exit code <constant>0</constant>
if at least one has failed, non-zero otherwise. Unless <option>--quiet</option> is specified, this
will also print the current unit or system state to standard output.</para>

View File

@@ -455,6 +455,35 @@ static int property_get_oom_score_adjust(
return sd_bus_message_append(reply, "i", n);
}
/* D-Bus property getter backing "TransactionsWithOrderingCycle": opens an array-of-'t' container in
 * the reply, appends every ID stored in the manager's transactions_with_cycle set, and closes the
 * container again. Returns the first negative sd-bus error encountered, otherwise the result of
 * closing the container. */
static int property_get_transactions_with_cycle(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        Manager *manager = ASSERT_PTR(userdata);
        uint64_t *transaction_id;
        int r;

        assert(bus);
        assert(reply);

        r = sd_bus_message_open_container(reply, 'a', "t");
        if (r < 0)
                return r;

        /* The set stores pointers to the IDs, which is what sd_bus_message_append_basic() takes. */
        SET_FOREACH(transaction_id, manager->transactions_with_cycle) {
                r = sd_bus_message_append_basic(reply, 't', transaction_id);
                if (r < 0)
                        return r;
        }

        return sd_bus_message_close_container(reply);
}
static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) {
Unit *u;
int r;
@@ -2870,6 +2899,7 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_PROPERTY("NJobs", "u", property_get_hashmap_size, offsetof(Manager, jobs), 0),
SD_BUS_PROPERTY("NInstalledJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_installed_jobs), 0),
SD_BUS_PROPERTY("NFailedJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_failed_jobs), 0),
SD_BUS_PROPERTY("TransactionsWithOrderingCycle", "at", property_get_transactions_with_cycle, 0, 0),
SD_BUS_PROPERTY("Progress", "d", property_get_progress, 0, 0),
SD_BUS_PROPERTY("Environment", "as", property_get_environment, 0, 0),
SD_BUS_PROPERTY("ConfirmSpawn", "b", bus_property_get_bool, offsetof(Manager, confirm_spawn), SD_BUS_VTABLE_PROPERTY_CONST),

View File

@@ -92,6 +92,8 @@ int manager_serialize(
_cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m);
(void) serialize_item_format(f, "last-transaction-id", "%" PRIu64, m->last_transaction_id);
(void) serialize_item_format(f, "current-job-id", "%" PRIu32, m->current_job_id);
(void) serialize_item_format(f, "n-installed-jobs", "%u", m->n_installed_jobs);
(void) serialize_item_format(f, "n-failed-jobs", "%u", m->n_failed_jobs);
@@ -283,7 +285,7 @@ static void manager_deserialize_gid_refs_one(Manager *m, const char *value) {
int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
bool deserialize_varlink_sockets = false;
int r = 0;
int r;
assert(m);
assert(f);
@@ -325,7 +327,15 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
if (r == 0) /* eof or end marker */
break;
if ((val = startswith(l, "current-job-id="))) {
if ((val = startswith(l, "last-transaction-id="))) {
uint64_t id;
if (safe_atou64(val, &id) < 0)
log_notice("Failed to parse last transaction id value '%s', ignoring.", val);
else
m->last_transaction_id = MAX(m->last_transaction_id, id);
} else if ((val = startswith(l, "current-job-id="))) {
uint32_t id;
if (safe_atou32(val, &id) < 0)
@@ -350,22 +360,18 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
m->n_failed_jobs += n;
} else if ((val = startswith(l, "taint-logged="))) {
int b;
b = parse_boolean(val);
if (b < 0)
r = parse_boolean(val);
if (r < 0)
log_notice("Failed to parse taint-logged flag '%s', ignoring.", val);
else
m->taint_logged = m->taint_logged || b;
m->taint_logged = m->taint_logged || r;
} else if ((val = startswith(l, "service-watchdogs="))) {
int b;
b = parse_boolean(val);
if (b < 0)
r = parse_boolean(val);
if (r < 0)
log_notice("Failed to parse service-watchdogs flag '%s', ignoring.", val);
else
m->service_watchdogs = b;
m->service_watchdogs = r;
} else if ((val = startswith(l, "show-status-overridden="))) {
ShowStatus s;

View File

@@ -1660,6 +1660,8 @@ static void manager_clear_jobs_and_units(Manager *m) {
m->n_running_jobs = 0;
m->n_installed_jobs = 0;
m->n_failed_jobs = 0;
m->transactions_with_cycle = set_free(m->transactions_with_cycle);
}
Manager* manager_free(Manager *m) {
@@ -2154,14 +2156,16 @@ int manager_add_job_full(
if (mode == JOB_RESTART_DEPENDENCIES && type != JOB_START)
return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=restart-dependencies is only valid for start.");
tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY, ++m->last_transaction_id);
if (!tr)
return -ENOMEM;
LOG_CONTEXT_PUSHF("TRANSACTION_ID=%" PRIu64, tr->id);
log_unit_debug(unit, "Trying to enqueue job %s/%s/%s", unit->id, job_type_to_string(type), job_mode_to_string(mode));
type = job_type_collapse(type, unit);
tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY);
if (!tr)
return -ENOMEM;
r = transaction_add_job_and_dependencies(
tr,
type,
@@ -2248,18 +2252,20 @@ int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name,
}
int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e) {
int r;
_cleanup_(transaction_abort_and_freep) Transaction *tr = NULL;
int r;
assert(m);
assert(unit);
assert(mode < _JOB_MODE_MAX);
assert(mode != JOB_ISOLATE); /* Isolate is only valid for start */
tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY);
tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY, ++m->last_transaction_id);
if (!tr)
return -ENOMEM;
LOG_CONTEXT_PUSHF("TRANSACTION_ID=%" PRIu64, tr->id);
/* We need an anchor job */
r = transaction_add_job_and_dependencies(tr, JOB_NOP, unit, NULL, TRANSACTION_IGNORE_REQUIREMENTS|TRANSACTION_IGNORE_ORDER, e);
if (r < 0)
@@ -3670,6 +3676,8 @@ void manager_reset_failed(Manager *m) {
HASHMAP_FOREACH(u, m->units)
unit_reset_failed(u);
m->transactions_with_cycle = set_free(m->transactions_with_cycle);
}
bool manager_unit_inactive_or_pending(Manager *m, const char *name) {
@@ -4618,8 +4626,8 @@ ManagerState manager_state(Manager *m) {
return MANAGER_MAINTENANCE;
}
/* Are there any failed units? If so, we are in degraded mode */
if (!set_isempty(m->failed_units))
/* Are there any failed units or ordering cycles? If so, we are in degraded mode */
if (!set_isempty(m->failed_units) || !set_isempty(m->transactions_with_cycle))
return MANAGER_DEGRADED;
return MANAGER_RUNNING;

View File

@@ -236,6 +236,11 @@ typedef struct Manager {
/* A set which contains all currently failed units */
Set *failed_units;
uint64_t last_transaction_id;
/* IDs of transactions that have encountered an ordering cycle */
Set *transactions_with_cycle;
sd_event_source *run_queue_event_source;
char *notify_socket;

View File

@@ -8,6 +8,7 @@
#include "bus-common-errors.h"
#include "bus-error.h"
#include "dbus-unit.h"
#include "hash-funcs.h"
#include "manager.h"
#include "set.h"
#include "slice.h"
@@ -15,6 +16,8 @@
#include "strv.h"
#include "transaction.h"
#define CYCLIC_TRANSACTIONS_MAX 4096U
static bool job_matters_to_anchor(Job *job);
static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies);
@@ -399,6 +402,16 @@ static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsi
LOG_MESSAGE_ID(SD_MESSAGE_UNIT_ORDERING_CYCLE_STR),
LOG_ITEM("%s", strempty(unit_ids)));
if (set_size(j->manager->transactions_with_cycle) >= CYCLIC_TRANSACTIONS_MAX)
log_warning("Too many transactions with ordering cycle, suppressing record.");
else {
uint64_t *id_buf = newdup(uint64_t, &tr->id, 1);
if (!id_buf)
log_oom_warning();
else
(void) set_ensure_consume(&j->manager->transactions_with_cycle, &uint64_hash_ops_value_free, id_buf);
}
if (delete) {
const char *status;
/* logging for j not k here to provide a consistent narrative */
@@ -1235,20 +1248,24 @@ int transaction_add_triggering_jobs(Transaction *tr, Unit *u) {
return 0;
}
Transaction* transaction_new(bool irreversible) {
Transaction *tr;
Transaction* transaction_new(bool irreversible, uint64_t id) {
_cleanup_free_ Transaction *tr = NULL;
tr = new0(Transaction, 1);
assert(id != 0);
tr = new(Transaction, 1);
if (!tr)
return NULL;
tr->jobs = hashmap_new(NULL);
*tr = (Transaction) {
.jobs = hashmap_new(NULL),
.irreversible = irreversible,
.id = id,
};
if (!tr->jobs)
return mfree(tr);
return NULL;
tr->irreversible = irreversible;
return tr;
return TAKE_PTR(tr);
}
Transaction* transaction_free(Transaction *tr) {

View File

@@ -8,9 +8,11 @@ typedef struct Transaction {
Hashmap *jobs; /* Unit object => Job object list 1:1 */
Job *anchor_job; /* The job the user asked for */
bool irreversible;
uint64_t id;
} Transaction;
Transaction* transaction_new(bool irreversible);
Transaction* transaction_new(bool irreversible, uint64_t id);
Transaction* transaction_free(Transaction *tr);
Transaction* transaction_abort_and_free(Transaction *tr);
DEFINE_TRIVIAL_CLEANUP_FUNC(Transaction*, transaction_abort_and_free);

View File

@@ -112,6 +112,24 @@ static int manager_context_build_json(sd_json_variant **ret, const char *name, v
JSON_BUILD_PAIR_STRING_NON_EMPTY("ControlGroup", m->cgroup_root));
}
/* JSON builder callback for the "TransactionsWithOrderingCycle" field: walks the Set passed via
 * userdata and appends each stored uint64_t ID to a JSON array as an unsigned number. On success the
 * array is handed over through *ret; if the loop never runs, *ret receives NULL. */
static int transactions_with_cycle_build_json(sd_json_variant **ret, const char *name, void *userdata) {
        _cleanup_(sd_json_variant_unrefp) sd_json_variant *array = NULL;
        const Set *transactions = userdata;
        uint64_t *transaction_id;
        int r;

        assert(ret);

        SET_FOREACH(transaction_id, transactions) {
                r = sd_json_variant_append_arrayb(&array, SD_JSON_BUILD_UNSIGNED(*transaction_id));
                if (r < 0)
                        return r;
        }

        *ret = TAKE_PTR(array);
        return 0;
}
static int manager_runtime_build_json(sd_json_variant **ret, const char *name, void *userdata) {
Manager *m = ASSERT_PTR(userdata);
dual_timestamp watchdog_last_ping;
@@ -154,6 +172,7 @@ static int manager_runtime_build_json(sd_json_variant **ret, const char *name, v
SD_JSON_BUILD_PAIR_UNSIGNED("NJobs", hashmap_size(m->jobs)),
SD_JSON_BUILD_PAIR_UNSIGNED("NInstalledJobs", m->n_installed_jobs),
SD_JSON_BUILD_PAIR_UNSIGNED("NFailedJobs", m->n_failed_jobs),
JSON_BUILD_PAIR_CALLBACK_NON_NULL("TransactionsWithOrderingCycle", transactions_with_cycle_build_json, m->transactions_with_cycle),
SD_JSON_BUILD_PAIR_REAL("Progress", manager_get_progress(m)),
JSON_BUILD_PAIR_DUAL_TIMESTAMP_NON_NULL("WatchdogLastPingTimestamp", watchdog_get_last_ping_as_dual_timestamp(&watchdog_last_ping)),
SD_JSON_BUILD_PAIR_STRING("SystemState", manager_state_to_string(manager_state(m))),

View File

@@ -155,6 +155,8 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE(
SD_VARLINK_DEFINE_FIELD(NInstalledJobs, SD_VARLINK_INT, 0),
SD_VARLINK_FIELD_COMMENT("The total amount of failed jobs"),
SD_VARLINK_DEFINE_FIELD(NFailedJobs, SD_VARLINK_INT, 0),
SD_VARLINK_FIELD_COMMENT("IDs of transactions that encountered ordering cycle"),
SD_VARLINK_DEFINE_FIELD(TransactionsWithOrderingCycle, SD_VARLINK_INT, SD_VARLINK_ARRAY|SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("Boot progress as a floating point value between 0.0 and 1.0"),
SD_VARLINK_DEFINE_FIELD(Progress, SD_VARLINK_FLOAT, 0),
SD_VARLINK_FIELD_COMMENT("Timestamp when the hardware watchdog was last pinged"),

View File

@@ -124,6 +124,13 @@ for i in {0..19}; do
systemctl start "transaction-cycle$i.service"
done
# Fetch the IDs of transactions that hit an ordering cycle via Varlink, require that at least 20 were
# recorded, and check that each of the first 20 has a matching "cycle starting with" journal entry
# from the current boot.
# The scratch file is created with mktemp (unique name, created atomically) instead of a
# $RANDOM-based path, and is removed once the checks are done.
IDS_FILE="$(mktemp /tmp/TEST-03-JOBS-CYCLE-IDS-XXXXXX)"
varlinkctl call /run/systemd/io.systemd.Manager io.systemd.Manager.Describe '{}' | jq '.runtime.TransactionsWithOrderingCycle' >"$IDS_FILE"
[[ "$(jq length "$IDS_FILE")" -ge 20 ]]
for i in {0..19}; do
    journalctl -b TRANSACTION_ID="$(jq -r ".[$i]" "$IDS_FILE")" --grep "cycle starting with"
done
rm -f "$IDS_FILE"
# Test PropagatesStopTo= when restart (issue #26839)
systemctl start propagatestopto-and-pullin.target
systemctl --quiet is-active propagatestopto-and-pullin.target