From 85e5613cf3fde59135965645114a533745697520 Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Thu, 16 Sep 2021 16:19:05 +0200 Subject: [PATCH 1/9] watchdog: cleanup: create an helper for each ioctl No functional changes. --- src/shared/watchdog.c | 114 +++++++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c index 3f6a2f0228..78c13a7b05 100644 --- a/src/shared/watchdog.c +++ b/src/shared/watchdog.c @@ -19,50 +19,81 @@ static char *watchdog_device = NULL; static usec_t watchdog_timeout = USEC_INFINITY; static usec_t watchdog_last_ping = USEC_INFINITY; +static int watchdog_set_enable(bool enable) { + int flags = enable ? WDIOS_ENABLECARD : WDIOS_DISABLECARD; + int r; + + assert(watchdog_fd >= 0); + + r = ioctl(watchdog_fd, WDIOC_SETOPTIONS, &flags); + if (r < 0) { + if (!enable) + return log_warning_errno(errno, "Failed to disable hardware watchdog, ignoring: %m"); + + /* ENOTTY means the watchdog is always enabled so we're fine */ + log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to enable hardware watchdog, ignoring: %m"); + if (!ERRNO_IS_NOT_SUPPORTED(errno)) + return -errno; + } + + return 0; +} + +static int watchdog_set_timeout(void) { + usec_t t; + int sec; + + assert(watchdog_fd >= 0); + assert(timestamp_is_set(watchdog_timeout)); + + t = DIV_ROUND_UP(watchdog_timeout, USEC_PER_SEC); + sec = MIN(t, (usec_t) INT_MAX); /* Saturate */ + + if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &sec) < 0) + return log_warning_errno(errno, "Failed to set timeout to %is, ignoring: %m", sec); + + /* Just in case the driver is buggy */ + assert(sec > 0); + + /* watchdog_timeout stores the timeout used by the HW */ + watchdog_timeout = sec * USEC_PER_SEC; + + log_info("Set hardware watchdog to %s.", FORMAT_TIMESPAN(watchdog_timeout, 0)); + return 0; +} + +static int watchdog_ping_now(void) { + assert(watchdog_fd >= 0); + + if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) < 0) + return log_warning_errno(errno, "Failed to ping hardware watchdog, ignoring: %m"); + + watchdog_last_ping = now(clock_boottime_or_monotonic()); + + return 0; +} + static int update_timeout(void) { + int r; + if (watchdog_fd < 0) return 0; if (watchdog_timeout == USEC_INFINITY) return 0; - if (watchdog_timeout == 0) { - int flags; + if (watchdog_timeout == 0) + return watchdog_set_enable(false); - flags = WDIOS_DISABLECARD; - if (ioctl(watchdog_fd, WDIOC_SETOPTIONS, &flags) < 0) - return log_warning_errno(errno, "Failed to disable hardware watchdog, ignoring: %m"); - } else { - int sec, flags; - usec_t t; + r = watchdog_set_timeout(); + if (r < 0) + return r; - t = DIV_ROUND_UP(watchdog_timeout, USEC_PER_SEC); - sec = MIN(t, (usec_t) INT_MAX); /* Saturate */ - if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &sec) < 0) - return log_warning_errno(errno, "Failed to set timeout to %is, ignoring: %m", sec); + r = watchdog_set_enable(true); + if (r < 0) + return r; - /* Just in case the driver is buggy */ - assert(sec > 0); - - /* watchdog_timeout stores the actual timeout used by the HW */ - watchdog_timeout = sec * USEC_PER_SEC; - log_info("Set hardware watchdog to %s.", FORMAT_TIMESPAN(watchdog_timeout, 0)); - - flags = WDIOS_ENABLECARD; - if (ioctl(watchdog_fd, WDIOC_SETOPTIONS, &flags) < 0) { - /* ENOTTY means the watchdog is always enabled so we're fine */ - log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? LOG_DEBUG : LOG_WARNING, errno, - "Failed to enable hardware watchdog, ignoring: %m"); - if (!ERRNO_IS_NOT_SUPPORTED(errno)) - return -errno; - } - - if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) < 0) - return log_warning_errno(errno, "Failed to ping hardware watchdog, ignoring: %m"); - - watchdog_last_ping = now(clock_boottime_or_monotonic()); - } - - return 0; + return watchdog_ping_now(); } static int open_watchdog(void) { @@ -155,11 +186,7 @@ int watchdog_ping(void) { return 0; } - if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) < 0) - return log_warning_errno(errno, "Failed to ping hardware watchdog, ignoring: %m"); - - watchdog_last_ping = ntime; - return 0; + return watchdog_ping_now(); } void watchdog_close(bool disarm) { @@ -167,12 +194,7 @@ void watchdog_close(bool disarm) { return; if (disarm) { - int flags; - - /* Explicitly disarm it */ - flags = WDIOS_DISABLECARD; - if (ioctl(watchdog_fd, WDIOC_SETOPTIONS, &flags) < 0) - log_warning_errno(errno, "Failed to disable hardware watchdog, ignoring: %m"); + (void) watchdog_set_enable(false); /* To be sure, use magic close logic, too */ for (;;) { From fcbf2c64f5c392f4bbf820b446d0b6f993527b62 Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Fri, 17 Sep 2021 09:21:18 +0200 Subject: [PATCH 2/9] watchdog: minor optimization in watchdog_setup() --- src/shared/watchdog.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c index 78c13a7b05..3f4effddb5 100644 --- a/src/shared/watchdog.c +++ b/src/shared/watchdog.c @@ -134,6 +134,10 @@ int watchdog_set_device(const char *path) { int watchdog_setup(usec_t timeout) { + /* Let's shortcut duplicated requests */ + if (watchdog_fd >= 0 && watchdog_timeout == timeout) + return 0; + /* Initialize the watchdog timeout with the caller value. This value is * going to be updated by update_timeout() with the closest value * supported by the driver */ From ef1d5f3c5c613b9fa75d6a69bbc51a79361e28cc Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Thu, 16 Sep 2021 18:46:04 +0200 Subject: [PATCH 3/9] watchdog: configuring a timeout value might not be supported by the HW In that case we should hanlde this case more gracefully by reusing the programmed value. Fixes: #20683 --- src/shared/watchdog.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c index 3f4effddb5..44974aa633 100644 --- a/src/shared/watchdog.c +++ b/src/shared/watchdog.c @@ -40,6 +40,20 @@ static int watchdog_set_enable(bool enable) { return 0; } +static int watchdog_get_timeout(void) { + int sec = 0; + + assert(watchdog_fd > 0); + + if (ioctl(watchdog_fd, WDIOC_GETTIMEOUT, &sec) < 0) + return -errno; + + assert(sec > 0); + watchdog_timeout = sec * USEC_PER_SEC; + + return 0; +} + static int watchdog_set_timeout(void) { usec_t t; int sec; @@ -51,15 +65,11 @@ static int watchdog_set_timeout(void) { sec = MIN(t, (usec_t) INT_MAX); /* Saturate */ if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &sec) < 0) - return log_warning_errno(errno, "Failed to set timeout to %is, ignoring: %m", sec); + return -errno; - /* Just in case the driver is buggy */ - assert(sec > 0); - - /* watchdog_timeout stores the timeout used by the HW */ + assert(sec > 0);/* buggy driver ? */ watchdog_timeout = sec * USEC_PER_SEC; - log_info("Set hardware watchdog to %s.", FORMAT_TIMESPAN(watchdog_timeout, 0)); return 0; } @@ -86,13 +96,24 @@ static int update_timeout(void) { return watchdog_set_enable(false); r = watchdog_set_timeout(); - if (r < 0) - return r; + if (r < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(r)) + return log_warning_errno(r, "Failed to set timeout to %s, ignoring: %m", + FORMAT_TIMESPAN(watchdog_timeout, 0)); + + log_info("Modifying watchdog timeout is not supported, reusing the programmed timeout."); + + r = watchdog_get_timeout(); + if (r < 0) + return log_warning_errno(errno, "Failed to query watchdog HW timeout, ignoring: %m"); + } r = watchdog_set_enable(true); if (r < 0) return r; + log_info("Watchdog running with a timeout of %s.", FORMAT_TIMESPAN(watchdog_timeout, 0)); + return watchdog_ping_now(); } From c1a08a76ab6761eee9d537c0e0fb6252a4961b14 Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Fri, 17 Sep 2021 10:34:35 +0200 Subject: [PATCH 4/9] watchdog: pass USEC_INFINITY to watchdog_setup() to reuse the programmed timeout value This patch changes the meaning of USEC_INFINITY value for the watchdog module. Previously passing this value was a NOP. It now has a special meaning: it requests the watchdog module to read the programmed timeout value and reuse it for pinging the device. This is mostly useful when the watchdog is started by the firmware and there's no way to reconfigure the timeout with a different value afterwards. "RuntimeWatchdogSec=infinity" in system.conf can now be used rather than putting an arbitrary value that PID1 will fail to set (even if it still felt back to the programmed timeout). Please note that "infinity" is not supposed to restore the default value of the firmware. If the value is changed after booting then "infinity" would simply reuse the current programmed value. IOW it's a NOP unless the watchdog was previously closed and in that case it will be reopened and the last programmed value reused. --- src/shared/watchdog.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c index 44974aa633..1b5239888f 100644 --- a/src/shared/watchdog.c +++ b/src/shared/watchdog.c @@ -15,8 +15,8 @@ #include "watchdog.h" static int watchdog_fd = -1; -static char *watchdog_device = NULL; -static usec_t watchdog_timeout = USEC_INFINITY; +static char *watchdog_device; +static usec_t watchdog_timeout; /* USEC_INFINITY → don't change timeout */ static usec_t watchdog_last_ping = USEC_INFINITY; static int watchdog_set_enable(bool enable) { @@ -89,20 +89,23 @@ static int update_timeout(void) { if (watchdog_fd < 0) return 0; - if (watchdog_timeout == USEC_INFINITY) - return 0; if (watchdog_timeout == 0) return watchdog_set_enable(false); - r = watchdog_set_timeout(); - if (r < 0) { - if (!ERRNO_IS_NOT_SUPPORTED(r)) - return log_warning_errno(r, "Failed to set timeout to %s, ignoring: %m", - FORMAT_TIMESPAN(watchdog_timeout, 0)); + if (watchdog_timeout != USEC_INFINITY) { + r = watchdog_set_timeout(); + if (r < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(r)) + return log_warning_errno(r, "Failed to set timeout to %s, ignoring: %m", + FORMAT_TIMESPAN(watchdog_timeout, 0)); - log_info("Modifying watchdog timeout is not supported, reusing the programmed timeout."); + log_info("Modifying watchdog timeout is not supported, reusing the programmed timeout."); + watchdog_timeout = USEC_INFINITY; + } + } + if (watchdog_timeout == USEC_INFINITY) { r = watchdog_get_timeout(); if (r < 0) return log_warning_errno(errno, "Failed to query watchdog HW timeout, ignoring: %m"); @@ -164,11 +167,6 @@ int watchdog_setup(usec_t timeout) { * supported by the driver */ watchdog_timeout = timeout; - /* If we didn't open the watchdog yet and didn't get any explicit - * timeout value set, don't do anything */ - if (watchdog_fd < 0 && watchdog_timeout == USEC_INFINITY) - return 0; - if (watchdog_fd < 0) return open_watchdog(); @@ -194,7 +192,7 @@ usec_t watchdog_runtime_wait(void) { int watchdog_ping(void) { usec_t ntime; - if (!timestamp_is_set(watchdog_timeout)) + if (watchdog_timeout == 0) return 0; if (watchdog_fd < 0) @@ -239,5 +237,5 @@ void watchdog_close(bool disarm) { /* Once closed, pinging the device becomes a NOP and we request a new * call to watchdog_setup() to open the device again. */ - watchdog_timeout = USEC_INFINITY; + watchdog_timeout = 0; } From f16890f8d2e3994608274b5e46dd847d9ec3ee6a Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Fri, 17 Sep 2021 15:16:38 +0200 Subject: [PATCH 5/9] watchdog: passing 0 to watchdog_setup now closes the watchdog Passing 0 meant "disable the watchdog although still kept it opened". However this case didn't seem to be useful especially since PID1 closes the device if it is passed the nul timeout. Hence let's change the meaning of watchdog_setup(0) to match PID1's behavior which allows to simplify the code a bit. Hence this patch also drops enable_watchdog(). --- src/core/main.c | 21 +++++++++------------ src/core/manager.c | 13 +++---------- src/shared/watchdog.c | 29 ++++++++++++++++++++--------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/core/main.c b/src/core/main.c index ff7f189370..8792ef88fc 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -1483,9 +1483,9 @@ static int become_shutdown( }; _cleanup_strv_free_ char **env_block = NULL; + usec_t watchdog_timer = 0; size_t pos = 7; int r; - usec_t watchdog_timer = 0; assert(shutdown_verb); assert(!command_line[pos]); @@ -1534,19 +1534,16 @@ static int become_shutdown( else if (streq(shutdown_verb, "kexec")) watchdog_timer = arg_kexec_watchdog; - if (timestamp_is_set(watchdog_timer)) { - /* If we reboot or kexec let's set the shutdown watchdog and tell the shutdown binary to - * repeatedly ping it */ - r = watchdog_setup(watchdog_timer); - watchdog_close(r < 0); + /* If we reboot or kexec let's set the shutdown watchdog and tell the + * shutdown binary to repeatedly ping it */ + r = watchdog_setup(watchdog_timer); + watchdog_close(r < 0); - /* Tell the binary how often to ping, ignore failure */ - (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer); + /* Tell the binary how often to ping, ignore failure */ + (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer); - if (arg_watchdog_device) - (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device); - } else - watchdog_close(true); + if (arg_watchdog_device) + (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device); /* Avoid the creation of new processes forked by the kernel; at this * point, we will not listen to the signals anyway */ diff --git a/src/core/manager.c b/src/core/manager.c index 2e7db509d1..b231e36cbd 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -3203,12 +3203,8 @@ void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout) { return; if (t == WATCHDOG_RUNTIME) - if (!timestamp_is_set(m->watchdog_overridden[WATCHDOG_RUNTIME])) { - if (timestamp_is_set(timeout)) - (void) watchdog_setup(timeout); - else - watchdog_close(true); - } + if (!timestamp_is_set(m->watchdog_overridden[WATCHDOG_RUNTIME])) + (void) watchdog_setup(timeout); m->watchdog[t] = timeout; } @@ -3226,10 +3222,7 @@ int manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout) { if (t == WATCHDOG_RUNTIME) { usec_t usec = timestamp_is_set(timeout) ? timeout : m->watchdog[t]; - if (timestamp_is_set(usec)) - (void) watchdog_setup(usec); - else - watchdog_close(true); + (void) watchdog_setup(usec); } m->watchdog_overridden[t] = timeout; diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c index 1b5239888f..b6fddd193f 100644 --- a/src/shared/watchdog.c +++ b/src/shared/watchdog.c @@ -16,7 +16,7 @@ static int watchdog_fd = -1; static char *watchdog_device; -static usec_t watchdog_timeout; /* USEC_INFINITY → don't change timeout */ +static usec_t watchdog_timeout; /* 0 → close device and USEC_INFINITY → don't change timeout */ static usec_t watchdog_last_ping = USEC_INFINITY; static int watchdog_set_enable(bool enable) { @@ -87,12 +87,11 @@ static int watchdog_ping_now(void) { static int update_timeout(void) { int r; + assert(watchdog_timeout > 0); + if (watchdog_fd < 0) return 0; - if (watchdog_timeout == 0) - return watchdog_set_enable(false); - if (watchdog_timeout != USEC_INFINITY) { r = watchdog_set_timeout(); if (r < 0) { @@ -158,8 +157,19 @@ int watchdog_set_device(const char *path) { int watchdog_setup(usec_t timeout) { + /* timeout=0 closes the device whereas passing timeout=USEC_INFINITY + * opens it (if needed) without configuring any particular timeout and + * thus reuses the programmed value (therefore it's a nop if the device + * is already opened). + */ + + if (timeout == 0) { + watchdog_close(true); + return 0; + } + /* Let's shortcut duplicated requests */ - if (watchdog_fd >= 0 && watchdog_timeout == timeout) + if (watchdog_fd >= 0 && (timeout == watchdog_timeout || timeout == USEC_INFINITY)) return 0; /* Initialize the watchdog timeout with the caller value. This value is @@ -213,6 +223,11 @@ int watchdog_ping(void) { } void watchdog_close(bool disarm) { + + /* Once closed, pinging the device becomes a NOP and we request a new + * call to watchdog_setup() to open the device again. */ + watchdog_timeout = 0; + if (watchdog_fd < 0) return; @@ -234,8 +249,4 @@ void watchdog_close(bool disarm) { } watchdog_fd = safe_close(watchdog_fd); - - /* Once closed, pinging the device becomes a NOP and we request a new - * call to watchdog_setup() to open the device again. */ - watchdog_timeout = 0; } From 807938e7ecceddde1879a5e5a993189a5f8564dc Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Mon, 27 Sep 2021 10:16:09 +0200 Subject: [PATCH 6/9] watchdog: update the documentation While at it, split the watchdog section into a few paragraphs to make it easier to read as it becomes lengthy. --- man/systemd-system.conf.xml | 67 ++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index 5824e01e0c..4172ec00ab 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -133,33 +133,46 @@ RebootWatchdogSec= KExecWatchdogSec= - Configure the hardware watchdog at runtime and at reboot. Takes a timeout value in seconds (or - in other time units if suffixed with ms, min, h, - d, w). If RuntimeWatchdogSec= is set to a non-zero - value, the watchdog hardware (/dev/watchdog or the path specified with - WatchdogDevice= or the kernel option systemd.watchdog-device=) will be - programmed to automatically reboot the system if it is not contacted within the specified timeout interval. The - system manager will ensure to contact it at least once in half the specified timeout interval. This feature - requires a hardware watchdog device to be present, as it is commonly the case in embedded and server - systems. Not all hardware watchdogs allow configuration of all possible reboot timeout values, in which case - the closest available timeout is picked. RebootWatchdogSec= may be used to configure the - hardware watchdog when the system is asked to reboot. It works as a safety net to ensure that the reboot takes - place even if a clean reboot attempt times out. Note that the RebootWatchdogSec= timeout - applies only to the second phase of the reboot, i.e. after all regular services are already terminated, and - after the system and service manager process (PID 1) got replaced by the systemd-shutdown - binary, see system bootup7 - for details. During the first phase of the shutdown operation the system and service manager remains running - and hence RuntimeWatchdogSec= is still honoured. In order to define a timeout on this first - phase of system shutdown, configure JobTimeoutSec= and JobTimeoutAction= - in the [Unit] section of the shutdown.target unit. By default - RuntimeWatchdogSec= defaults to 0 (off), and RebootWatchdogSec= to - 10min. KExecWatchdogSec= may be used to additionally enable the watchdog when kexec - is being executed rather than when rebooting. Note that if the kernel does not reset the watchdog on kexec (depending - on the specific hardware and/or driver), in this case the watchdog might not get disabled after kexec succeeds - and thus the system might get rebooted, unless RuntimeWatchdogSec= is also enabled at the same time. - For this reason it is recommended to enable KExecWatchdogSec= only if - RuntimeWatchdogSec= is also enabled. - These settings have no effect if a hardware watchdog is not available. + Configure the hardware watchdog at runtime and at reboot. Takes a timeout value in + seconds (or in other time units if suffixed with ms, min, + h, d, w). If set to zero the watchdog logic + is disabled: no watchdog device is opened, configured, or pinged. If set to the special string + infinity the watchdog is opened and pinged in regular intervals, but the timeout + is not changed from the default. If set to any other time value the watchdog timeout is configured to + the specified value (or a value close to it, depending on hardware capabilities). + + If RuntimeWatchdogSec= is set to a non-zero value, the watchdog hardware + (/dev/watchdog or the path specified with WatchdogDevice= or + the kernel option systemd.watchdog-device=) will be programmed to automatically + reboot the system if it is not contacted within the specified timeout interval. The system manager + will ensure to contact it at least once in half the specified timeout interval. This feature requires + a hardware watchdog device to be present, as it is commonly the case in embedded and server + systems. Not all hardware watchdogs allow configuration of all possible reboot timeout values, in + which case the closest available timeout is picked. + + RebootWatchdogSec= may be used to configure the hardware watchdog when the + system is asked to reboot. It works as a safety net to ensure that the reboot takes place even if a + clean reboot attempt times out. Note that the RebootWatchdogSec= timeout applies + only to the second phase of the reboot, i.e. after all regular services are already terminated, and + after the system and service manager process (PID 1) got replaced by the + systemd-shutdown binary, see system + bootup7 for + details. During the first phase of the shutdown operation the system and service manager remains + running and hence RuntimeWatchdogSec= is still honoured. In order to define a + timeout on this first phase of system shutdown, configure JobTimeoutSec= and + JobTimeoutAction= in the [Unit] section of the + shutdown.target unit. By default RuntimeWatchdogSec= defaults + to 0 (off), and RebootWatchdogSec= to 10min. + + KExecWatchdogSec= may be used to additionally enable the watchdog when kexec + is being executed rather than when rebooting. Note that if the kernel does not reset the watchdog on + kexec (depending on the specific hardware and/or driver), in this case the watchdog might not get + disabled after kexec succeeds and thus the system might get rebooted, unless + RuntimeWatchdogSec= is also enabled at the same time. For this reason it is + recommended to enable KExecWatchdogSec= only if + RuntimeWatchdogSec= is also enabled. + + These settings have no effect if a hardware watchdog is not available. From b3aa73e4de614c06c4a27e5635967a0392654fbc Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Fri, 17 Sep 2021 11:11:14 +0200 Subject: [PATCH 7/9] core: introduce systemd.watchdog_sec= option --- src/core/main.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/core/main.c b/src/core/main.c index 8792ef88fc..e2fc6fae61 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -531,6 +531,17 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat (void) parse_path_argument(value, false, &arg_watchdog_device); + } else if (proc_cmdline_key_streq(key, "systemd.watchdog_sec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_sec(value, &arg_runtime_watchdog); + if (r < 0) + log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value); + else + arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog; + } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) { if (proc_cmdline_value_missing(key, value)) From 0ffdfe7d6863e6d479323d90c222f2dcae2ff057 Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Mon, 27 Sep 2021 10:51:28 +0200 Subject: [PATCH 8/9] watchdog: handle timeout programming errors more safely If an error happened while the timeout value was being programmed, the error was ignored and the watchdog module used the new timeout value whereas the watchdog device was left with the previous one. Now in cases of error, the device is now disabled and closed if it wasn't opened already otherwise the previous timeout value is kept so the device is still pinged at correct intervals. --- src/shared/watchdog.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c index b6fddd193f..4de6003039 100644 --- a/src/shared/watchdog.c +++ b/src/shared/watchdog.c @@ -96,8 +96,8 @@ static int update_timeout(void) { r = watchdog_set_timeout(); if (r < 0) { if (!ERRNO_IS_NOT_SUPPORTED(r)) - return log_warning_errno(r, "Failed to set timeout to %s, ignoring: %m", - FORMAT_TIMESPAN(watchdog_timeout, 0)); + return log_error_errno(r, "Failed to set timeout to %s: %m", + FORMAT_TIMESPAN(watchdog_timeout, 0)); log_info("Modifying watchdog timeout is not supported, reusing the programmed timeout."); watchdog_timeout = USEC_INFINITY; @@ -107,7 +107,7 @@ static int update_timeout(void) { if (watchdog_timeout == USEC_INFINITY) { r = watchdog_get_timeout(); if (r < 0) - return log_warning_errno(errno, "Failed to query watchdog HW timeout, ignoring: %m"); + return log_error_errno(errno, "Failed to query watchdog HW timeout: %m"); } r = watchdog_set_enable(true); @@ -122,6 +122,7 @@ static int update_timeout(void) { static int open_watchdog(void) { struct watchdog_info ident; const char *fn; + int r; if (watchdog_fd >= 0) return 0; @@ -139,7 +140,11 @@ static int open_watchdog(void) { ident.firmware_version, fn); - return update_timeout(); + r = update_timeout(); + if (r < 0) + watchdog_close(true); + + return r; } int watchdog_set_device(const char *path) { @@ -156,6 +161,8 @@ int watchdog_set_device(const char *path) { } int watchdog_setup(usec_t timeout) { + usec_t previous_timeout; + int r; /* timeout=0 closes the device whereas passing timeout=USEC_INFINITY * opens it (if needed) without configuring any particular timeout and @@ -175,12 +182,17 @@ int watchdog_setup(usec_t timeout) { /* Initialize the watchdog timeout with the caller value. This value is * going to be updated by update_timeout() with the closest value * supported by the driver */ + previous_timeout = watchdog_timeout; watchdog_timeout = timeout; if (watchdog_fd < 0) return open_watchdog(); - return update_timeout(); + r = update_timeout(); + if (r < 0) + watchdog_timeout = previous_timeout; + + return r; } usec_t watchdog_runtime_wait(void) { From 8a85c5b6160b76125413e021b0e6b3eab4afa49c Mon Sep 17 00:00:00 2001 From: Franck Bui Date: Fri, 1 Oct 2021 10:42:11 +0200 Subject: [PATCH 9/9] watchdog: rename special string "infinity" taken by the watchdog timeout options to "default" --- man/systemd-system.conf.xml | 2 +- src/core/load-fragment.c | 29 +++++++++++++++++++++++++++++ src/core/load-fragment.h | 1 + src/core/main.c | 24 +++++++++++++++--------- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index 4172ec00ab..dfd4e245a8 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -137,7 +137,7 @@ seconds (or in other time units if suffixed with ms, min, h, d, w). If set to zero the watchdog logic is disabled: no watchdog device is opened, configured, or pinged. If set to the special string - infinity the watchdog is opened and pinged in regular intervals, but the timeout + default the watchdog is opened and pinged in regular intervals, but the timeout is not changed from the default. If set to any other time value the watchdog timeout is configured to the specified value (or a value close to it, depending on hardware capabilities). diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index f971084c28..c5adef6c1f 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -6262,3 +6262,32 @@ int config_parse_swap_priority( s->parameters_fragment.priority_set = true; return 0; } + +int config_parse_watchdog_sec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* This is called for {Runtime,Reboot,KExec}WatchdogSec= where "default" maps to + * USEC_INFINITY internally. */ + + if (streq(rvalue, "default")) { + usec_t *usec = data; + + *usec = USEC_INFINITY; + return 0; + } + + return config_parse_sec(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); +} diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index e84f9ee391..06f703e218 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -142,6 +142,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_extension_images); CONFIG_PARSER_PROTOTYPE(config_parse_bpf_foreign_program); CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_socket_bind); CONFIG_PARSER_PROTOTYPE(config_parse_restrict_network_interfaces); +CONFIG_PARSER_PROTOTYPE(config_parse_watchdog_sec); /* gperf prototypes */ const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); diff --git a/src/core/main.c b/src/core/main.c index e2fc6fae61..67710f7528 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -536,11 +536,17 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat if (proc_cmdline_value_missing(key, value)) return 0; - r = parse_sec(value, &arg_runtime_watchdog); - if (r < 0) - log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value); - else - arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog; + if (streq(value, "default")) + arg_runtime_watchdog = USEC_INFINITY; + else { + r = parse_sec(value, &arg_runtime_watchdog); + if (r < 0) { + log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value); + return 0; + } + } + + arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog; } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) { @@ -662,10 +668,10 @@ static int parse_config_file(void) { { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type }, { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy }, { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, - { "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog }, - { "Manager", "RebootWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog }, - { "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */ - { "Manager", "KExecWatchdogSec", config_parse_sec, 0, &arg_kexec_watchdog }, + { "Manager", "RuntimeWatchdogSec", config_parse_watchdog_sec, 0, &arg_runtime_watchdog }, + { "Manager", "RebootWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, + { "Manager", "ShutdownWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */ + { "Manager", "KExecWatchdogSec", config_parse_watchdog_sec, 0, &arg_kexec_watchdog }, { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device }, { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set }, { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },