* [PATCH v5] Extend watchdog timeout during kernel panic.
@ 2021-02-05 18:46 JP Ertola
2021-05-21 15:18 ` Guenter Roeck
0 siblings, 1 reply; 2+ messages in thread
From: JP Ertola @ 2021-02-05 18:46 UTC (permalink / raw)
To: wim, linux, linux-watchdog; +Cc: JP Ertola
If the watchdog timeout is set such that the crash kernel does not
have time to collect a coredump and the crash kernel is not equipped to
ping the watchdog timer itself, then a kernel coredump will not be collected
before the watchdog fires. This change registers a panic notifier and
callback so the watchdog timeout can be extended if a kernel panic occurs.
This timeout extension would give the crash kernel enough time to collect
a coredump before the CPU resets. The watchdog timeout is extended if and only
if a crash kernel image is loaded in memory, the watchdog is active at the
time of the panic, and the kconfig setting is set.
A Kconfig option has been added to configure the timeout duration at
compile-time. Default is zero seconds.
Signed-off-by: JP Ertola <jp.ertola@hpe.com>
---
v5: Clean up variable names and spacing. Call __watchdog_ping() instead of
wdd->ops->ping(). Remove notifier_from_errno() as it could cause unintended
behavior in the future if this watchdog extension notifier has its priority
elevated above minimum.
v4: Remove optional callback mechanism alltogether. I agree with Guenter,
not widely used.
v3: Fix logic so timeout extension is not longer than wdd->max_timeout
v2: Remove dead code and comments.
drivers/watchdog/Kconfig | 13 ++++++
drivers/watchdog/watchdog_dev.c | 73 ++++++++++++++++++++++++++++++++-
include/linux/watchdog.h | 1 +
3 files changed, 85 insertions(+), 2 deletions(-)
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index fd7968635e6d..f1055985e100 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -141,6 +141,19 @@ comment "Watchdog Device Drivers"
# Architecture Independent
+config DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT
+ int "Default timeout for watchdog timer before crash kernel starts (seconds)"
+ default 0
+ help
+ This option allows an extended timeout to be used for the watchdog when
+ the kernel panics and a crash kernel is about to start. This is helpful
+ when the existing WDT timeout value is less than the time required for
+ crash kernel to run and the crash kernel is unable to handle the
+ the watchdog itself. The timeout extension happens last in chain of
+ kernel panic handler callbacks just in case another panic handler
+ hangs unexpectedly. When this value is set to 0, the watchdog timeout
+ will not be changed.
+
config SOFT_WATCHDOG
tristate "Software watchdog"
select WATCHDOG_CORE
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 2946f3a63110..92d11ef9fbb4 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -34,6 +34,7 @@
#include <linux/init.h> /* For __init/__exit/... */
#include <linux/hrtimer.h> /* For hrtimers */
#include <linux/kernel.h> /* For printk/panic/... */
+#include <linux/kexec.h> /* For checking if crash kernel is loaded */
#include <linux/kthread.h> /* For kthread_work */
#include <linux/miscdevice.h> /* For handling misc devices */
#include <linux/module.h> /* For module stuff/... */
@@ -82,6 +83,8 @@ static bool handle_boot_enabled =
static unsigned open_timeout = CONFIG_WATCHDOG_OPEN_TIMEOUT;
+static unsigned int wdt_panic_timeout = CONFIG_DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT;
+
static bool watchdog_past_open_deadline(struct watchdog_core_data *data)
{
return ktime_after(ktime_get(), data->open_deadline);
@@ -658,6 +661,50 @@ static int watchdog_ioctl_op(struct watchdog_device *wdd, unsigned int cmd,
* off the watchdog (if 'nowayout' is not set).
*/
+static int watchdog_panic_notifier(struct notifier_block *nb,
+ unsigned long code, void *data)
+{
+ struct watchdog_device *wdd;
+ int ret;
+ unsigned int timeout = wdt_panic_timeout;
+
+ if (wdt_panic_timeout == 0)
+ return NOTIFY_DONE;
+
+ wdd = container_of(nb, struct watchdog_device, panic_nb);
+
+ if (watchdog_timeout_invalid(wdd, wdt_panic_timeout)) {
+ timeout = min(wdt_panic_timeout, wdd->max_timeout);
+ pr_err("watchdog%d: timeout extension value invalid. Falling back to %d seconds\n",
+ wdd->id, timeout);
+ }
+
+ if (kexec_crash_image && watchdog_active(wdd)) {
+ int ping_ret;
+
+ ret = watchdog_set_timeout(wdd, timeout);
+ if (ret) {
+ pr_err("watchdog%d: Unable to extend watchdog timeout before starting crash kernel (%d)",
+ wdd->id, ret);
+ }
+
+ /* Many watchdog implementations will reset the timer
+ * when the timeout is changed, but ping again to be
+ * safe.
+ */
+ ping_ret = __watchdog_ping(wdd);
+ if (ping_ret) {
+ pr_warn("watchdog%d: Unable to ping watchdog before starting crash kernel (%d)\n",
+ wdd->id, ping_ret);
+ }
+ if (ret == 0 && ping_ret == 0) {
+ pr_info("watchdog%d: Extending watchdog timeout to %d seconds\n",
+ wdd->id, timeout);
+ }
+ }
+ return ret;
+}
+
static ssize_t watchdog_write(struct file *file, const char __user *data,
size_t len, loff_t *ppos)
{
@@ -1118,10 +1165,27 @@ int watchdog_dev_register(struct watchdog_device *wdd)
return ret;
ret = watchdog_register_pretimeout(wdd);
- if (ret)
+ if (ret) {
watchdog_cdev_unregister(wdd);
+ return ret;
+ }
- return ret;
+ /*
+ * Setting panic_nb priority to minimum ensures the watchdog device
+ * panic callback runs last in the chain of kernel panic callbacks.
+ * This way, the watchdog device will still fire in the event another
+ * panic callback hangs.
+ */
+ wdd->panic_nb.priority = INT_MIN;
+ wdd->panic_nb.notifier_call = watchdog_panic_notifier;
+
+ ret = atomic_notifier_chain_register(&panic_notifier_list,
+ &wdd->panic_nb);
+ if (ret)
+ pr_warn("watchdog%d: Cannot register panic notifier (%d)\n",
+ wdd->id, ret);
+
+ return 0;
}
/*
@@ -1228,3 +1292,8 @@ module_param(open_timeout, uint, 0644);
MODULE_PARM_DESC(open_timeout,
"Maximum time (in seconds, 0 means infinity) for userspace to take over a running watchdog (default="
__MODULE_STRING(CONFIG_WATCHDOG_OPEN_TIMEOUT) ")");
+
+module_param(wdt_panic_timeout, uint, 0444);
+MODULE_PARM_DESC(wdt_panic_timeout,
+ "Watchdog timeout extension duration upon kernel panic. (default="
+ __MODULE_STRING(CONFIG_DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT) " seconds)");
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 9b19e6bb68b5..bc7e6e8aa7ab 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -107,6 +107,7 @@ struct watchdog_device {
unsigned int max_hw_heartbeat_ms;
struct notifier_block reboot_nb;
struct notifier_block restart_nb;
+ struct notifier_block panic_nb;
void *driver_data;
struct watchdog_core_data *wd_data;
unsigned long status;
--
2.29.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v5] Extend watchdog timeout during kernel panic.
2021-02-05 18:46 [PATCH v5] Extend watchdog timeout during kernel panic JP Ertola
@ 2021-05-21 15:18 ` Guenter Roeck
0 siblings, 0 replies; 2+ messages in thread
From: Guenter Roeck @ 2021-05-21 15:18 UTC (permalink / raw)
To: JP Ertola; +Cc: wim, linux-watchdog
On Fri, Feb 05, 2021 at 10:46:20AM -0800, JP Ertola wrote:
> If the watchdog timeout is set such that the crash kernel does not
> have time to collect a coredump and the crash kernel is not equipped to
> ping the watchdog timer itself, then a kernel coredump will not be collected
> before the watchdog fires. This change registers a panic notifier and
> callback so the watchdog timeout can be extended if a kernel panic occurs.
> This timeout extension would give the crash kernel enough time to collect
> a coredump before the CPU resets. The watchdog timeout is extended if and only
> if a crash kernel image is loaded in memory, the watchdog is active at the
> time of the panic, and the kconfig setting is set.
>
> A Kconfig option has been added to configure the timeout duration at
> compile-time. Default is zero seconds.
>
> Signed-off-by: JP Ertola <jp.ertola@hpe.com>
> ---
> v5: Clean up variable names and spacing. Call __watchdog_ping() instead of
> wdd->ops->ping(). Remove notifier_from_errno() as it could cause unintended
> behavior in the future if this watchdog extension notifier has its priority
> elevated above minimum.
> v4: Remove optional callback mechanism alltogether. I agree with Guenter,
> not widely used.
> v3: Fix logic so timeout extension is not longer than wdd->max_timeout
> v2: Remove dead code and comments.
>
> drivers/watchdog/Kconfig | 13 ++++++
> drivers/watchdog/watchdog_dev.c | 73 ++++++++++++++++++++++++++++++++-
> include/linux/watchdog.h | 1 +
> 3 files changed, 85 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
> index fd7968635e6d..f1055985e100 100644
> --- a/drivers/watchdog/Kconfig
> +++ b/drivers/watchdog/Kconfig
> @@ -141,6 +141,19 @@ comment "Watchdog Device Drivers"
>
> # Architecture Independent
>
> +config DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT
> + int "Default timeout for watchdog timer before crash kernel starts (seconds)"
> + default 0
> + help
> + This option allows an extended timeout to be used for the watchdog when
> + the kernel panics and a crash kernel is about to start. This is helpful
> + when the existing WDT timeout value is less than the time required for
> + crash kernel to run and the crash kernel is unable to handle the
> + the watchdog itself. The timeout extension happens last in chain of
> + kernel panic handler callbacks just in case another panic handler
> + hangs unexpectedly. When this value is set to 0, the watchdog timeout
> + will not be changed.
> +
> config SOFT_WATCHDOG
> tristate "Software watchdog"
> select WATCHDOG_CORE
> diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
> index 2946f3a63110..92d11ef9fbb4 100644
> --- a/drivers/watchdog/watchdog_dev.c
> +++ b/drivers/watchdog/watchdog_dev.c
> @@ -34,6 +34,7 @@
> #include <linux/init.h> /* For __init/__exit/... */
> #include <linux/hrtimer.h> /* For hrtimers */
> #include <linux/kernel.h> /* For printk/panic/... */
> +#include <linux/kexec.h> /* For checking if crash kernel is loaded */
> #include <linux/kthread.h> /* For kthread_work */
> #include <linux/miscdevice.h> /* For handling misc devices */
> #include <linux/module.h> /* For module stuff/... */
> @@ -82,6 +83,8 @@ static bool handle_boot_enabled =
>
> static unsigned open_timeout = CONFIG_WATCHDOG_OPEN_TIMEOUT;
>
> +static unsigned int wdt_panic_timeout = CONFIG_DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT;
> +
> static bool watchdog_past_open_deadline(struct watchdog_core_data *data)
> {
> return ktime_after(ktime_get(), data->open_deadline);
> @@ -658,6 +661,50 @@ static int watchdog_ioctl_op(struct watchdog_device *wdd, unsigned int cmd,
> * off the watchdog (if 'nowayout' is not set).
> */
>
> +static int watchdog_panic_notifier(struct notifier_block *nb,
> + unsigned long code, void *data)
> +{
> + struct watchdog_device *wdd;
> + int ret;
> + unsigned int timeout = wdt_panic_timeout;
> +
> + if (wdt_panic_timeout == 0)
> + return NOTIFY_DONE;
> +
> + wdd = container_of(nb, struct watchdog_device, panic_nb);
> +
> + if (watchdog_timeout_invalid(wdd, wdt_panic_timeout)) {
> + timeout = min(wdt_panic_timeout, wdd->max_timeout);
This won't work if wdd->min_timeout is set to a value larger than the
requested value. It also won't work if max_timeout is not set at all
(as is the case for drivers using max_hw_heartbeat_ms instead).
Guenter
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-05-21 15:18 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-05 18:46 [PATCH v5] Extend watchdog timeout during kernel panic JP Ertola
2021-05-21 15:18 ` Guenter Roeck
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).