linux-watchdog.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v6] watchdog: Extend watchdog timeout during kernel panic
@ 2021-08-15  2:29 Curtis Klein
  2021-10-11  1:46 ` Guenter Roeck
  0 siblings, 1 reply; 2+ messages in thread
From: Curtis Klein @ 2021-08-15  2:29 UTC (permalink / raw)
  To: wim, linux; +Cc: curtis.klein, linux-watchdog, linux-kernel

From: JP Ertola <jp.ertola@hpe.com>

If the watchdog timeout is set such that the crash kernel does not
have time to collect a coredump and the crash kernel is not equipped to
ping the watchdog timer itself, then a kernel coredump will not be
collected before the watchdog fires. This change registers a panic
notifier and callback so the watchdog timeout can be extended if a
kernel panic occurs. This timeout extension would give the crash kernel
enough time to collect a coredump before the CPU resets. The watchdog
timeout is extended if and only if a crash kernel image is loaded in
memory, the watchdog is active at the time of the panic, and the kconfig
setting is set.

A Kconfig option has been added to configure the timeout duration at
compile-time. Default is zero seconds.

Signed-off-by: JP Ertola <jp.ertola@hpe.com>
Co-developed-by: Curtis Klein <curtis.klein@hpe.com>
Signed-off-by: Curtis Klein <curtis.klein@hpe.com>
---
v6: 
 - Only register panic notifier if wdt_panic_timeout is set. 
 - Exit notifier callback early if watchdog is not active and hw is not running
 - Handle more scenarios where the wdt_panic_timeout is invalid.
 - Unregister notifier in watchdog_dev_unregister.
 - Rework notifier callback flow

v5: Clean up variable names and spacing. Call __watchdog_ping() instead of 
wdd->ops->ping(). Remove notifier_from_errno() as it could cause unintended
behavior in the future if this watchdog extension notifier has its priority
elevated above minimum.
v4: Remove optional callback mechanism alltogether. I agree with Guenter,
not widely used.
v3: Fix logic so timeout extension is not longer than wdd->max_timeout
v2: Remove dead code and comments.

 drivers/watchdog/Kconfig        | 13 +++++++
 drivers/watchdog/watchdog_dev.c | 84 ++++++++++++++++++++++++++++++++++-------
 include/linux/watchdog.h        |  1 +
 3 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 546dfc1..e7c6230 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -149,6 +149,19 @@ comment "Watchdog Device Drivers"
 
 # Architecture Independent
 
+config DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT
+	int "Default timeout for watchdog timer before crash kernel starts (seconds)"
+	default 0
+	help
+	  This option allows an extended timeout to be used for the watchdog when
+	  the kernel panics and a crash kernel is about to start. This is helpful
+	  when the existing WDT timeout value is less than the time required for
+	  crash kernel to run and the crash kernel is unable to handle the
+	  the watchdog itself. The timeout extension happens last in chain of
+	  kernel panic handler callbacks just in case another panic handler
+	  hangs unexpectedly. When this value is set to 0, the watchdog timeout
+	  will not be changed.
+
 config SOFT_WATCHDOG
 	tristate "Software watchdog"
 	select WATCHDOG_CORE
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 3bab324..68e1281 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -29,20 +29,22 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/cdev.h>		/* For character device */
-#include <linux/errno.h>	/* For the -ENODEV/... values */
-#include <linux/fs.h>		/* For file operations */
-#include <linux/init.h>		/* For __init/__exit/... */
-#include <linux/hrtimer.h>	/* For hrtimers */
-#include <linux/kernel.h>	/* For printk/panic/... */
-#include <linux/kthread.h>	/* For kthread_work */
-#include <linux/miscdevice.h>	/* For handling misc devices */
-#include <linux/module.h>	/* For module stuff/... */
-#include <linux/mutex.h>	/* For mutexes */
-#include <linux/slab.h>		/* For memory functions */
-#include <linux/types.h>	/* For standard types (like size_t) */
-#include <linux/watchdog.h>	/* For watchdog specific items */
-#include <linux/uaccess.h>	/* For copy_to_user/put_user/... */
+#include <linux/cdev.h>		    /* For character device */
+#include <linux/errno.h>	    /* For the -ENODEV/... values */
+#include <linux/fs.h>		    /* For file operations */
+#include <linux/init.h>		    /* For __init/__exit/... */
+#include <linux/hrtimer.h>	    /* For hrtimers */
+#include <linux/kernel.h>	    /* For printk/panic/... */
+#include <linux/kexec.h>	    /* For checking if crash kernel is loaded */
+#include <linux/kthread.h>	    /* For kthread_work */
+#include <linux/miscdevice.h>	    /* For handling misc devices */
+#include <linux/module.h>	    /* For module stuff/... */
+#include <linux/mutex.h>	    /* For mutexes */
+#include <linux/panic_notifier.h>   /* For panic_notifier_list */
+#include <linux/slab.h>		    /* For memory functions */
+#include <linux/types.h>	    /* For standard types (like size_t) */
+#include <linux/watchdog.h>	    /* For watchdog specific items */
+#include <linux/uaccess.h>	    /* For copy_to_user/put_user/... */
 
 #include "watchdog_core.h"
 #include "watchdog_pretimeout.h"
@@ -57,6 +59,9 @@ static struct kthread_worker *watchdog_kworker;
 static bool handle_boot_enabled =
 	IS_ENABLED(CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED);
 
+static unsigned int wdt_panic_timeout =
+	CONFIG_DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT;
+
 static unsigned open_timeout = CONFIG_WATCHDOG_OPEN_TIMEOUT;
 
 static bool watchdog_past_open_deadline(struct watchdog_core_data *data)
@@ -818,6 +823,43 @@ static long watchdog_ioctl(struct file *file, unsigned int cmd,
 	return err;
 }
 
+static int watchdog_panic_notifier(struct notifier_block *nb,
+	unsigned long code, void *data)
+{
+	struct watchdog_device *wdd = container_of(nb, struct watchdog_device, panic_nb);
+	unsigned int timeout = wdt_panic_timeout;
+	int set_timeout_ret, ping_ret;
+
+	if (wdd == NULL || wdt_panic_timeout == 0 || kexec_crash_image == NULL ||
+	    !(watchdog_active(wdd) || watchdog_hw_running(wdd)))
+		return NOTIFY_DONE;
+
+	if (wdd->max_hw_heartbeat_ms && wdd->max_hw_heartbeat_ms < wdt_panic_timeout * 1000)
+		timeout = wdd->max_hw_heartbeat_ms / 1000;
+	else if (watchdog_timeout_invalid(wdd, wdt_panic_timeout))
+		timeout = wdd->max_timeout;
+
+	if (timeout != wdt_panic_timeout)
+		pr_err("watchdog%d: Cannot set specified panic timeout, using %ds.\n",
+			wdd->id, timeout);
+
+	set_timeout_ret = watchdog_set_timeout(wdd, timeout);
+	if (set_timeout_ret)
+		pr_err("watchdog%d: Failed to extend watchdog timeout (%d)\n",
+			wdd->id, set_timeout_ret);
+
+	ping_ret = watchdog_ping(wdd);
+	if (ping_ret)
+		pr_warn("watchdog%d: Failed to ping watchdog (%d)\n",
+			wdd->id, ping_ret);
+
+	if (set_timeout_ret == 0 && ping_ret == 0)
+		pr_info("watchdog%d: Extended watchdog timeout to %d seconds\n",
+			wdd->id, timeout);
+
+	return NOTIFY_DONE;
+}
+
 /*
  *	watchdog_open: open the /dev/watchdog* devices.
  *	@inode: inode of device
@@ -1129,6 +1171,17 @@ int watchdog_dev_register(struct watchdog_device *wdd)
 	if (ret)
 		watchdog_cdev_unregister(wdd);
 
+	if (wdt_panic_timeout) {
+		wdd->panic_nb.priority = INT_MIN; /* Run last */
+		wdd->panic_nb.notifier_call = watchdog_panic_notifier;
+
+		ret = atomic_notifier_chain_register(&panic_notifier_list,
+						     &wdd->panic_nb);
+		if (ret)
+			pr_warn("watchdog%d: Cannot register panic notifier (%d)\n",
+				wdd->id, ret);
+	}
+
 	return ret;
 }
 
@@ -1142,6 +1195,9 @@ int watchdog_dev_register(struct watchdog_device *wdd)
 
 void watchdog_dev_unregister(struct watchdog_device *wdd)
 {
+	if (wdt_panic_timeout)
+		atomic_notifier_chain_unregister(&panic_notifier_list, &wdd->panic_nb);
+
 	watchdog_unregister_pretimeout(wdd);
 	watchdog_cdev_unregister(wdd);
 }
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 9b19e6b..bc7e6e8 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -107,6 +107,7 @@ struct watchdog_device {
 	unsigned int max_hw_heartbeat_ms;
 	struct notifier_block reboot_nb;
 	struct notifier_block restart_nb;
+	struct notifier_block panic_nb;
 	void *driver_data;
 	struct watchdog_core_data *wd_data;
 	unsigned long status;
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-10-11  1:46 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-15  2:29 [PATCH v6] watchdog: Extend watchdog timeout during kernel panic Curtis Klein
2021-10-11  1:46 ` Guenter Roeck

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).