Linux-Watchdog Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v3] Extend watchdog timeout during kernel panic.
@ 2021-01-28  1:14 JP Ertola
  2021-01-28  1:54 ` Guenter Roeck
  0 siblings, 1 reply; 2+ messages in thread
From: JP Ertola @ 2021-01-28  1:14 UTC (permalink / raw)
  To: wim, linux, linux-watchdog; +Cc: JP Ertola

If the watchdog timeout is set such that the crash kernel does not
have time to collect a coredump and the crash kernel is not equipped to
ping the watchdog timer itself, then a kernel coredump will not be collected
before the watchdog fires. This change registers a panic notifier and
callback so the watchdog timeout can be extended if a kernel panic occurs.
This timeout extension would give the crash kernel enough time to collect
a coredump before the CPU resets. The watchdog timeout is extended if and only
if a crash kernel image is loaded in memory and the watchdog is active at the
time of the panic.

Drivers have the option of implementing their own platform-dependent (PD)
callback at the same time they register their watchdog device (wdd) with the
Linux kernel. By registering their own PD callback, the watchdog device
can extend the watchdog timeout and perform other tasks in a panic context.

This may be desireable when uncommon hardware platforms are used. For
example, a power management subsystem controlled by an FPGA attached to
the CPU.

A Kconfig option has been added to configure the timeout duration at
compile-time. Default is zero seconds.

Signed-off-by: JP Ertola <jp.ertola@hpe.com>
---
 drivers/watchdog/Kconfig        | 13 +++++
 drivers/watchdog/watchdog_dev.c | 84 ++++++++++++++++++++++++++++++++-
 include/linux/watchdog.h        |  3 ++
 3 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index fd7968635e6d..f1055985e100 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -141,6 +141,19 @@ comment "Watchdog Device Drivers"
 
 # Architecture Independent
 
+config DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT
+	int "Default timeout for watchdog timer before crash kernel starts (seconds)"
+	default 0
+	help
+	  This option allows an extended timeout to be used for the watchdog when
+	  the kernel panics and a crash kernel is about to start. This is helpful
+	  when the existing WDT timeout value is less than the time required for
+	  crash kernel to run and the crash kernel is unable to handle the
+	  the watchdog itself. The timeout extension happens last in chain of
+	  kernel panic handler callbacks just in case another panic handler
+	  hangs unexpectedly. When this value is set to 0, the watchdog timeout
+	  will not be changed.
+
 config SOFT_WATCHDOG
 	tristate "Software watchdog"
 	select WATCHDOG_CORE
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 2946f3a63110..e2d056c70ca7 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -34,6 +34,7 @@
 #include <linux/init.h>		/* For __init/__exit/... */
 #include <linux/hrtimer.h>	/* For hrtimers */
 #include <linux/kernel.h>	/* For printk/panic/... */
+#include <linux/kexec.h>	/* For checking if crash kernel is loaded */
 #include <linux/kthread.h>	/* For kthread_work */
 #include <linux/miscdevice.h>	/* For handling misc devices */
 #include <linux/module.h>	/* For module stuff/... */
@@ -82,6 +83,8 @@ static bool handle_boot_enabled =
 
 static unsigned open_timeout = CONFIG_WATCHDOG_OPEN_TIMEOUT;
 
+static unsigned int wdt_panic_timeout = CONFIG_DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT;
+
 static bool watchdog_past_open_deadline(struct watchdog_core_data *data)
 {
 	return ktime_after(ktime_get(), data->open_deadline);
@@ -658,6 +661,61 @@ static int watchdog_ioctl_op(struct watchdog_device *wdd, unsigned int cmd,
  *	off the watchdog (if 'nowayout' is not set).
  */
 
+static int watchdog_panic_notifier(struct notifier_block *nb,
+	unsigned long code, void *data)
+{
+	struct watchdog_device *wdd;
+	int ret = 0;
+	unsigned int time_out = 0; // seconds
+
+	if (wdt_panic_timeout == 0)
+		return NOTIFY_DONE;
+
+	if (watchdog_timeout_invalid(wdt_panic_timeout)) {
+		time_out = min(wdt_panic_timeout, wdd->max_timeout);
+		pr_err("watchdog%d: timeout extension value "
+			" invalid. Falling back to %d seconds\n", time_out);
+	} else {
+		time_out = wdt_panic_timeout;
+	}
+
+	wdd = container_of(nb, struct watchdog_device, panic_nb);
+
+	if (kexec_crash_image && watchdog_active(wdd)) {
+		if (wdd->ops->panic_cb) {
+			ret = wdd->ops->panic_cb(wdd, time_out);
+		} else {
+			int ping_ret;
+
+			pr_info("watchdog%d: Extending watchdog timeout to "
+				"%d seconds\n", wdd->id, time_out);
+
+			ret = watchdog_set_timeout(wdd, time_out);
+
+			/* Many watchdog implementations will reset the timer
+			 * when the timeout is changed, but ping again to be
+			 * safe.
+			 */
+			if (wdd->ops->ping) {
+				ping_ret = wdd->ops->ping(wdd);
+				if (ping_ret) {
+					pr_warn("watchdog%d: Unable to ping "
+						"watchdog before starting "
+						"crash kernel (%d)\n",
+						wdd->id, ping_ret);
+				}
+			}
+		}
+		if (ret) {
+			pr_err("watchdog%d: Unable to extend watchdog timeout "
+				"before starting crash kernel (%d)",
+				wdd->id, ret);
+		}
+	}
+
+	return notifier_from_errno(ret);
+}
+
 static ssize_t watchdog_write(struct file *file, const char __user *data,
 						size_t len, loff_t *ppos)
 {
@@ -1118,8 +1176,27 @@ int watchdog_dev_register(struct watchdog_device *wdd)
 		return ret;
 
 	ret = watchdog_register_pretimeout(wdd);
-	if (ret)
+	if (ret) {
 		watchdog_cdev_unregister(wdd);
+		return ret;
+	}
+
+	/*
+	 * Setting panic_nb priority to minimum ensures the watchdog device
+	 * panic callback runs last in the chain of kernel panic callbacks.
+	 * This way, the watchdog device will fire in the event another
+	 * panic callback hangs.
+	 */
+	if (wdd->ops->panic_cb) {
+		wdd->panic_nb.priority = INT_MIN;
+		wdd->panic_nb.notifier_call = watchdog_panic_notifier;
+
+		ret = atomic_notifier_chain_register(&panic_notifier_list,
+						&wdd->panic_nb);
+		if (ret)
+			pr_err("watchdog%d: Cannot register panic notifier (%d)\n",
+				wdd->id, ret);
+	}
 
 	return ret;
 }
@@ -1228,3 +1305,8 @@ module_param(open_timeout, uint, 0644);
 MODULE_PARM_DESC(open_timeout,
 	"Maximum time (in seconds, 0 means infinity) for userspace to take over a running watchdog (default="
 	__MODULE_STRING(CONFIG_WATCHDOG_OPEN_TIMEOUT) ")");
+
+module_param(wdt_panic_timeout, uint, 0444);
+MODULE_PARM_DESC(wdt_panic_timeout,
+	"Watchdog timeout extension duration upon kernel panic. (default="
+	__MODULE_STRING(CONFIG_DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT) " seconds)");
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 9b19e6bb68b5..f79f41cca156 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -34,6 +34,7 @@ struct watchdog_governor;
  * @get_timeleft:The routine that gets the time left before a reset (in seconds).
  * @restart:	The routine for restarting the machine.
  * @ioctl:	The routines that handles extra ioctl calls.
+ * @panic_cb:	The routine that extends the watchdog timeout before the crash kernel starts.
  *
  * The watchdog_ops structure contains a list of low-level operations
  * that control a watchdog device. It also contains the module that owns
@@ -53,6 +54,7 @@ struct watchdog_ops {
 	unsigned int (*get_timeleft)(struct watchdog_device *);
 	int (*restart)(struct watchdog_device *, unsigned long, void *);
 	long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long);
+	int (*panic_cb)(struct watchdog_device *, unsigned int);
 };
 
 /** struct watchdog_device - The structure that defines a watchdog device
@@ -107,6 +109,7 @@ struct watchdog_device {
 	unsigned int max_hw_heartbeat_ms;
 	struct notifier_block reboot_nb;
 	struct notifier_block restart_nb;
+	struct notifier_block panic_nb;
 	void *driver_data;
 	struct watchdog_core_data *wd_data;
 	unsigned long status;
-- 
2.29.0


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH v3] Extend watchdog timeout during kernel panic.
  2021-01-28  1:14 [PATCH v3] Extend watchdog timeout during kernel panic JP Ertola
@ 2021-01-28  1:54 ` Guenter Roeck
  0 siblings, 0 replies; 2+ messages in thread
From: Guenter Roeck @ 2021-01-28  1:54 UTC (permalink / raw)
  To: JP Ertola, wim, linux-watchdog

On 1/27/21 5:14 PM, JP Ertola wrote:
> If the watchdog timeout is set such that the crash kernel does not
> have time to collect a coredump and the crash kernel is not equipped to
> ping the watchdog timer itself, then a kernel coredump will not be collected
> before the watchdog fires. This change registers a panic notifier and
> callback so the watchdog timeout can be extended if a kernel panic occurs.
> This timeout extension would give the crash kernel enough time to collect
> a coredump before the CPU resets. The watchdog timeout is extended if and only
> if a crash kernel image is loaded in memory and the watchdog is active at the
> time of the panic.
> 
> Drivers have the option of implementing their own platform-dependent (PD)
> callback at the same time they register their watchdog device (wdd) with the
> Linux kernel. By registering their own PD callback, the watchdog device
> can extend the watchdog timeout and perform other tasks in a panic context.
> 
> This may be desireable when uncommon hardware platforms are used. For
> example, a power management subsystem controlled by an FPGA attached to
> the CPU.

This is only acceptable if there is an actual user for this callback.
I won't accept adding such a callback without a user. If you have one
in mind, please submit the patch adding it as part of a series.
That will let us evaluate if the callback is really needed and
what it is used for. Otherwise, please drop the callback.

> 
> A Kconfig option has been added to configure the timeout duration at
> compile-time. Default is zero seconds.
> 
> Signed-off-by: JP Ertola <jp.ertola@hpe.com>
> ---

Change log goes here, and I am not going back to v2 and v1 to try to figure
out what changed.

Guenter

>  drivers/watchdog/Kconfig        | 13 +++++
>  drivers/watchdog/watchdog_dev.c | 84 ++++++++++++++++++++++++++++++++-
>  include/linux/watchdog.h        |  3 ++
>  3 files changed, 99 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
> index fd7968635e6d..f1055985e100 100644
> --- a/drivers/watchdog/Kconfig
> +++ b/drivers/watchdog/Kconfig
> @@ -141,6 +141,19 @@ comment "Watchdog Device Drivers"
>  
>  # Architecture Independent
>  
> +config DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT
> +	int "Default timeout for watchdog timer before crash kernel starts (seconds)"
> +	default 0
> +	help
> +	  This option allows an extended timeout to be used for the watchdog when
> +	  the kernel panics and a crash kernel is about to start. This is helpful
> +	  when the existing WDT timeout value is less than the time required for
> +	  crash kernel to run and the crash kernel is unable to handle the
> +	  the watchdog itself. The timeout extension happens last in chain of
> +	  kernel panic handler callbacks just in case another panic handler
> +	  hangs unexpectedly. When this value is set to 0, the watchdog timeout
> +	  will not be changed.
> +
>  config SOFT_WATCHDOG
>  	tristate "Software watchdog"
>  	select WATCHDOG_CORE
> diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
> index 2946f3a63110..e2d056c70ca7 100644
> --- a/drivers/watchdog/watchdog_dev.c
> +++ b/drivers/watchdog/watchdog_dev.c
> @@ -34,6 +34,7 @@
>  #include <linux/init.h>		/* For __init/__exit/... */
>  #include <linux/hrtimer.h>	/* For hrtimers */
>  #include <linux/kernel.h>	/* For printk/panic/... */
> +#include <linux/kexec.h>	/* For checking if crash kernel is loaded */
>  #include <linux/kthread.h>	/* For kthread_work */
>  #include <linux/miscdevice.h>	/* For handling misc devices */
>  #include <linux/module.h>	/* For module stuff/... */
> @@ -82,6 +83,8 @@ static bool handle_boot_enabled =
>  
>  static unsigned open_timeout = CONFIG_WATCHDOG_OPEN_TIMEOUT;
>  
> +static unsigned int wdt_panic_timeout = CONFIG_DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT;
> +
>  static bool watchdog_past_open_deadline(struct watchdog_core_data *data)
>  {
>  	return ktime_after(ktime_get(), data->open_deadline);
> @@ -658,6 +661,61 @@ static int watchdog_ioctl_op(struct watchdog_device *wdd, unsigned int cmd,
>   *	off the watchdog (if 'nowayout' is not set).
>   */
>  
> +static int watchdog_panic_notifier(struct notifier_block *nb,
> +	unsigned long code, void *data)
> +{
> +	struct watchdog_device *wdd;
> +	int ret = 0;
> +	unsigned int time_out = 0; // seconds
> +
> +	if (wdt_panic_timeout == 0)
> +		return NOTIFY_DONE;
> +
> +	if (watchdog_timeout_invalid(wdt_panic_timeout)) {
> +		time_out = min(wdt_panic_timeout, wdd->max_timeout);
> +		pr_err("watchdog%d: timeout extension value "
> +			" invalid. Falling back to %d seconds\n", time_out);
> +	} else {
> +		time_out = wdt_panic_timeout;
> +	}
> +
> +	wdd = container_of(nb, struct watchdog_device, panic_nb);
> +
> +	if (kexec_crash_image && watchdog_active(wdd)) {
> +		if (wdd->ops->panic_cb) {
> +			ret = wdd->ops->panic_cb(wdd, time_out);
> +		} else {
> +			int ping_ret;
> +
> +			pr_info("watchdog%d: Extending watchdog timeout to "
> +				"%d seconds\n", wdd->id, time_out);
> +
> +			ret = watchdog_set_timeout(wdd, time_out);
> +
> +			/* Many watchdog implementations will reset the timer
> +			 * when the timeout is changed, but ping again to be
> +			 * safe.
> +			 */
> +			if (wdd->ops->ping) {
> +				ping_ret = wdd->ops->ping(wdd);
> +				if (ping_ret) {
> +					pr_warn("watchdog%d: Unable to ping "
> +						"watchdog before starting "
> +						"crash kernel (%d)\n",
> +						wdd->id, ping_ret);
> +				}
> +			}
> +		}
> +		if (ret) {
> +			pr_err("watchdog%d: Unable to extend watchdog timeout "
> +				"before starting crash kernel (%d)",
> +				wdd->id, ret);
> +		}
> +	}
> +
> +	return notifier_from_errno(ret);
> +}
> +
>  static ssize_t watchdog_write(struct file *file, const char __user *data,
>  						size_t len, loff_t *ppos)
>  {
> @@ -1118,8 +1176,27 @@ int watchdog_dev_register(struct watchdog_device *wdd)
>  		return ret;
>  
>  	ret = watchdog_register_pretimeout(wdd);
> -	if (ret)
> +	if (ret) {
>  		watchdog_cdev_unregister(wdd);
> +		return ret;
> +	}
> +
> +	/*
> +	 * Setting panic_nb priority to minimum ensures the watchdog device
> +	 * panic callback runs last in the chain of kernel panic callbacks.
> +	 * This way, the watchdog device will fire in the event another
> +	 * panic callback hangs.
> +	 */
> +	if (wdd->ops->panic_cb) {
> +		wdd->panic_nb.priority = INT_MIN;
> +		wdd->panic_nb.notifier_call = watchdog_panic_notifier;
> +
> +		ret = atomic_notifier_chain_register(&panic_notifier_list,
> +						&wdd->panic_nb);
> +		if (ret)
> +			pr_err("watchdog%d: Cannot register panic notifier (%d)\n",
> +				wdd->id, ret);
> +	}
>  
>  	return ret;
>  }
> @@ -1228,3 +1305,8 @@ module_param(open_timeout, uint, 0644);
>  MODULE_PARM_DESC(open_timeout,
>  	"Maximum time (in seconds, 0 means infinity) for userspace to take over a running watchdog (default="
>  	__MODULE_STRING(CONFIG_WATCHDOG_OPEN_TIMEOUT) ")");
> +
> +module_param(wdt_panic_timeout, uint, 0444);
> +MODULE_PARM_DESC(wdt_panic_timeout,
> +	"Watchdog timeout extension duration upon kernel panic. (default="
> +	__MODULE_STRING(CONFIG_DEFAULT_WATCHDOG_CRASH_KERNEL_TIMEOUT) " seconds)");
> diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
> index 9b19e6bb68b5..f79f41cca156 100644
> --- a/include/linux/watchdog.h
> +++ b/include/linux/watchdog.h
> @@ -34,6 +34,7 @@ struct watchdog_governor;
>   * @get_timeleft:The routine that gets the time left before a reset (in seconds).
>   * @restart:	The routine for restarting the machine.
>   * @ioctl:	The routines that handles extra ioctl calls.
> + * @panic_cb:	The routine that extends the watchdog timeout before the crash kernel starts.
>   *
>   * The watchdog_ops structure contains a list of low-level operations
>   * that control a watchdog device. It also contains the module that owns
> @@ -53,6 +54,7 @@ struct watchdog_ops {
>  	unsigned int (*get_timeleft)(struct watchdog_device *);
>  	int (*restart)(struct watchdog_device *, unsigned long, void *);
>  	long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long);
> +	int (*panic_cb)(struct watchdog_device *, unsigned int);
>  };
>  
>  /** struct watchdog_device - The structure that defines a watchdog device
> @@ -107,6 +109,7 @@ struct watchdog_device {
>  	unsigned int max_hw_heartbeat_ms;
>  	struct notifier_block reboot_nb;
>  	struct notifier_block restart_nb;
> +	struct notifier_block panic_nb;
>  	void *driver_data;
>  	struct watchdog_core_data *wd_data;
>  	unsigned long status;
> 


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, back to index

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-28  1:14 [PATCH v3] Extend watchdog timeout during kernel panic JP Ertola
2021-01-28  1:54 ` Guenter Roeck

Linux-Watchdog Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-watchdog/0 linux-watchdog/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-watchdog linux-watchdog/ https://lore.kernel.org/linux-watchdog \
		linux-watchdog@vger.kernel.org
	public-inbox-index linux-watchdog

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-watchdog


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git