linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] habanalabs: add an option to delay a device reset
@ 2022-02-27  9:44 Oded Gabbay
  2022-02-27 10:26 ` Christophe JAILLET
  0 siblings, 1 reply; 3+ messages in thread
From: Oded Gabbay @ 2022-02-27  9:44 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Several H/W events can be sent adjacently, even due to a single error.
If a hard-reset is triggered as part of handling one of these events,
the following events won't be handled.
The debug info from these missed events is important, sometimes even
more important than the one that was handled.

To allow handling these closely spaced events, add an option to delay a device
reset and use it when resetting due to H/W events.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 8 +++++++-
 drivers/misc/habanalabs/common/habanalabs.h | 4 ++++
 drivers/misc/habanalabs/gaudi/gaudi.c       | 2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index d52381d1fbd2..651326cd22fb 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -13,6 +13,8 @@
 #include <linux/pci.h>
 #include <linux/hwmon.h>
 
+#define HL_RESET_DELAY_USEC		10000	/* 10ms */
+
 enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
 	enum hl_device_status status;
@@ -980,7 +982,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
 	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
 			reset_upon_device_release = false, schedule_hard_reset = false,
-			skip_wq_flush = false;
+			skip_wq_flush, delay_reset;
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	struct hl_ctx *ctx;
 	int i, rc;
@@ -994,6 +996,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
 	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
 	skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE);
+	delay_reset = !!(flags && HL_DRV_RESET_DELAY);
 
 	if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
 		hard_instead_soft = true;
@@ -1043,6 +1046,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		hdev->reset_info.in_reset = 1;
 		spin_unlock(&hdev->reset_info.lock);
 
+		if (delay_reset)
+			usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);
+
 		handle_reset_trigger(hdev, flags);
 
 		/* This still allows the completion of some KDMA ops */
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index cef4717d0916..1edaf6ab67bd 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -142,6 +142,9 @@ enum hl_mmu_page_table_location {
  *
  * - HL_DRV_RESET_FW_FATAL_ERR
  *       Set if reset is due to a fatal error from FW
+ *
+ * - HL_DRV_RESET_DELAY
+ *       Set if a delay should be added before the reset
  */
 
 #define HL_DRV_RESET_HARD		(1 << 0)
@@ -151,6 +154,7 @@ enum hl_mmu_page_table_location {
 #define HL_DRV_RESET_DEV_RELEASE	(1 << 4)
 #define HL_DRV_RESET_BYPASS_REQ_TO_FW	(1 << 5)
 #define HL_DRV_RESET_FW_FATAL_ERR	(1 << 6)
+#define HL_DRV_RESET_DELAY		(1 << 7)
 
 #define HL_MAX_SOBS_PER_MONITOR	8
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 0eed0efae040..21c2b678ff72 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8199,7 +8199,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 					| HL_DRV_RESET_BYPASS_REQ_TO_FW
 					| fw_fatal_err_flag);
 	else if (hdev->hard_reset_on_fw_events)
-		hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag);
 	else
 		hl_fw_unmask_irq(hdev, event_type);
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] habanalabs: add an option to delay a device reset
  2022-02-27  9:44 [PATCH] habanalabs: add an option to delay a device reset Oded Gabbay
@ 2022-02-27 10:26 ` Christophe JAILLET
  2022-02-27 11:48   ` Oded Gabbay
  0 siblings, 1 reply; 3+ messages in thread
From: Christophe JAILLET @ 2022-02-27 10:26 UTC (permalink / raw)
  To: Oded Gabbay, linux-kernel; +Cc: Tomer Tayar

Hi,

Le 27/02/2022 à 10:44, Oded Gabbay a écrit :
> From: Tomer Tayar <ttayar@habana.ai>
> 
> Several H/W events can be sent adjacently, even due to a single error.
> If a hard-reset is triggered as part of handling one of these events,
> the following events won't be handled.
> The debug info from these missed events is important, sometimes even
> more important than the one that was handled.
> 
> To allow handling these close events, add an option to delay a device
> reset and use it when resetting due to H/W events.
> 
> Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
> ---
>   drivers/misc/habanalabs/common/device.c     | 8 +++++++-
>   drivers/misc/habanalabs/common/habanalabs.h | 4 ++++
>   drivers/misc/habanalabs/gaudi/gaudi.c       | 2 +-
>   3 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
> index d52381d1fbd2..651326cd22fb 100644
> --- a/drivers/misc/habanalabs/common/device.c
> +++ b/drivers/misc/habanalabs/common/device.c
> @@ -13,6 +13,8 @@
>   #include <linux/pci.h>
>   #include <linux/hwmon.h>
>   
> +#define HL_RESET_DELAY_USEC		10000	/* 10ms */
> +
>   enum hl_device_status hl_device_status(struct hl_device *hdev)
>   {
>   	enum hl_device_status status;
> @@ -980,7 +982,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
>   {
>   	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
>   			reset_upon_device_release = false, schedule_hard_reset = false,
> -			skip_wq_flush = false;
> +			skip_wq_flush, delay_reset;
>   	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
>   	struct hl_ctx *ctx;
>   	int i, rc;
> @@ -994,6 +996,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
>   	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
>   	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
>   	skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE);
> +	delay_reset = !!(flags && HL_DRV_RESET_DELAY);

s/&&/&/ ?

CJ

>   
>   	if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
>   		hard_instead_soft = true;
> @@ -1043,6 +1046,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
>   		hdev->reset_info.in_reset = 1;
>   		spin_unlock(&hdev->reset_info.lock);
>   
> +		if (delay_reset)
> +			usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);
> +
>   		handle_reset_trigger(hdev, flags);
>   
>   		/* This still allows the completion of some KDMA ops */
> diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
> index cef4717d0916..1edaf6ab67bd 100644
> --- a/drivers/misc/habanalabs/common/habanalabs.h
> +++ b/drivers/misc/habanalabs/common/habanalabs.h
> @@ -142,6 +142,9 @@ enum hl_mmu_page_table_location {
>    *
>    * - HL_DRV_RESET_FW_FATAL_ERR
>    *       Set if reset is due to a fatal error from FW
> + *
> + * - HL_DRV_RESET_DELAY
> + *       Set if a delay should be added before the reset
>    */
>   
>   #define HL_DRV_RESET_HARD		(1 << 0)
> @@ -151,6 +154,7 @@ enum hl_mmu_page_table_location {
>   #define HL_DRV_RESET_DEV_RELEASE	(1 << 4)
>   #define HL_DRV_RESET_BYPASS_REQ_TO_FW	(1 << 5)
>   #define HL_DRV_RESET_FW_FATAL_ERR	(1 << 6)
> +#define HL_DRV_RESET_DELAY		(1 << 7)
>   
>   #define HL_MAX_SOBS_PER_MONITOR	8
>   
> diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
> index 0eed0efae040..21c2b678ff72 100644
> --- a/drivers/misc/habanalabs/gaudi/gaudi.c
> +++ b/drivers/misc/habanalabs/gaudi/gaudi.c
> @@ -8199,7 +8199,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
>   					| HL_DRV_RESET_BYPASS_REQ_TO_FW
>   					| fw_fatal_err_flag);
>   	else if (hdev->hard_reset_on_fw_events)
> -		hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag);
> +		hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag);
>   	else
>   		hl_fw_unmask_irq(hdev, event_type);
>   }


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] habanalabs: add an option to delay a device reset
  2022-02-27 10:26 ` Christophe JAILLET
@ 2022-02-27 11:48   ` Oded Gabbay
  0 siblings, 0 replies; 3+ messages in thread
From: Oded Gabbay @ 2022-02-27 11:48 UTC (permalink / raw)
  To: Christophe JAILLET; +Cc: Linux-Kernel@Vger. Kernel. Org, Tomer Tayar

On Sun, Feb 27, 2022 at 12:26 PM Christophe JAILLET
<christophe.jaillet@wanadoo.fr> wrote:
>
> Hi,
>
> Le 27/02/2022 à 10:44, Oded Gabbay a écrit :
> > From: Tomer Tayar <ttayar@habana.ai>
> >
> > Several H/W events can be sent adjacently, even due to a single error.
> > If a hard-reset is triggered as part of handling one of these events,
> > the following events won't be handled.
> > The debug info from these missed events is important, sometimes even
> > more important than the one that was handled.
> >
> > To allow handling these close events, add an option to delay a device
> > reset and use it when resetting due to H/W events.
> >
> > Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> > Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> > Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
> > ---
> >   drivers/misc/habanalabs/common/device.c     | 8 +++++++-
> >   drivers/misc/habanalabs/common/habanalabs.h | 4 ++++
> >   drivers/misc/habanalabs/gaudi/gaudi.c       | 2 +-
> >   3 files changed, 12 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
> > index d52381d1fbd2..651326cd22fb 100644
> > --- a/drivers/misc/habanalabs/common/device.c
> > +++ b/drivers/misc/habanalabs/common/device.c
> > @@ -13,6 +13,8 @@
> >   #include <linux/pci.h>
> >   #include <linux/hwmon.h>
> >
> > +#define HL_RESET_DELAY_USEC          10000   /* 10ms */
> > +
> >   enum hl_device_status hl_device_status(struct hl_device *hdev)
> >   {
> >       enum hl_device_status status;
> > @@ -980,7 +982,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
> >   {
> >       bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
> >                       reset_upon_device_release = false, schedule_hard_reset = false,
> > -                     skip_wq_flush = false;
> > +                     skip_wq_flush, delay_reset;
> >       u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
> >       struct hl_ctx *ctx;
> >       int i, rc;
> > @@ -994,6 +996,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
> >       from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
> >       fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
> >       skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE);
> > +     delay_reset = !!(flags && HL_DRV_RESET_DELAY);
>
> s/&&/&/ ?
>
> CJ

Yes, of course you are correct. Thanks for noticing that.
Oded
>
> >
> >       if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
> >               hard_instead_soft = true;
> > @@ -1043,6 +1046,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
> >               hdev->reset_info.in_reset = 1;
> >               spin_unlock(&hdev->reset_info.lock);
> >
> > +             if (delay_reset)
> > +                     usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);
> > +
> >               handle_reset_trigger(hdev, flags);
> >
> >               /* This still allows the completion of some KDMA ops */
> > diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
> > index cef4717d0916..1edaf6ab67bd 100644
> > --- a/drivers/misc/habanalabs/common/habanalabs.h
> > +++ b/drivers/misc/habanalabs/common/habanalabs.h
> > @@ -142,6 +142,9 @@ enum hl_mmu_page_table_location {
> >    *
> >    * - HL_DRV_RESET_FW_FATAL_ERR
> >    *       Set if reset is due to a fatal error from FW
> > + *
> > + * - HL_DRV_RESET_DELAY
> > + *       Set if a delay should be added before the reset
> >    */
> >
> >   #define HL_DRV_RESET_HARD           (1 << 0)
> > @@ -151,6 +154,7 @@ enum hl_mmu_page_table_location {
> >   #define HL_DRV_RESET_DEV_RELEASE    (1 << 4)
> >   #define HL_DRV_RESET_BYPASS_REQ_TO_FW       (1 << 5)
> >   #define HL_DRV_RESET_FW_FATAL_ERR   (1 << 6)
> > +#define HL_DRV_RESET_DELAY           (1 << 7)
> >
> >   #define HL_MAX_SOBS_PER_MONITOR     8
> >
> > diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
> > index 0eed0efae040..21c2b678ff72 100644
> > --- a/drivers/misc/habanalabs/gaudi/gaudi.c
> > +++ b/drivers/misc/habanalabs/gaudi/gaudi.c
> > @@ -8199,7 +8199,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
> >                                       | HL_DRV_RESET_BYPASS_REQ_TO_FW
> >                                       | fw_fatal_err_flag);
> >       else if (hdev->hard_reset_on_fw_events)
> > -             hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag);
> > +             hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag);
> >       else
> >               hl_fw_unmask_irq(hdev, event_type);
> >   }
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-02-27 11:48 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-27  9:44 [PATCH] habanalabs: add an option to delay a device reset Oded Gabbay
2022-02-27 10:26 ` Christophe JAILLET
2022-02-27 11:48   ` Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).