amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Use driver mode reset for data poison handling
@ 2024-04-16  4:34 Hawking Zhang
  2024-04-16  5:08 ` Zhou1, Tao
  0 siblings, 1 reply; 5+ messages in thread
From: Hawking Zhang @ 2024-04-16  4:34 UTC (permalink / raw)
  To: amd-gfx, Tao Zhou; +Cc: Hawking Zhang

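Mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. MMHUB requires a
mode-1 reset. Always request the matching reset instead
of first trying to evict the queues of the affected process.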

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..b6caf6eda8a0c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 	case SOC15_IH_CLIENTID_SE2SH:
 	case SOC15_IH_CLIENTID_SE3SH:
 	case SOC15_IH_CLIENTID_UTCL2:
-		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
 		block = AMDGPU_RAS_BLOCK__GFX;
-		if (ret)
-			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+		reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
 		break;
 	case SOC15_IH_CLIENTID_VMC:
 	case SOC15_IH_CLIENTID_VMC1:
-		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
 		block = AMDGPU_RAS_BLOCK__MMHUB;
-		if (ret)
-			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+		reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
 		break;
 	case SOC15_IH_CLIENTID_SDMA0:
 	case SOC15_IH_CLIENTID_SDMA1:
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* RE: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling
  2024-04-16  4:34 [PATCH] drm/amdgpu: Use driver mode reset for data poison handling Hawking Zhang
@ 2024-04-16  5:08 ` Zhou1, Tao
  2024-04-16  5:51   ` Zhang, Hawking
  0 siblings, 1 reply; 5+ messages in thread
From: Zhou1, Tao @ 2024-04-16  5:08 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx; +Cc: Zhang, Hawking

[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang@amd.com>
> Sent: Tuesday, April 16, 2024 12:34 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling
>
> mode-2 reset is the only reliable method that can get GC/SDMA back when
> poison is consumed. mmhub requires
> mode-1 reset.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 8 ++------
>  1 file changed, 2 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index c368c70df3f4a..b6caf6eda8a0c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -163,17 +163,13 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>       case SOC15_IH_CLIENTID_SE2SH:
>       case SOC15_IH_CLIENTID_SE3SH:
>       case SOC15_IH_CLIENTID_UTCL2:
> -             ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
>               block = AMDGPU_RAS_BLOCK__GFX;
> -             if (ret)
> -                     reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +             reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
>               break;
>       case SOC15_IH_CLIENTID_VMC:
>       case SOC15_IH_CLIENTID_VMC1:
> -             ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
>               block = AMDGPU_RAS_BLOCK__MMHUB;
> -             if (ret)
> -                     reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> +             reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
>               break;
>       case SOC15_IH_CLIENTID_SDMA0:
>       case SOC15_IH_CLIENTID_SDMA1:
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling
  2024-04-16  5:08 ` Zhou1, Tao
@ 2024-04-16  5:51   ` Zhang, Hawking
  0 siblings, 0 replies; 5+ messages in thread
From: Zhang, Hawking @ 2024-04-16  5:51 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx

[AMD Official Use Only - General]

Please ignore this one; I will send out a new one.

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Tuesday, April 16, 2024 01:08
To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling

[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang@amd.com>
> Sent: Tuesday, April 16, 2024 12:34 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison
> handling
>
> mode-2 reset is the only reliable method that can get GC/SDMA back
> when poison is consumed. mmhub requires
> mode-1 reset.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 8 ++------
>  1 file changed, 2 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index c368c70df3f4a..b6caf6eda8a0c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -163,17 +163,13 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>       case SOC15_IH_CLIENTID_SE2SH:
>       case SOC15_IH_CLIENTID_SE3SH:
>       case SOC15_IH_CLIENTID_UTCL2:
> -             ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
>               block = AMDGPU_RAS_BLOCK__GFX;
> -             if (ret)
> -                     reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +             reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
>               break;
>       case SOC15_IH_CLIENTID_VMC:
>       case SOC15_IH_CLIENTID_VMC1:
> -             ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
>               block = AMDGPU_RAS_BLOCK__MMHUB;
> -             if (ret)
> -                     reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> +             reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
>               break;
>       case SOC15_IH_CLIENTID_SDMA0:
>       case SOC15_IH_CLIENTID_SDMA1:
> --
> 2.17.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling
  2024-04-16  5:56 Hawking Zhang
@ 2024-04-17 18:59 ` Deucher, Alexander
  0 siblings, 0 replies; 5+ messages in thread
From: Deucher, Alexander @ 2024-04-17 18:59 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx, Zhou1, Tao

[-- Attachment #1: Type: text/plain, Size: 3158 bytes --]

[Public]

Acked-by: Alex Deucher <alexander.deucher@amd.com>
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Hawking Zhang <Hawking.Zhang@amd.com>
Sent: Tuesday, April 16, 2024 1:56 AM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Zhou1, Tao <Tao.Zhou1@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling

mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. mmhub requires
mode-1 reset.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 22 +++----------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..94eb2493103ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                                 uint16_t pasid, uint16_t client_id)
 {
         enum amdgpu_ras_block block = 0;
-       int old_poison, ret = -EINVAL;
+       int old_poison;
         uint32_t reset = 0;
         struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
         case SOC15_IH_CLIENTID_SE2SH:
         case SOC15_IH_CLIENTID_SE3SH:
         case SOC15_IH_CLIENTID_UTCL2:
-               ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
                 block = AMDGPU_RAS_BLOCK__GFX;
-               if (ret)
-                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                 break;
         case SOC15_IH_CLIENTID_VMC:
         case SOC15_IH_CLIENTID_VMC1:
-               ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
                 block = AMDGPU_RAS_BLOCK__MMHUB;
-               if (ret)
-                       reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+               reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                 break;
         case SOC15_IH_CLIENTID_SDMA0:
         case SOC15_IH_CLIENTID_SDMA1:
@@ -189,18 +185,6 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,

         kfd_signal_poison_consumed_event(dev, pasid);

-       /* resetting queue passes, do page retirement without gpu reset
-        * resetting queue fails, fallback to gpu reset solution
-        */
-       if (!ret)
-               dev_warn(dev->adev->dev,
-                       "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
-                       client_id);
-       else
-               dev_warn(dev->adev->dev,
-                       "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
-                       client_id);
-
         amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }

--
2.17.1


[-- Attachment #2: Type: text/html, Size: 6894 bytes --]

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH] drm/amdgpu: Use driver mode reset for data poison handling
@ 2024-04-16  5:56 Hawking Zhang
  2024-04-17 18:59 ` Deucher, Alexander
  0 siblings, 1 reply; 5+ messages in thread
From: Hawking Zhang @ 2024-04-16  5:56 UTC (permalink / raw)
  To: amd-gfx, Tao Zhou; +Cc: Hawking Zhang

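Mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. MMHUB requires a
mode-1 reset. Always request the matching reset instead
of first trying to evict the queues of the affected process,
and drop the now-unused return value and its log messages.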

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 22 +++----------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..94eb2493103ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 				uint16_t pasid, uint16_t client_id)
 {
 	enum amdgpu_ras_block block = 0;
-	int old_poison, ret = -EINVAL;
+	int old_poison;
 	uint32_t reset = 0;
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 	case SOC15_IH_CLIENTID_SE2SH:
 	case SOC15_IH_CLIENTID_SE3SH:
 	case SOC15_IH_CLIENTID_UTCL2:
-		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
 		block = AMDGPU_RAS_BLOCK__GFX;
-		if (ret)
-			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+		reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
 		break;
 	case SOC15_IH_CLIENTID_VMC:
 	case SOC15_IH_CLIENTID_VMC1:
-		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
 		block = AMDGPU_RAS_BLOCK__MMHUB;
-		if (ret)
-			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+		reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
 		break;
 	case SOC15_IH_CLIENTID_SDMA0:
 	case SOC15_IH_CLIENTID_SDMA1:
@@ -189,18 +185,6 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 
 	kfd_signal_poison_consumed_event(dev, pasid);
 
-	/* resetting queue passes, do page retirement without gpu reset
-	 * resetting queue fails, fallback to gpu reset solution
-	 */
-	if (!ret)
-		dev_warn(dev->adev->dev,
-			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
-			client_id);
-	else
-		dev_warn(dev->adev->dev,
-			"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
-			client_id);
-
 	amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-04-17 19:00 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-16  4:34 [PATCH] drm/amdgpu: Use driver mode reset for data poison handling Hawking Zhang
2024-04-16  5:08 ` Zhou1, Tao
2024-04-16  5:51   ` Zhang, Hawking
2024-04-16  5:56 Hawking Zhang
2024-04-17 18:59 ` Deucher, Alexander

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).