All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
@ 2023-12-21  6:05 Stanley.Yang
  2023-12-21  8:31 ` Zhou1, Tao
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Stanley.Yang @ 2023-12-21  6:05 UTC (permalink / raw)
  To: amd-gfx, Hawking.Zhang; +Cc: Stanley.Yang

The ecc_irq is disabled during the GPU mode2 reset suspend process,
but is not re-enabled during the GPU mode2 reset resume process.

Changed from V1:
	only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip
	delete amdgpu_ras_late_resume function

Changed from V2:
	check umc ras supported before put ecc_irq

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  4 ++++
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 ++++
 4 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 02f4c6f9d4f6..b60a3c1bd0f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 {
 	struct list_head *reset_device_list = reset_context->reset_device_list;
 	struct amdgpu_device *tmp_adev = NULL;
+	struct amdgpu_ras *con;
 	int r;
 
 	if (reset_device_list == NULL)
@@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 		 */
 		amdgpu_register_gpu_instance(tmp_adev);
 
-		/* Resume RAS */
+		/* Resume RAS, ecc_irq */
+		con = amdgpu_ras_get_context(tmp_adev);
+		if (!amdgpu_sriov_vf(tmp_adev) && con) {
+			if (tmp_adev->sdma.ras &&
+				amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA) &&
+				tmp_adev->sdma.ras->ras_block.ras_late_init) {
+				r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
+						&tmp_adev->sdma.ras->ras_block.ras_comm);
+				if (r) {
+					dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
+					goto end;
+				}
+			}
+
+			if (tmp_adev->gfx.ras &&
+				amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX) &&
+				tmp_adev->gfx.ras->ras_block.ras_late_init) {
+				r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
+						&tmp_adev->gfx.ras->ras_block.ras_comm);
+				if (r) {
+					dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
+					goto end;
+				}
+			}
+		}
+
 		amdgpu_ras_resume(tmp_adev);
 
 		/* Update PSP FW topology after reset */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 09cbca596bb5..4048539205cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -1043,6 +1043,10 @@ static int gmc_v10_0_hw_fini(void *handle)
 
 	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
 
+	if (adev->gmc.ecc_irq.funcs &&
+		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 416f3e4f0438..e1ca5a599971 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -941,6 +941,11 @@ static int gmc_v11_0_hw_fini(void *handle)
 	}
 
 	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+
+	if (adev->gmc.ecc_irq.funcs &&
+		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
 	gmc_v11_0_gart_disable(adev);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 205db28a9803..f00e5c8c79b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -2388,6 +2388,10 @@ static int gmc_v9_0_hw_fini(void *handle)
 
 	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
 
+	if (adev->gmc.ecc_irq.funcs &&
+		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
 	return 0;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* RE: [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
  2023-12-21  6:05 [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired Stanley.Yang
@ 2023-12-21  8:31 ` Zhou1, Tao
  2023-12-21  8:35 ` Zhang, Hawking
  2023-12-22  9:08 ` Lazar, Lijo
  2 siblings, 0 replies; 4+ messages in thread
From: Zhou1, Tao @ 2023-12-21  8:31 UTC (permalink / raw)
  To: Yang, Stanley, amd-gfx, Zhang, Hawking; +Cc: Yang, Stanley

[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Stanley.Yang
> Sent: Thursday, December 21, 2023 2:05 PM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>
> Cc: Yang, Stanley <Stanley.Yang@amd.com>
> Subject: [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
>
> The ecc_irq is disabled while GPU mode2 reset suspending process, but not be
> enabled during GPU mode2 reset resume process.
>
> Changed from V1:
>       only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip
>       delete amdgpu_ras_late_resume function
>
> Changed from V2:
>       check umc ras supported before put ecc_irq
>
> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  4 ++++
> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  5 +++++
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 ++++
>  4 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 02f4c6f9d4f6..b60a3c1bd0f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,  {
>       struct list_head *reset_device_list = reset_context->reset_device_list;
>       struct amdgpu_device *tmp_adev = NULL;
> +     struct amdgpu_ras *con;
>       int r;
>
>       if (reset_device_list == NULL)
> @@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,
>                */
>               amdgpu_register_gpu_instance(tmp_adev);
>
> -             /* Resume RAS */
> +             /* Resume RAS, ecc_irq */
> +             con = amdgpu_ras_get_context(tmp_adev);
> +             if (!amdgpu_sriov_vf(tmp_adev) && con) {
> +                     if (tmp_adev->sdma.ras &&
> +                             amdgpu_ras_is_supported(tmp_adev,
> AMDGPU_RAS_BLOCK__SDMA) &&
> +                             tmp_adev->sdma.ras->ras_block.ras_late_init) {
> +                             r = tmp_adev->sdma.ras-
> >ras_block.ras_late_init(tmp_adev,
> +                                             &tmp_adev->sdma.ras-
> >ras_block.ras_comm);
> +                             if (r) {
> +                                     dev_err(tmp_adev->dev, "SDMA failed
> to execute ras_late_init! ret:%d\n", r);
> +                                     goto end;
> +                             }
> +                     }
> +
> +                     if (tmp_adev->gfx.ras &&
> +                             amdgpu_ras_is_supported(tmp_adev,
> AMDGPU_RAS_BLOCK__GFX) &&
> +                             tmp_adev->gfx.ras->ras_block.ras_late_init) {
> +                             r = tmp_adev->gfx.ras-
> >ras_block.ras_late_init(tmp_adev,
> +                                             &tmp_adev->gfx.ras-
> >ras_block.ras_comm);
> +                             if (r) {
> +                                     dev_err(tmp_adev->dev, "GFX failed to
> execute ras_late_init! ret:%d\n", r);
> +                                     goto end;
> +                             }
> +                     }
> +             }
> +
>               amdgpu_ras_resume(tmp_adev);
>
>               /* Update PSP FW topology after reset */ diff --git
> a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 09cbca596bb5..4048539205cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -1043,6 +1043,10 @@ static int gmc_v10_0_hw_fini(void *handle)
>
>       amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> +     if (adev->gmc.ecc_irq.funcs &&
> +             amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +             amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>       return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 416f3e4f0438..e1ca5a599971 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -941,6 +941,11 @@ static int gmc_v11_0_hw_fini(void *handle)
>       }
>
>       amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> +
> +     if (adev->gmc.ecc_irq.funcs &&
> +             amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +             amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>       gmc_v11_0_gart_disable(adev);
>
>       return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 205db28a9803..f00e5c8c79b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -2388,6 +2388,10 @@ static int gmc_v9_0_hw_fini(void *handle)
>
>       amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> +     if (adev->gmc.ecc_irq.funcs &&
> +             amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +             amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>       return 0;
>  }
>
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
  2023-12-21  6:05 [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired Stanley.Yang
  2023-12-21  8:31 ` Zhou1, Tao
@ 2023-12-21  8:35 ` Zhang, Hawking
  2023-12-22  9:08 ` Lazar, Lijo
  2 siblings, 0 replies; 4+ messages in thread
From: Zhang, Hawking @ 2023-12-21  8:35 UTC (permalink / raw)
  To: Yang, Stanley, amd-gfx; +Cc: Yang, Stanley

[AMD Official Use Only - General]

Feel free to drop the checks below, since amdgpu_xxx_ras_late_init already applies the check for interrupt enablement.
+                               amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA)
+                               amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX)

Apart from that, the change is

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Stanley.Yang
Sent: Thursday, December 21, 2023 14:05
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Yang, Stanley <Stanley.Yang@amd.com>
Subject: [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired

The ecc_irq is disabled while GPU mode2 reset suspending process, but not be enabled during GPU mode2 reset resume process.

Changed from V1:
        only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip
        delete amdgpu_ras_late_resume function

Changed from V2:
        check umc ras supported before put ecc_irq

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  4 ++++
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 ++++
 4 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 02f4c6f9d4f6..b60a3c1bd0f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,  {
        struct list_head *reset_device_list = reset_context->reset_device_list;
        struct amdgpu_device *tmp_adev = NULL;
+       struct amdgpu_ras *con;
        int r;

        if (reset_device_list == NULL)
@@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
                 */
                amdgpu_register_gpu_instance(tmp_adev);

-               /* Resume RAS */
+               /* Resume RAS, ecc_irq */
+               con = amdgpu_ras_get_context(tmp_adev);
+               if (!amdgpu_sriov_vf(tmp_adev) && con) {
+                       if (tmp_adev->sdma.ras &&
+                               amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA) &&
+                               tmp_adev->sdma.ras->ras_block.ras_late_init) {
+                               r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
+                                               &tmp_adev->sdma.ras->ras_block.ras_comm);
+                               if (r) {
+                                       dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
+                                       goto end;
+                               }
+                       }
+
+                       if (tmp_adev->gfx.ras &&
+                               amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX) &&
+                               tmp_adev->gfx.ras->ras_block.ras_late_init) {
+                               r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
+                                               &tmp_adev->gfx.ras->ras_block.ras_comm);
+                               if (r) {
+                                       dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
+                                       goto end;
+                               }
+                       }
+               }
+
                amdgpu_ras_resume(tmp_adev);

                /* Update PSP FW topology after reset */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 09cbca596bb5..4048539205cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -1043,6 +1043,10 @@ static int gmc_v10_0_hw_fini(void *handle)

        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);

+       if (adev->gmc.ecc_irq.funcs &&
+               amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        return 0;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 416f3e4f0438..e1ca5a599971 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -941,6 +941,11 @@ static int gmc_v11_0_hw_fini(void *handle)
        }

        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+
+       if (adev->gmc.ecc_irq.funcs &&
+               amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        gmc_v11_0_gart_disable(adev);

        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 205db28a9803..f00e5c8c79b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -2388,6 +2388,10 @@ static int gmc_v9_0_hw_fini(void *handle)

        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);

+       if (adev->gmc.ecc_irq.funcs &&
+               amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        return 0;
 }

--
2.25.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
  2023-12-21  6:05 [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired Stanley.Yang
  2023-12-21  8:31 ` Zhou1, Tao
  2023-12-21  8:35 ` Zhang, Hawking
@ 2023-12-22  9:08 ` Lazar, Lijo
  2 siblings, 0 replies; 4+ messages in thread
From: Lazar, Lijo @ 2023-12-22  9:08 UTC (permalink / raw)
  To: Stanley.Yang, amd-gfx, Hawking.Zhang

On 12/21/2023 11:35 AM, Stanley.Yang wrote:
> The ecc_irq is disabled while GPU mode2 reset suspending process,
> but not be enabled during GPU mode2 reset resume process.
> 
> Changed from V1:
> 	only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip
> 	delete amdgpu_ras_late_resume function
> 
> Changed from V2:
> 	check umc ras supported before put ecc_irq
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 ++++
>   4 files changed, 40 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 02f4c6f9d4f6..b60a3c1bd0f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
>   {
>   	struct list_head *reset_device_list = reset_context->reset_device_list;
>   	struct amdgpu_device *tmp_adev = NULL;
> +	struct amdgpu_ras *con;
>   	int r;
>   
>   	if (reset_device_list == NULL)
> @@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
>   		 */
>   		amdgpu_register_gpu_instance(tmp_adev);
>   
> -		/* Resume RAS */
> +		/* Resume RAS, ecc_irq */
> +		con = amdgpu_ras_get_context(tmp_adev);
> +		if (!amdgpu_sriov_vf(tmp_adev) && con) {
> +			if (tmp_adev->sdma.ras &&
> +				amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA) &&
> +				tmp_adev->sdma.ras->ras_block.ras_late_init) {
> +				r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
> +						&tmp_adev->sdma.ras->ras_block.ras_comm);
> +				if (r) {
> +					dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
> +					goto end;
> +				}
> +			}
> +
> +			if (tmp_adev->gfx.ras &&
> +				amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX) &&
> +				tmp_adev->gfx.ras->ras_block.ras_late_init) {
> +				r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
> +						&tmp_adev->gfx.ras->ras_block.ras_comm);
> +				if (r) {
> +					dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
> +					goto end;
> +				}
> +			}
> +		}

This is not the only ASIC that supports mode-2 reset.

What is preferred here is a RAS API, so that the caller doesn't have to do all
these kinds of ras variable checks to initialize selected ras blocks.

amdgpu_ras_late_init(ras_block_id) or similar. Whatever checks done 
above may be wrapped inside the API. For now, GFX and SDMA are the only 
blocks that need to be inited, but an API gives more flexibility to 
selectively init blocks that are reset.

Thanks,
Lijo

> +
>   		amdgpu_ras_resume(tmp_adev);
>   
>   		/* Update PSP FW topology after reset */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 09cbca596bb5..4048539205cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -1043,6 +1043,10 @@ static int gmc_v10_0_hw_fini(void *handle)
>   
>   	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>   
> +	if (adev->gmc.ecc_irq.funcs &&
> +		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 416f3e4f0438..e1ca5a599971 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -941,6 +941,11 @@ static int gmc_v11_0_hw_fini(void *handle)
>   	}
>   
>   	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> +
> +	if (adev->gmc.ecc_irq.funcs &&
> +		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>   	gmc_v11_0_gart_disable(adev);
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 205db28a9803..f00e5c8c79b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -2388,6 +2388,10 @@ static int gmc_v9_0_hw_fini(void *handle)
>   
>   	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>   
> +	if (adev->gmc.ecc_irq.funcs &&
> +		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>   	return 0;
>   }
>   
-- 
Regards,
Lijo


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-12-22  9:08 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-21  6:05 [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired Stanley.Yang
2023-12-21  8:31 ` Zhou1, Tao
2023-12-21  8:35 ` Zhang, Hawking
2023-12-22  9:08 ` Lazar, Lijo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.