[PATCH 1/2] drm/amdgpu: Enable RAS error injection after mode2 reset on Aldebaran

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] drm/amdgpu: Enable RAS error injection after mode2 reset on Aldebaran
@ 2021-10-12  2:33 Mukul Joshi
  2021-10-12  2:33 ` [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with " Mukul Joshi
  2021-10-12  3:31 ` [PATCH 1/2] drm/amdgpu: Enable RAS error injection after " Zhou1, Tao
  0 siblings, 2 replies; 5+ messages in thread
From: Mukul Joshi @ 2021-10-12  2:33 UTC (permalink / raw)
  To: amd-gfx; +Cc: tao.zhou1, john.clements, Mukul Joshi

Add the missing call to re-enable RAS error injections on the Aldebaran
mode2 reset code path.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 148f6c3343ab..bcfdb63b1d42 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -307,6 +307,8 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
 		adev->ip_blocks[i].status.late_initialized = true;
 	}
 
+	amdgpu_ras_set_error_query_ready(adev, true);
+
 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran
  2021-10-12  2:33 [PATCH 1/2] drm/amdgpu: Enable RAS error injection after mode2 reset on Aldebaran Mukul Joshi
@ 2021-10-12  2:33 ` Mukul Joshi
  2021-10-12  3:55   ` Zhou1, Tao
  2021-10-12  3:31 ` [PATCH 1/2] drm/amdgpu: Enable RAS error injection after " Zhou1, Tao
  1 sibling, 1 reply; 5+ messages in thread
From: Mukul Joshi @ 2021-10-12  2:33 UTC (permalink / raw)
  To: amd-gfx; +Cc: tao.zhou1, john.clements, Mukul Joshi

During mode2 reset, the GPU is temporarily removed from the
mgpu_info list. As a result, page retirement fails because it
cannot find the GPU in the GPU list.
To fix this, create our own list of GPUs that support MCE notifier
based page retirement and use that list to check if the UMC error
occurred on a GPU that supports MCE notifier based page retirement.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e8875351967e..e8d88c77eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
 #ifdef CONFIG_X86_MCE_AMD
-static void amdgpu_register_bad_pages_mca_notifier(void);
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
+struct mce_notifier_adev_list {
+	struct amdgpu_device *devs[MAX_GPU_INSTANCE];
+	int num_gpu;
+};
+static struct mce_notifier_adev_list mce_adev_list;
 #endif
 
 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
@@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 #ifdef CONFIG_X86_MCE_AMD
 	if ((adev->asic_type == CHIP_ALDEBARAN) &&
 	    (adev->gmc.xgmi.connected_to_cpu))
-		amdgpu_register_bad_pages_mca_notifier();
+		amdgpu_register_bad_pages_mca_notifier(adev);
 #endif
 	return 0;
 
@@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 #ifdef CONFIG_X86_MCE_AMD
 static struct amdgpu_device *find_adev(uint32_t node_id)
 {
-	struct amdgpu_gpu_instance *gpu_instance;
 	int i;
 	struct amdgpu_device *adev = NULL;
 
-	mutex_lock(&mgpu_info.mutex);
-
-	for (i = 0; i < mgpu_info.num_gpu; i++) {
-		gpu_instance = &(mgpu_info.gpu_ins[i]);
-		adev = gpu_instance->adev;
+	for (i = 0; i < mce_adev_list.num_gpu; i++) {
+		adev = mce_adev_list.devs[i];
 
-		if (adev->gmc.xgmi.connected_to_cpu &&
+		if (adev && adev->gmc.xgmi.connected_to_cpu &&
 		    adev->gmc.xgmi.physical_node_id == node_id)
 			break;
 		adev = NULL;
 	}
 
-	mutex_unlock(&mgpu_info.mutex);
-
 	return adev;
 }
 
@@ -2718,8 +2717,9 @@ static struct notifier_block amdgpu_bad_page_nb = {
 	.priority       = MCE_PRIO_UC,
 };
 
-static void amdgpu_register_bad_pages_mca_notifier(void)
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
 {
+	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
 	/*
 	 * Register the x86 notifier only once
 	 * with MCE subsystem.
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Enable RAS error injection after mode2 reset on Aldebaran
  2021-10-12  2:33 [PATCH 1/2] drm/amdgpu: Enable RAS error injection after mode2 reset on Aldebaran Mukul Joshi
  2021-10-12  2:33 ` [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with " Mukul Joshi
@ 2021-10-12  3:31 ` Zhou1, Tao
  1 sibling, 0 replies; 5+ messages in thread
From: Zhou1, Tao @ 2021-10-12  3:31 UTC (permalink / raw)
  To: Joshi, Mukul, amd-gfx; +Cc: Clements, John

[-- Attachment #1: Type: text/plain, Size: 1322 bytes --]

[AMD Official Use Only]

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
________________________________
From: Joshi, Mukul <Mukul.Joshi@amd.com>
Sent: Tuesday, October 12, 2021 10:33 AM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Clements, John <John.Clements@amd.com>; Joshi, Mukul <Mukul.Joshi@amd.com>
Subject: [PATCH 1/2] drm/amdgpu: Enable RAS error injection after mode2 reset on Aldebaran

Add the missing call to re-enable RAS error injections on the Aldebaran
mode2 reset code path.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 148f6c3343ab..bcfdb63b1d42 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -307,6 +307,8 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
                 adev->ip_blocks[i].status.late_initialized = true;
         }

+       amdgpu_ras_set_error_query_ready(adev, true);
+
         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

--
2.33.0


[-- Attachment #2: Type: text/html, Size: 2889 bytes --]

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran
  2021-10-12  2:33 ` [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with " Mukul Joshi
@ 2021-10-12  3:55   ` Zhou1, Tao
  2021-10-12 15:35     ` Joshi, Mukul
  0 siblings, 1 reply; 5+ messages in thread
From: Zhou1, Tao @ 2021-10-12  3:55 UTC (permalink / raw)
  To: Joshi, Mukul, amd-gfx; +Cc: Clements, John

[-- Attachment #1: Type: text/plain, Size: 3847 bytes --]

[AMD Official Use Only]

The patch looks good to me, but it would be better to add a comment in amdgpu_register_bad_pages_mca_notifier to explain why we need to reserve GPU info instead of using the mgpu_info list. With this addressed, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

________________________________
From: Joshi, Mukul <Mukul.Joshi@amd.com>
Sent: Tuesday, October 12, 2021 10:33 AM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Clements, John <John.Clements@amd.com>; Joshi, Mukul <Mukul.Joshi@amd.com>
Subject: [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran

During mode2 reset, the GPU is temporarily removed from the
mgpu_info list. As a result, page retirement fails because it
cannot find the GPU in the GPU list.
To fix this, create our own list of GPUs that support MCE notifier
based page retirement and use that list to check if the UMC error
occurred on a GPU that supports MCE notifier based page retirement.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e8875351967e..e8d88c77eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
                                 uint64_t addr);
 #ifdef CONFIG_X86_MCE_AMD
-static void amdgpu_register_bad_pages_mca_notifier(void);
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
+struct mce_notifier_adev_list {
+       struct amdgpu_device *devs[MAX_GPU_INSTANCE];
+       int num_gpu;
+};
+static struct mce_notifier_adev_list mce_adev_list;
 #endif

 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
@@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 #ifdef CONFIG_X86_MCE_AMD
         if ((adev->asic_type == CHIP_ALDEBARAN) &&
             (adev->gmc.xgmi.connected_to_cpu))
-               amdgpu_register_bad_pages_mca_notifier();
+               amdgpu_register_bad_pages_mca_notifier(adev);
 #endif
         return 0;

@@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 #ifdef CONFIG_X86_MCE_AMD
 static struct amdgpu_device *find_adev(uint32_t node_id)
 {
-       struct amdgpu_gpu_instance *gpu_instance;
         int i;
         struct amdgpu_device *adev = NULL;

-       mutex_lock(&mgpu_info.mutex);
-
-       for (i = 0; i < mgpu_info.num_gpu; i++) {
-               gpu_instance = &(mgpu_info.gpu_ins[i]);
-               adev = gpu_instance->adev;
+       for (i = 0; i < mce_adev_list.num_gpu; i++) {
+               adev = mce_adev_list.devs[i];

-               if (adev->gmc.xgmi.connected_to_cpu &&
+               if (adev && adev->gmc.xgmi.connected_to_cpu &&
                     adev->gmc.xgmi.physical_node_id == node_id)
                         break;
                 adev = NULL;
         }

-       mutex_unlock(&mgpu_info.mutex);
-
         return adev;
 }

@@ -2718,8 +2717,9 @@ static struct notifier_block amdgpu_bad_page_nb = {
         .priority       = MCE_PRIO_UC,
 };

-static void amdgpu_register_bad_pages_mca_notifier(void)
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
 {
+       mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
         /*
          * Register the x86 notifier only once
          * with MCE subsystem.
--
2.33.0


[-- Attachment #2: Type: text/html, Size: 7588 bytes --]

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran
  2021-10-12  3:55   ` Zhou1, Tao
@ 2021-10-12 15:35     ` Joshi, Mukul
  0 siblings, 0 replies; 5+ messages in thread
From: Joshi, Mukul @ 2021-10-12 15:35 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx; +Cc: Clements, John

[-- Attachment #1: Type: text/plain, Size: 4481 bytes --]

[AMD Official Use Only]

Thanks Tao.
I will add a comment as you suggested before committing the change.

Regards,
Mukul

From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Monday, October 11, 2021 11:55 PM
To: Joshi, Mukul <Mukul.Joshi@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Clements, John <John.Clements@amd.com>
Subject: Re: [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran


[AMD Official Use Only]

The patch looks good to me, but it would be better to add a comment in amdgpu_register_bad_pages_mca_notifier to explain why we need to reserve GPU info instead of using the mgpu_info list. With this addressed, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

________________________________
From: Joshi, Mukul <Mukul.Joshi@amd.com>
Sent: Tuesday, October 12, 2021 10:33 AM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Clements, John <John.Clements@amd.com>; Joshi, Mukul <Mukul.Joshi@amd.com>
Subject: [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran

During mode2 reset, the GPU is temporarily removed from the
mgpu_info list. As a result, page retirement fails because it
cannot find the GPU in the GPU list.
To fix this, create our own list of GPUs that support MCE notifier
based page retirement and use that list to check if the UMC error
occurred on a GPU that supports MCE notifier based page retirement.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e8875351967e..e8d88c77eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
                                 uint64_t addr);
 #ifdef CONFIG_X86_MCE_AMD
-static void amdgpu_register_bad_pages_mca_notifier(void);
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
+struct mce_notifier_adev_list {
+       struct amdgpu_device *devs[MAX_GPU_INSTANCE];
+       int num_gpu;
+};
+static struct mce_notifier_adev_list mce_adev_list;
 #endif

 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
@@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 #ifdef CONFIG_X86_MCE_AMD
         if ((adev->asic_type == CHIP_ALDEBARAN) &&
             (adev->gmc.xgmi.connected_to_cpu))
-               amdgpu_register_bad_pages_mca_notifier();
+               amdgpu_register_bad_pages_mca_notifier(adev);
 #endif
         return 0;

@@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 #ifdef CONFIG_X86_MCE_AMD
 static struct amdgpu_device *find_adev(uint32_t node_id)
 {
-       struct amdgpu_gpu_instance *gpu_instance;
         int i;
         struct amdgpu_device *adev = NULL;

-       mutex_lock(&mgpu_info.mutex);
-
-       for (i = 0; i < mgpu_info.num_gpu; i++) {
-               gpu_instance = &(mgpu_info.gpu_ins[i]);
-               adev = gpu_instance->adev;
+       for (i = 0; i < mce_adev_list.num_gpu; i++) {
+               adev = mce_adev_list.devs[i];

-               if (adev->gmc.xgmi.connected_to_cpu &&
+               if (adev && adev->gmc.xgmi.connected_to_cpu &&
                     adev->gmc.xgmi.physical_node_id == node_id)
                         break;
                 adev = NULL;
         }

-       mutex_unlock(&mgpu_info.mutex);
-
         return adev;
 }

@@ -2718,8 +2717,9 @@ static struct notifier_block amdgpu_bad_page_nb = {
         .priority       = MCE_PRIO_UC,
 };

-static void amdgpu_register_bad_pages_mca_notifier(void)
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
 {
+       mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
         /*
          * Register the x86 notifier only once
          * with MCE subsystem.
--
2.33.0

[-- Attachment #2: Type: text/html, Size: 10716 bytes --]

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-10-12 15:35 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-12  2:33 [PATCH 1/2] drm/amdgpu: Enable RAS error injection after mode2 reset on Aldebaran Mukul Joshi
2021-10-12  2:33 ` [PATCH 2/2] drm/amdgpu: Fix RAS page retirement with " Mukul Joshi
2021-10-12  3:55   ` Zhou1, Tao
2021-10-12 15:35     ` Joshi, Mukul
2021-10-12  3:31 ` [PATCH 1/2] drm/amdgpu: Enable RAS error injection after " Zhou1, Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.