All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier
@ 2024-03-28  2:35 Lijo Lazar
  2024-03-28  2:35 ` [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags Lijo Lazar
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Lijo Lazar @ 2024-03-28  2:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Alexander.Deucher, kevinyang.wang

In case of fatal errors, set FED status when interrupt is received. Set
the flag on other devices in the hive before RAS recovery work.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
v2: Avoid accessing hive in interrupt handler as it may take mutex path (Kevin)

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 41 +++++++++++++++++--------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b8c7d0bf8fb1..352ce16a0963 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2399,6 +2399,19 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 	return ret;
 }
 
+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+				   struct amdgpu_hive_info *hive, bool status)
+{
+	struct amdgpu_device *tmp_adev;
+
+	if (hive) {
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+			amdgpu_ras_set_fed(tmp_adev, status);
+	} else {
+		amdgpu_ras_set_fed(adev, status);
+	}
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
 	struct amdgpu_ras *ras =
@@ -2408,8 +2421,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	struct list_head device_list, *device_list_handle =  NULL;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
-	if (hive)
+	if (hive) {
 		atomic_set(&hive->ras_recovery, 1);
+
+		/* If any device which is part of the hive received RAS fatal
+		 * error interrupt, set fatal error status on all. This
+		 * condition will need a recovery, and flag will be cleared
+		 * as part of recovery.
+		 */
+		list_for_each_entry(remote_adev, &hive->device_list,
+				    gmc.xgmi.head)
+			if (amdgpu_ras_get_fed_status(remote_adev)) {
+				amdgpu_ras_set_fed_all(adev, hive, true);
+				break;
+			}
+	}
 	if (!ras->disable_ras_err_cnt_harvest) {
 
 		/* Build list of devices to query RAS related errors */
@@ -2454,18 +2480,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
 				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
-				/* For any RAS error that needs a full reset to
-				 * recover, set the fatal error status
-				 */
-				if (hive) {
-					list_for_each_entry(remote_adev,
-							    &hive->device_list,
-							    gmc.xgmi.head)
-						amdgpu_ras_set_fed(remote_adev,
-								   true);
-				} else {
-					amdgpu_ras_set_fed(adev, true);
-				}
 				psp_fatal_error_recovery_quirk(&adev->psp);
 			}
 		}
@@ -3550,6 +3564,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 		RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
 			      "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
 
+		amdgpu_ras_set_fed(adev, true);
 		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
 		amdgpu_ras_reset_gpu(adev);
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-04-01 11:33 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-28  2:35 [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Lijo Lazar
2024-03-28  2:35 ` [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags Lijo Lazar
2024-03-28  3:27   ` Wang, Yang(Kevin)
2024-03-28  3:33     ` Lazar, Lijo
2024-04-01 11:15   ` Kamal, Asad
2024-04-01 11:20     ` Lazar, Lijo
2024-03-28  2:35 ` [PATCH v2 3/4] drm/amd/pm: Add special handling for RAS messages Lijo Lazar
2024-03-28  2:35 ` [PATCH v2 4/4] drm/amd/pm: Categorize RAS messages on SMUv13.0.6 Lijo Lazar
2024-03-28  8:01   ` Zhang, Hawking
2024-04-01 11:33 ` [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Kamal, Asad

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.