amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: added xgmi ras error reset sequence
@ 2020-03-25  6:50 Clements, John
  2020-03-25  6:59 ` Zhang, Hawking
  0 siblings, 1 reply; 2+ messages in thread
From: Clements, John @ 2020-03-25  6:50 UTC (permalink / raw)
  To: amd-gfx, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 133 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Submitting a patch to clear the xgmi ras error counters in between ras error queries.

[-- Attachment #1.2: Type: text/html, Size: 1854 bytes --]

[-- Attachment #2: 0001-drm-amdgpu-added-xgmi-ras-error-reset-sequence.patch --]
[-- Type: application/octet-stream, Size: 2854 bytes --]

From be23e834d84cca6df396ac7a18666d173b2991e7 Mon Sep 17 00:00:00 2001
From: John Clements <john.clements@amd.com>
Date: Wed, 25 Mar 2020 14:44:52 +0800
Subject: [PATCH] drm/amdgpu: added xgmi ras error reset sequence

added mechanism to clear xgmi ras status in between error queries

Signed-off-by: John Clements <john.clements@amd.com>
Change-Id: I16272adca533f274169efcfb0904d094da43e95f
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 29 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  1 +
 2 files changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 95b3327168ac..8f1f8435e948 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -604,6 +604,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
 	    adev->gmc.xgmi.num_physical_nodes == 0)
 		return 0;
 
+	amdgpu_xgmi_reset_ras_error_count(adev);
+
 	if (!adev->gmc.xgmi.ras_if) {
 		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
 		if (!adev->gmc.xgmi.ras_if)
@@ -668,6 +670,31 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
 	return addr + dram_base_addr;
 }
 
+static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
+{
+	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
+	WREG32_PCIE(pcs_status_reg, 0);
+}
+
+void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
+{
+	uint32_t i;
+
+	switch (adev->asic_type) {
+	case CHIP_ARCTURUS:
+		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
+			pcs_clear_status(adev,
+					 xgmi_pcs_err_status_reg_arct[i]);
+		break;
+	case CHIP_VEGA20:
+	default:
+		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
+			pcs_clear_status(adev,
+					 xgmi_pcs_err_status_reg_vg20[i]);
+		break;
+	}
+}
+
 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
 					      uint32_t value,
 					      uint32_t *ue_count,
@@ -758,6 +785,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 		break;
 	}
 
+	amdgpu_xgmi_reset_ras_error_count(adev);
+
 	err_data->ue_count += ue_cnt;
 	err_data->ce_count += ce_cnt;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 4a92067fe595..d5a63904ec33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -56,6 +56,7 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
 					   uint64_t addr);
 int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 				      void *ras_error_status);
+void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev);
 
 static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
 		struct amdgpu_device *bo_adev)
-- 
2.17.1


[-- Attachment #3: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2020-03-25  6:59 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-25  6:50 [PATCH] drm/amdgpu: added xgmi ras error reset sequence Clements, John
2020-03-25  6:59 ` Zhang, Hawking

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).