All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
@ 2022-10-19  8:11 Tao Zhou
  2022-10-19  8:11 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Tao Zhou @ 2022-10-19  8:11 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, Candice.Li; +Cc: Tao Zhou

Define page retirement functions for MCA platform.

v2: remove page retirement handling from MCA poison handler,
    let MCA notifier do page retirement.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 67 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  2 +
 2 files changed, 69 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..9494fa14db9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,73 @@
  */
 
 #include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+				    struct ras_err_data *err_data, uint64_t err_addr,
+				    uint32_t ch_inst, uint32_t umc_inst)
+{
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_convert_error_address(adev,
+				err_data, err_addr, ch_inst, umc_inst);
+		break;
+	default:
+		dev_warn(adev->dev,
+			 "UMC address to Physical address translation is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
+{
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in MCA notifier!\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+					ch_inst, umc_inst);
+	if (ret)
+		goto out;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out:
+	kfree(err_data.err_addr);
+	return ret;
+}
+
+static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
+		struct ras_err_data *err_data, bool reset)
+{
+	/* MCA poison handler is only responsible for GPU reset,
+	 * let MCA notifier do page retirement.
+	 */
+	if (reset) {
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
 
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 3629d8f292ef..659a10de29c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 		void *ras_error_status,
 		struct amdgpu_iv_entry *entry);
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
 #endif
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread
* [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
@ 2022-10-21  7:36 Tao Zhou
  2022-10-21  7:36 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
  0 siblings, 1 reply; 12+ messages in thread
From: Tao Zhou @ 2022-10-21  7:36 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, candice.li; +Cc: Tao Zhou

Define page retirement functions for MCA platform.

v2: remove page retirement handling from MCA poison handler,
    let MCA notifier do page retirement.

v3: remove specific poison handler for MCA to simplify code.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 53 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  2 +
 2 files changed, 55 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..3c83129f4090 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,59 @@
  */
 
 #include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+				    struct ras_err_data *err_data, uint64_t err_addr,
+				    uint32_t ch_inst, uint32_t umc_inst)
+{
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_convert_error_address(adev,
+				err_data, err_addr, ch_inst, umc_inst);
+		break;
+	default:
+		dev_warn(adev->dev,
+			 "UMC address to Physical address translation is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
+{
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in MCA notifier!\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+					ch_inst, umc_inst);
+	if (ret)
+		goto out;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out:
+	kfree(err_data.err_addr);
+	return ret;
+}
 
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 3629d8f292ef..659a10de29c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 		void *ras_error_status,
 		struct amdgpu_iv_entry *entry);
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
 #endif
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread
* [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
@ 2022-10-18  8:31 Tao Zhou
  2022-10-18  8:31 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
  0 siblings, 1 reply; 12+ messages in thread
From: Tao Zhou @ 2022-10-18  8:31 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, candice.li; +Cc: Tao Zhou

Define page retirement functions for MCA platform.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 112 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   2 +
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   |   2 +-
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.h   |   2 +
 4 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..e97b1bd343ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,118 @@
  */
 
 #include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+				    struct ras_err_data *err_data, uint64_t err_addr,
+				    uint32_t ch_inst, uint32_t umc_inst)
+{
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_convert_error_address(adev,
+				err_data, err_addr, ch_inst, umc_inst);
+		break;
+	default:
+		dev_warn(adev->dev,
+			 "UMC address to Physical address translation is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+static int amdgpu_umc_ecc_info_query_error_address(struct amdgpu_device *adev,
+					     struct ras_err_data *err_data)
+{
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_ecc_info_query_ras_error_address(adev,
+				err_data);
+		break;
+	default:
+		dev_warn(adev->dev,
+			 "UMC error address query is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
+{
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in MCA notifier!\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+					ch_inst, umc_inst);
+	if (ret)
+		goto out;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out:
+	kfree(err_data.err_addr);
+	return ret;
+}
+
+static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
+		struct ras_err_data *err_data, bool reset)
+{
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data->err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data->err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for MCA RAS poison handler!\n");
+		goto out2;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_ecc_info_query_error_address(adev, err_data);
+	if (ret)
+		goto out1;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+						err_data->err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out1:
+	kfree(err_data->err_addr);
+out2:
+	/* trigger gpu reset even error count is 0 for CPU MCA RAS,
+	 * MCA notifier is responsible for page retirement if error
+	 * count can't be queried in poison handler.
+	 */
+	if (reset) {
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);
+	}
+
+	return ret;
+}
 
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 3629d8f292ef..659a10de29c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 		void *ras_error_status,
 		struct amdgpu_iv_entry *entry);
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 72fd963f178b..b0da50d03af6 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -252,7 +252,7 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
 	}
 }
 
-static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
+void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
 					     void *ras_error_status)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
index 105245d5b6e5..24382e9e5814 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
@@ -74,4 +74,6 @@ extern const uint32_t
 void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
                                     struct ras_err_data *err_data, uint64_t err_addr,
                                     uint32_t ch_inst, uint32_t umc_inst);
+void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
+					     void *ras_error_status);
 #endif
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2022-10-21  7:36 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-19  8:11 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
2022-10-19  8:11 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
2022-10-19  8:11 ` [PATCH 3/4] drm/amdgpu: add RAS poison handling for MCA Tao Zhou
2022-10-19  8:11 ` [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler Tao Zhou
2022-10-20  9:29   ` Zhang, Hawking
2022-10-21  2:51     ` Zhou1, Tao
2022-10-20  9:12 ` [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Zhang, Hawking
2022-10-21  2:53   ` Zhou1, Tao
2022-10-21  4:15     ` Zhang, Hawking
2022-10-21  6:27       ` Zhou1, Tao
  -- strict thread matches above, loose matches on Subject: below --
2022-10-21  7:36 Tao Zhou
2022-10-21  7:36 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
2022-10-18  8:31 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
2022-10-18  8:31 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.