All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
@ 2022-10-19  8:11 Tao Zhou
  2022-10-19  8:11 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Tao Zhou @ 2022-10-19  8:11 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, Candice.Li; +Cc: Tao Zhou

Define page retirement functions for MCA platform.

v2: remove page retirement handling from MCA poison handler,
    let MCA notifier do page retirement.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 67 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  2 +
 2 files changed, 69 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..9494fa14db9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,73 @@
  */
 
 #include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+				    struct ras_err_data *err_data, uint64_t err_addr,
+				    uint32_t ch_inst, uint32_t umc_inst)
+{
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_convert_error_address(adev,
+				err_data, err_addr, ch_inst, umc_inst);
+		break;
+	default:
+		dev_warn(adev->dev,
+			 "UMC address to Physical address translation is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
+{
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in MCA notifier!\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+					ch_inst, umc_inst);
+	if (ret)
+		goto out;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out:
+	kfree(err_data.err_addr);
+	return ret;
+}
+
+static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
+		struct ras_err_data *err_data, bool reset)
+{
+	/* MCA poison handler is only responsible for GPU reset,
+	 * let MCA notifier do page retirement.
+	 */
+	if (reset) {
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
 
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 3629d8f292ef..659a10de29c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 		void *ras_error_status,
 		struct amdgpu_iv_entry *entry);
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
 #endif
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier
  2022-10-19  8:11 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
@ 2022-10-19  8:11 ` Tao Zhou
  2022-10-19  8:11 ` [PATCH 3/4] drm/amdgpu: add RAS poison handling for MCA Tao Zhou
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Tao Zhou @ 2022-10-19  8:11 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, Candice.Li; +Cc: Tao Zhou

Make the code more readable.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 36 +++----------------------
 1 file changed, 3 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 21a47f2bb87b..28463b47ce33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -36,7 +36,6 @@
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
-#include "umc_v6_7.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include <asm/mce.h>
@@ -2849,7 +2848,6 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 	struct amdgpu_device *adev = NULL;
 	uint32_t gpu_id = 0;
 	uint32_t umc_inst = 0, ch_inst = 0;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
 
 	/*
 	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
@@ -2888,38 +2886,10 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
 			     umc_inst, ch_inst);
 
-	err_data.err_addr =
-		kcalloc(adev->umc.max_ras_err_cnt_per_query,
-			sizeof(struct eeprom_table_record), GFP_KERNEL);
-	if (!err_data.err_addr) {
-		dev_warn(adev->dev,
-			"Failed to alloc memory for umc error record in mca notifier!\n");
-		return NOTIFY_DONE;
-	}
-
-	/*
-	 * Translate UMC channel address to Physical address
-	 */
-	switch (adev->ip_versions[UMC_HWIP][0]) {
-	case IP_VERSION(6, 7, 0):
-		umc_v6_7_convert_error_address(adev,
-				&err_data, m->addr, ch_inst, umc_inst);
-		break;
-	default:
-		dev_warn(adev->dev,
-			 "UMC address to Physical address translation is not supported\n");
-		kfree(err_data.err_addr);
+	if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst))
+		return NOTIFY_OK;
+	else
 		return NOTIFY_DONE;
-	}
-
-	if (amdgpu_bad_page_threshold != 0) {
-		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
-						err_data.err_addr_cnt);
-		amdgpu_ras_save_bad_pages(adev);
-	}
-
-	kfree(err_data.err_addr);
-	return NOTIFY_OK;
 }
 
 static struct notifier_block amdgpu_bad_page_nb = {
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 3/4] drm/amdgpu: add RAS poison handling for MCA
  2022-10-19  8:11 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
  2022-10-19  8:11 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
@ 2022-10-19  8:11 ` Tao Zhou
  2022-10-19  8:11 ` [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler Tao Zhou
  2022-10-20  9:12 ` [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Zhang, Hawking
  3 siblings, 0 replies; 13+ messages in thread
From: Tao Zhou @ 2022-10-19  8:11 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, Candice.Li; +Cc: Tao Zhou

For MCA poison, if unmap queue fails, only gpu reset should be
triggered without page retirement handling, MCA notifier will do it.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 27 +++++++++++++++----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9494fa14db9a..dd1b1a612343 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -184,18 +184,23 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
 		bool reset)
 {
 	int ret;
-	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
-	struct ras_common_if head = {
-		.block = AMDGPU_RAS_BLOCK__UMC,
-	};
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-	ret =
-		amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
-
-	if (ret == AMDGPU_RAS_SUCCESS && obj) {
-		obj->err_data.ue_count += err_data->ue_count;
-		obj->err_data.ce_count += err_data->ce_count;
+	if (adev->gmc.xgmi.connected_to_cpu) {
+		ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset);
+	} else {
+		struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+		struct ras_common_if head = {
+			.block = AMDGPU_RAS_BLOCK__UMC,
+		};
+		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+		ret =
+			amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+
+		if (ret == AMDGPU_RAS_SUCCESS && obj) {
+			obj->err_data.ue_count += err_data->ue_count;
+			obj->err_data.ce_count += err_data->ce_count;
+		}
 	}
 
 	return ret;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler
  2022-10-19  8:11 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
  2022-10-19  8:11 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
  2022-10-19  8:11 ` [PATCH 3/4] drm/amdgpu: add RAS poison handling for MCA Tao Zhou
@ 2022-10-19  8:11 ` Tao Zhou
  2022-10-20  9:29   ` Zhang, Hawking
  2022-10-20  9:12 ` [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Zhang, Hawking
  3 siblings, 1 reply; 13+ messages in thread
From: Tao Zhou @ 2022-10-19  8:11 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, Candice.Li; +Cc: Tao Zhou

Make the code more simple.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 16 ++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h    |  4 +---
 4 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0561812aa0a4..37db39ba8718 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -753,9 +753,7 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
 
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
 {
-	struct ras_err_data err_data = {0, 0, 0, NULL};
-
-	amdgpu_umc_poison_handler(adev, &err_data, reset);
+	amdgpu_umc_poison_handler(adev, reset);
 }
 
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 28463b47ce33..693bce07eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1561,7 +1561,6 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 {
 	bool poison_stat = false;
 	struct amdgpu_device *adev = obj->adev;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
@@ -1584,7 +1583,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	}
 
 	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_poison_handler(adev, &err_data, false);
+		amdgpu_umc_poison_handler(adev, false);
 
 	if (block_obj->hw_ops->handle_poison_consumption)
 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dd1b1a612343..c040c9104521 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -179,27 +179,23 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	return AMDGPU_RAS_SUCCESS;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
+	struct ras_err_data err_data = {0, 0, 0, NULL};
 	int ret;
 
 	if (adev->gmc.xgmi.connected_to_cpu) {
-		ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset);
+		ret = amdgpu_umc_poison_handler_mca(adev, &err_data, reset);
 	} else {
-		struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 		struct ras_common_if head = {
 			.block = AMDGPU_RAS_BLOCK__UMC,
 		};
 		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-		ret =
-			amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
-
+		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
 		if (ret == AMDGPU_RAS_SUCCESS && obj) {
-			obj->err_data.ue_count += err_data->ue_count;
-			obj->err_data.ce_count += err_data->ce_count;
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 659a10de29c9..a6951160f13a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -83,9 +83,7 @@ struct amdgpu_umc {
 };
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
  2022-10-19  8:11 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
                   ` (2 preceding siblings ...)
  2022-10-19  8:11 ` [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler Tao Zhou
@ 2022-10-20  9:12 ` Zhang, Hawking
  2022-10-21  2:53   ` Zhou1, Tao
  3 siblings, 1 reply; 13+ messages in thread
From: Zhang, Hawking @ 2022-10-20  9:12 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Chai, Thomas, Li, Candice

[AMD Official Use Only - General]

+static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
+		struct ras_err_data *err_data, bool reset)


+	if (adev->gmc.xgmi.connected_to_cpu) {
+		ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset);

The input parameters "reset" and "err_data" can be dropped since amdgpu_umc_poison_handler_mca is dedicated to MCA platform

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Wednesday, October 19, 2022 16:12
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA

Define page retirement functions for MCA platform.

v2: remove page retirement handling from MCA poison handler,
    let MCA notifier do page retirement.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 67 +++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  2 +
 2 files changed, 69 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..9494fa14db9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,73 @@
  */
 
 #include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+				    struct ras_err_data *err_data, uint64_t err_addr,
+				    uint32_t ch_inst, uint32_t umc_inst) {
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_convert_error_address(adev,
+				err_data, err_addr, ch_inst, umc_inst);
+		break;
+	default:
+		dev_warn(adev->dev,
+			 "UMC address to Physical address translation is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) {
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in MCA notifier!\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+					ch_inst, umc_inst);
+	if (ret)
+		goto out;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out:
+	kfree(err_data.err_addr);
+	return ret;
+}
+
+static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
+		struct ras_err_data *err_data, bool reset) {
+	/* MCA poison handler is only responsible for GPU reset,
+	 * let MCA notifier do page retirement.
+	 */
+	if (reset) {
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
 
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 3629d8f292ef..659a10de29c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,  int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 		void *ras_error_status,
 		struct amdgpu_iv_entry *entry);
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
 #endif
--
2.35.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler
  2022-10-19  8:11 ` [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler Tao Zhou
@ 2022-10-20  9:29   ` Zhang, Hawking
  2022-10-21  2:51     ` Zhou1, Tao
  0 siblings, 1 reply; 13+ messages in thread
From: Zhang, Hawking @ 2022-10-20  9:29 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Chai, Thomas, Li, Candice

[AMD Official Use Only - General]

Might squash this with patch 1

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Wednesday, October 19, 2022 16:12
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler

Make the code more simple.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 16 ++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h    |  4 +---
 4 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0561812aa0a4..37db39ba8718 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -753,9 +753,7 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
 
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)  {
-	struct ras_err_data err_data = {0, 0, 0, NULL};
-
-	amdgpu_umc_poison_handler(adev, &err_data, reset);
+	amdgpu_umc_poison_handler(adev, reset);
 }
 
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 28463b47ce33..693bce07eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1561,7 +1561,6 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  {
 	bool poison_stat = false;
 	struct amdgpu_device *adev = obj->adev;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
@@ -1584,7 +1583,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	}
 
 	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_poison_handler(adev, &err_data, false);
+		amdgpu_umc_poison_handler(adev, false);
 
 	if (block_obj->hw_ops->handle_poison_consumption)
 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dd1b1a612343..c040c9104521 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -179,27 +179,23 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	return AMDGPU_RAS_SUCCESS;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
+	struct ras_err_data err_data = {0, 0, 0, NULL};
 	int ret;
 
 	if (adev->gmc.xgmi.connected_to_cpu) {
-		ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset);
+		ret = amdgpu_umc_poison_handler_mca(adev, &err_data, reset);
 	} else {
-		struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 		struct ras_common_if head = {
 			.block = AMDGPU_RAS_BLOCK__UMC,
 		};
 		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-		ret =
-			amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
-
+		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
 		if (ret == AMDGPU_RAS_SUCCESS && obj) {
-			obj->err_data.ue_count += err_data->ue_count;
-			obj->err_data.ce_count += err_data->ce_count;
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 659a10de29c9..a6951160f13a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -83,9 +83,7 @@ struct amdgpu_umc {
 };
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
--
2.35.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler
  2022-10-20  9:29   ` Zhang, Hawking
@ 2022-10-21  2:51     ` Zhou1, Tao
  0 siblings, 0 replies; 13+ messages in thread
From: Zhou1, Tao @ 2022-10-21  2:51 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx, Yang, Stanley, Chai, Thomas, Li, Candice

[AMD Official Use Only - General]



> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@amd.com>
> Sent: Thursday, October 20, 2022 5:30 PM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Yang,
> Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Li,
> Candice <Candice.Li@amd.com>
> Subject: RE: [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for
> UMC poison handler
> 
> [AMD Official Use Only - General]
> 
> Might squash this with patch 1

[Tao] This is a refinement different from patch#1. Both ways are OK but I prefer leaving the patch alone.

> 
> Regards,
> Hawking
> 
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@amd.com>
> Sent: Wednesday, October 19, 2022 16:12
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai,
> Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for
> UMC poison handler
> 
> Make the code more simple.
> 
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  4 +---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    |  3 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 16 ++++++----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h    |  4 +---
>  4 files changed, 9 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 0561812aa0a4..37db39ba8718 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -753,9 +753,7 @@ bool amdgpu_amdkfd_have_atomics_support(struct
> amdgpu_device *adev)
> 
>  void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device
> *adev, bool reset)  {
> -	struct ras_err_data err_data = {0, 0, 0, NULL};
> -
> -	amdgpu_umc_poison_handler(adev, &err_data, reset);
> +	amdgpu_umc_poison_handler(adev, reset);
>  }
> 
>  bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device
> *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 28463b47ce33..693bce07eb46 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1561,7 +1561,6 @@ static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  {
>  	bool poison_stat = false;
>  	struct amdgpu_device *adev = obj->adev;
> -	struct ras_err_data err_data = {0, 0, 0, NULL};
>  	struct amdgpu_ras_block_object *block_obj =
>  		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
> 
> @@ -1584,7 +1583,7 @@ static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
>  	}
> 
>  	if (!adev->gmc.xgmi.connected_to_cpu)
> -		amdgpu_umc_poison_handler(adev, &err_data, false);
> +		amdgpu_umc_poison_handler(adev, false);
> 
>  	if (block_obj->hw_ops->handle_poison_consumption)
>  		poison_stat = block_obj->hw_ops-
> >handle_poison_consumption(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index dd1b1a612343..c040c9104521 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -179,27 +179,23 @@ static int amdgpu_umc_do_page_retirement(struct
> amdgpu_device *adev,
>  	return AMDGPU_RAS_SUCCESS;
>  }
> 
> -int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> -		void *ras_error_status,
> -		bool reset)
> +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
>  {
> +	struct ras_err_data err_data = {0, 0, 0, NULL};
>  	int ret;
> 
>  	if (adev->gmc.xgmi.connected_to_cpu) {
> -		ret = amdgpu_umc_poison_handler_mca(adev,
> ras_error_status, reset);
> +		ret = amdgpu_umc_poison_handler_mca(adev, &err_data,
> reset);
>  	} else {
> -		struct ras_err_data *err_data = (struct ras_err_data
> *)ras_error_status;
>  		struct ras_common_if head = {
>  			.block = AMDGPU_RAS_BLOCK__UMC,
>  		};
>  		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
> 
> -		ret =
> -			amdgpu_umc_do_page_retirement(adev,
> ras_error_status, NULL, reset);
> -
> +		ret = amdgpu_umc_do_page_retirement(adev, &err_data,
> NULL, reset);
>  		if (ret == AMDGPU_RAS_SUCCESS && obj) {
> -			obj->err_data.ue_count += err_data->ue_count;
> -			obj->err_data.ce_count += err_data->ce_count;
> +			obj->err_data.ue_count += err_data.ue_count;
> +			obj->err_data.ce_count += err_data.ce_count;
>  		}
>  	}
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index 659a10de29c9..a6951160f13a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -83,9 +83,7 @@ struct amdgpu_umc {
>  };
> 
>  int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct
> ras_common_if *ras_block); -int amdgpu_umc_poison_handler(struct
> amdgpu_device *adev,
> -		void *ras_error_status,
> -		bool reset);
> +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
>  int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
>  		struct amdgpu_irq_src *source,
>  		struct amdgpu_iv_entry *entry);
> --
> 2.35.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
  2022-10-20  9:12 ` [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Zhang, Hawking
@ 2022-10-21  2:53   ` Zhou1, Tao
  2022-10-21  4:15     ` Zhang, Hawking
  0 siblings, 1 reply; 13+ messages in thread
From: Zhou1, Tao @ 2022-10-21  2:53 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx, Yang, Stanley, Chai, Thomas, Li, Candice

[AMD Official Use Only - General]


> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@amd.com>
> Sent: Thursday, October 20, 2022 5:13 PM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Yang,
> Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Li,
> Candice <Candice.Li@amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for
> MCA
> 
> [AMD Official Use Only - General]
> 
> +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
> +		struct ras_err_data *err_data, bool reset)
> 
> 
> +	if (adev->gmc.xgmi.connected_to_cpu) {
> +		ret = amdgpu_umc_poison_handler_mca(adev,
> ras_error_status, reset);
> 
> The input parameters "reset" and "err_data" can be dropped since
> amdgpu_umc_poison_handler_mca is dedicated to MCA platform

[Tao] whether need to do gpu reset is determined by unmap queue status, so reset parameter can't be dropped.
For "err_data", it can be removed currently, but I'm afraid we may need it on other ASICs in the future.

> 
> Regards,
> Hawking
> 
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@amd.com>
> Sent: Wednesday, October 19, 2022 16:12
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai,
> Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
> 
> Define page retirement functions for MCA platform.
> 
> v2: remove page retirement handling from MCA poison handler,
>     let MCA notifier do page retirement.
> 
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 67
> +++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> |  2 +
>  2 files changed, 69 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index aad3c8b4c810..9494fa14db9a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -22,6 +22,73 @@
>   */
> 
>  #include "amdgpu.h"
> +#include "umc_v6_7.h"
> +
> +static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
> +				    struct ras_err_data *err_data, uint64_t
> err_addr,
> +				    uint32_t ch_inst, uint32_t umc_inst) {
> +	switch (adev->ip_versions[UMC_HWIP][0]) {
> +	case IP_VERSION(6, 7, 0):
> +		umc_v6_7_convert_error_address(adev,
> +				err_data, err_addr, ch_inst, umc_inst);
> +		break;
> +	default:
> +		dev_warn(adev->dev,
> +			 "UMC address to Physical address translation is not
> supported\n");
> +		return AMDGPU_RAS_FAIL;
> +	}
> +
> +	return AMDGPU_RAS_SUCCESS;
> +}
> +
> +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
> +			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
> {
> +	struct ras_err_data err_data = {0, 0, 0, NULL};
> +	int ret = AMDGPU_RAS_FAIL;
> +
> +	err_data.err_addr =
> +		kcalloc(adev->umc.max_ras_err_cnt_per_query,
> +			sizeof(struct eeprom_table_record), GFP_KERNEL);
> +	if (!err_data.err_addr) {
> +		dev_warn(adev->dev,
> +			"Failed to alloc memory for umc error record in MCA
> notifier!\n");
> +		return AMDGPU_RAS_FAIL;
> +	}
> +
> +	/*
> +	 * Translate UMC channel address to Physical address
> +	 */
> +	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
> +					ch_inst, umc_inst);
> +	if (ret)
> +		goto out;
> +
> +	if (amdgpu_bad_page_threshold != 0) {
> +		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> +						err_data.err_addr_cnt);
> +		amdgpu_ras_save_bad_pages(adev);
> +	}
> +
> +out:
> +	kfree(err_data.err_addr);
> +	return ret;
> +}
> +
> +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
> +		struct ras_err_data *err_data, bool reset) {
> +	/* MCA poison handler is only responsible for GPU reset,
> +	 * let MCA notifier do page retirement.
> +	 */
> +	if (reset) {
> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +		amdgpu_ras_reset_gpu(adev);
> +	}
> +
> +	return AMDGPU_RAS_SUCCESS;
> +}
> 
>  static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
>  		void *ras_error_status,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index 3629d8f292ef..659a10de29c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data
> *err_data,  int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
>  		void *ras_error_status,
>  		struct amdgpu_iv_entry *entry);
> +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
> +			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
>  #endif
> --
> 2.35.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
  2022-10-21  2:53   ` Zhou1, Tao
@ 2022-10-21  4:15     ` Zhang, Hawking
  2022-10-21  6:27       ` Zhou1, Tao
  0 siblings, 1 reply; 13+ messages in thread
From: Zhang, Hawking @ 2022-10-21  4:15 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Chai, Thomas, Li, Candice

[AMD Official Use Only - General]

Re - whether need to do gpu reset is determined by unmap queue status, so reset parameter can't be dropped

+	if (adev->gmc.xgmi.connected_to_cpu) {
+		ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset);

I think amdgpu_umc_poison_handler_mca is fallback handler specific for MCA platform, right? 

I noticed there is platform check already in amdgpu_umc_poison_handler with the reset flag. so driver already knows whether the reset is needed, right.
That's why I think "ras_error_status", "reset" are all not necessary. You can either call the followings directly by checking connected_to_cpu && reset,

+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);

Or still provide a wrapper like amdgpu_umc_poison_handler_mca for above two calls.

The latter seems redundant as well. I mean, we don't need to maintain a specific API for poison handling fallback on MCA platform - Aldebaran is
the last SOC that supports this A + A RAS design. I can confirm we'll move to a new design going forward.

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Friday, October 21, 2022 10:54
To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-gfx@lists.freedesktop.org; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
Subject: RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA

[AMD Official Use Only - General]


> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@amd.com>
> Sent: Thursday, October 20, 2022 5:13 PM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; 
> Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas 
> <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions 
> for MCA
> 
> [AMD Official Use Only - General]
> 
> +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
> +		struct ras_err_data *err_data, bool reset)
> 
> 
> +	if (adev->gmc.xgmi.connected_to_cpu) {
> +		ret = amdgpu_umc_poison_handler_mca(adev,
> ras_error_status, reset);
> 
> The input parameters "reset" and "err_data" can be dropped since 
> amdgpu_umc_poison_handler_mca is dedicated to MCA platform

[Tao] whether need to do gpu reset is determined by unmap queue status, so reset parameter can't be dropped.
For "err_data", it can be removed currently, but I'm afraid we may need it on other ASICs in the future.

> 
> Regards,
> Hawking
> 
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@amd.com>
> Sent: Wednesday, October 19, 2022 16:12
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking 
> <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, 
> Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for 
> MCA
> 
> Define page retirement functions for MCA platform.
> 
> v2: remove page retirement handling from MCA poison handler,
>     let MCA notifier do page retirement.
> 
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 67
> +++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> |  2 +
>  2 files changed, 69 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index aad3c8b4c810..9494fa14db9a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -22,6 +22,73 @@
>   */
> 
>  #include "amdgpu.h"
> +#include "umc_v6_7.h"
> +
> +static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
> +				    struct ras_err_data *err_data, uint64_t
> err_addr,
> +				    uint32_t ch_inst, uint32_t umc_inst) {
> +	switch (adev->ip_versions[UMC_HWIP][0]) {
> +	case IP_VERSION(6, 7, 0):
> +		umc_v6_7_convert_error_address(adev,
> +				err_data, err_addr, ch_inst, umc_inst);
> +		break;
> +	default:
> +		dev_warn(adev->dev,
> +			 "UMC address to Physical address translation is not
> supported\n");
> +		return AMDGPU_RAS_FAIL;
> +	}
> +
> +	return AMDGPU_RAS_SUCCESS;
> +}
> +
> +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
> +			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
> {
> +	struct ras_err_data err_data = {0, 0, 0, NULL};
> +	int ret = AMDGPU_RAS_FAIL;
> +
> +	err_data.err_addr =
> +		kcalloc(adev->umc.max_ras_err_cnt_per_query,
> +			sizeof(struct eeprom_table_record), GFP_KERNEL);
> +	if (!err_data.err_addr) {
> +		dev_warn(adev->dev,
> +			"Failed to alloc memory for umc error record in MCA
> notifier!\n");
> +		return AMDGPU_RAS_FAIL;
> +	}
> +
> +	/*
> +	 * Translate UMC channel address to Physical address
> +	 */
> +	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
> +					ch_inst, umc_inst);
> +	if (ret)
> +		goto out;
> +
> +	if (amdgpu_bad_page_threshold != 0) {
> +		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> +						err_data.err_addr_cnt);
> +		amdgpu_ras_save_bad_pages(adev);
> +	}
> +
> +out:
> +	kfree(err_data.err_addr);
> +	return ret;
> +}
> +
> +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
> +		struct ras_err_data *err_data, bool reset) {
> +	/* MCA poison handler is only responsible for GPU reset,
> +	 * let MCA notifier do page retirement.
> +	 */
> +	if (reset) {
> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +		amdgpu_ras_reset_gpu(adev);
> +	}
> +
> +	return AMDGPU_RAS_SUCCESS;
> +}
> 
>  static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
>  		void *ras_error_status,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index 3629d8f292ef..659a10de29c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct 
> ras_err_data *err_data,  int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
>  		void *ras_error_status,
>  		struct amdgpu_iv_entry *entry);
> +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
> +			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
>  #endif
> --
> 2.35.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
  2022-10-21  4:15     ` Zhang, Hawking
@ 2022-10-21  6:27       ` Zhou1, Tao
  0 siblings, 0 replies; 13+ messages in thread
From: Zhou1, Tao @ 2022-10-21  6:27 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx, Yang, Stanley, Chai, Thomas, Li, Candice

[AMD Official Use Only - General]



> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@amd.com>
> Sent: Friday, October 21, 2022 12:15 PM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Yang,
> Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Li,
> Candice <Candice.Li@amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for
> MCA
> 
> [AMD Official Use Only - General]
> 
> Re - whether need to do gpu reset is determined by unmap queue status, so
> reset parameter can't be dropped
> 
> +	if (adev->gmc.xgmi.connected_to_cpu) {
> +		ret = amdgpu_umc_poison_handler_mca(adev,
> ras_error_status, reset);
> 
> I think amdgpu_umc_poison_handler_mca is fallback handler specific for MCA
> platform, right?
> 
> I noticed there is platform check already in amdgpu_umc_poison_handler with
> the reset flag. so driver already knows whether the reset is needed, right.
> That's why I think "ras_error_status", "reset" are all not necessary. You can
> either call the followings directly by checking connected_to_cpu && reset,
> 
> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +		amdgpu_ras_reset_gpu(adev);
> 
> Or still provide a wrapper like amdgpu_umc_poison_handler_mca for above two
> calls.
> 
> The latter seems redundant as well. I mean, we don't need to maintain a specific
> API for poison handling fallback on MCA platform - Aldebaran is the last SOC
> that supports this A + A RAS design. I can confirm we'll move to a new design
> going forward.
> 
> Regards,
> Hawking

[Tao] adding amdgpu_umc_poison_handler_mca is for better extension, although it only calls gpu reset right now. But since A + A RAS design will change dramatically, I'll remove amdgpu_umc_poison_handler_mca as you suggested.

> 
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@amd.com>
> Sent: Friday, October 21, 2022 10:54
> To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-
> gfx@lists.freedesktop.org; Yang, Stanley <Stanley.Yang@amd.com>; Chai,
> Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for
> MCA
> 
> [AMD Official Use Only - General]
> 
> 
> > -----Original Message-----
> > From: Zhang, Hawking <Hawking.Zhang@amd.com>
> > Sent: Thursday, October 20, 2022 5:13 PM
> > To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org;
> > Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas
> > <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
> > Subject: RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions
> > for MCA
> >
> > [AMD Official Use Only - General]
> >
> > +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
> > +		struct ras_err_data *err_data, bool reset)
> >
> >
> > +	if (adev->gmc.xgmi.connected_to_cpu) {
> > +		ret = amdgpu_umc_poison_handler_mca(adev,
> > ras_error_status, reset);
> >
> > The input parameters "reset" and "err_data" can be dropped since
> > amdgpu_umc_poison_handler_mca is dedicated to MCA platform
> 
> [Tao] whether need to do gpu reset is determined by unmap queue status, so
> reset parameter can't be dropped.
> For "err_data", it can be removed currently, but I'm afraid we may need it on
> other ASICs in the future.
> 
> >
> > Regards,
> > Hawking
> >
> > -----Original Message-----
> > From: Zhou1, Tao <Tao.Zhou1@amd.com>
> > Sent: Wednesday, October 19, 2022 16:12
> > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> > <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai,
> > Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
> > Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> > Subject: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for
> > MCA
> >
> > Define page retirement functions for MCA platform.
> >
> > v2: remove page retirement handling from MCA poison handler,
> >     let MCA notifier do page retirement.
> >
> > Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 67
> > +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > |  2 +
> >  2 files changed, 69 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > index aad3c8b4c810..9494fa14db9a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > @@ -22,6 +22,73 @@
> >   */
> >
> >  #include "amdgpu.h"
> > +#include "umc_v6_7.h"
> > +
> > +static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
> > +				    struct ras_err_data *err_data, uint64_t
> > err_addr,
> > +				    uint32_t ch_inst, uint32_t umc_inst) {
> > +	switch (adev->ip_versions[UMC_HWIP][0]) {
> > +	case IP_VERSION(6, 7, 0):
> > +		umc_v6_7_convert_error_address(adev,
> > +				err_data, err_addr, ch_inst, umc_inst);
> > +		break;
> > +	default:
> > +		dev_warn(adev->dev,
> > +			 "UMC address to Physical address translation is not
> > supported\n");
> > +		return AMDGPU_RAS_FAIL;
> > +	}
> > +
> > +	return AMDGPU_RAS_SUCCESS;
> > +}
> > +
> > +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
> > +			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
> > {
> > +	struct ras_err_data err_data = {0, 0, 0, NULL};
> > +	int ret = AMDGPU_RAS_FAIL;
> > +
> > +	err_data.err_addr =
> > +		kcalloc(adev->umc.max_ras_err_cnt_per_query,
> > +			sizeof(struct eeprom_table_record), GFP_KERNEL);
> > +	if (!err_data.err_addr) {
> > +		dev_warn(adev->dev,
> > +			"Failed to alloc memory for umc error record in MCA
> > notifier!\n");
> > +		return AMDGPU_RAS_FAIL;
> > +	}
> > +
> > +	/*
> > +	 * Translate UMC channel address to Physical address
> > +	 */
> > +	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
> > +					ch_inst, umc_inst);
> > +	if (ret)
> > +		goto out;
> > +
> > +	if (amdgpu_bad_page_threshold != 0) {
> > +		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> > +						err_data.err_addr_cnt);
> > +		amdgpu_ras_save_bad_pages(adev);
> > +	}
> > +
> > +out:
> > +	kfree(err_data.err_addr);
> > +	return ret;
> > +}
> > +
> > +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
> > +		struct ras_err_data *err_data, bool reset) {
> > +	/* MCA poison handler is only responsible for GPU reset,
> > +	 * let MCA notifier do page retirement.
> > +	 */
> > +	if (reset) {
> > +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> > +		amdgpu_ras_reset_gpu(adev);
> > +	}
> > +
> > +	return AMDGPU_RAS_SUCCESS;
> > +}
> >
> >  static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> >  		void *ras_error_status,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > index 3629d8f292ef..659a10de29c9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > @@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct
> > ras_err_data *err_data,  int amdgpu_umc_process_ras_data_cb(struct
> amdgpu_device *adev,
> >  		void *ras_error_status,
> >  		struct amdgpu_iv_entry *entry);
> > +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
> > +			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
> >  #endif
> > --
> > 2.35.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler
  2022-10-21  7:36 ` [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler Tao Zhou
@ 2022-10-21  8:21   ` Zhang, Hawking
  0 siblings, 0 replies; 13+ messages in thread
From: Zhang, Hawking @ 2022-10-21  8:21 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Chai, Thomas, Li, Candice

[AMD Official Use Only - General]

Series is

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Friday, October 21, 2022 15:36
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Li, Candice <Candice.Li@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler

Make the code more simple.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 13 +++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h    |  4 +---
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0561812aa0a4..37db39ba8718 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -753,9 +753,7 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
 
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)  {
-	struct ras_err_data err_data = {0, 0, 0, NULL};
-
-	amdgpu_umc_poison_handler(adev, &err_data, reset);
+	amdgpu_umc_poison_handler(adev, reset);
 }
 
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 28463b47ce33..693bce07eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1561,7 +1561,6 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  {
 	bool poison_stat = false;
 	struct amdgpu_device *adev = obj->adev;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
@@ -1584,7 +1583,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	}
 
 	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_poison_handler(adev, &err_data, false);
+		amdgpu_umc_poison_handler(adev, false);
 
 	if (block_obj->hw_ops->handle_poison_consumption)
 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 758942150c09..f76c19fc0392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -165,25 +165,22 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	return AMDGPU_RAS_SUCCESS;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
 	int ret = AMDGPU_RAS_SUCCESS;
 
 	if (!adev->gmc.xgmi.connected_to_cpu) {
-		struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+		struct ras_err_data err_data = {0, 0, 0, NULL};
 		struct ras_common_if head = {
 			.block = AMDGPU_RAS_BLOCK__UMC,
 		};
 		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-		ret =
-			amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
 
 		if (ret == AMDGPU_RAS_SUCCESS && obj) {
-			obj->err_data.ue_count += err_data->ue_count;
-			obj->err_data.ce_count += err_data->ce_count;
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
 		}
 	} else if (reset) {
 		/* MCA poison handler is only responsible for GPU reset, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 659a10de29c9..a6951160f13a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -83,9 +83,7 @@ struct amdgpu_umc {
 };
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
--
2.35.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler
  2022-10-21  7:36 Tao Zhou
@ 2022-10-21  7:36 ` Tao Zhou
  2022-10-21  8:21   ` Zhang, Hawking
  0 siblings, 1 reply; 13+ messages in thread
From: Tao Zhou @ 2022-10-21  7:36 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, candice.li; +Cc: Tao Zhou

Make the code more simple.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 13 +++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h    |  4 +---
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0561812aa0a4..37db39ba8718 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -753,9 +753,7 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
 
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
 {
-	struct ras_err_data err_data = {0, 0, 0, NULL};
-
-	amdgpu_umc_poison_handler(adev, &err_data, reset);
+	amdgpu_umc_poison_handler(adev, reset);
 }
 
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 28463b47ce33..693bce07eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1561,7 +1561,6 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 {
 	bool poison_stat = false;
 	struct amdgpu_device *adev = obj->adev;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
@@ -1584,7 +1583,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	}
 
 	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_poison_handler(adev, &err_data, false);
+		amdgpu_umc_poison_handler(adev, false);
 
 	if (block_obj->hw_ops->handle_poison_consumption)
 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 758942150c09..f76c19fc0392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -165,25 +165,22 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	return AMDGPU_RAS_SUCCESS;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
 	int ret = AMDGPU_RAS_SUCCESS;
 
 	if (!adev->gmc.xgmi.connected_to_cpu) {
-		struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+		struct ras_err_data err_data = {0, 0, 0, NULL};
 		struct ras_common_if head = {
 			.block = AMDGPU_RAS_BLOCK__UMC,
 		};
 		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-		ret =
-			amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
 
 		if (ret == AMDGPU_RAS_SUCCESS && obj) {
-			obj->err_data.ue_count += err_data->ue_count;
-			obj->err_data.ce_count += err_data->ce_count;
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
 		}
 	} else if (reset) {
 		/* MCA poison handler is only responsible for GPU reset,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 659a10de29c9..a6951160f13a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -83,9 +83,7 @@ struct amdgpu_umc {
 };
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler
  2022-10-18  8:31 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
@ 2022-10-18  8:32 ` Tao Zhou
  0 siblings, 0 replies; 13+ messages in thread
From: Tao Zhou @ 2022-10-18  8:32 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, YiPeng.Chai, candice.li; +Cc: Tao Zhou

Make the code more simple.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 16 ++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h    |  4 +---
 4 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0561812aa0a4..37db39ba8718 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -753,9 +753,7 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
 
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
 {
-	struct ras_err_data err_data = {0, 0, 0, NULL};
-
-	amdgpu_umc_poison_handler(adev, &err_data, reset);
+	amdgpu_umc_poison_handler(adev, reset);
 }
 
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 28463b47ce33..693bce07eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1561,7 +1561,6 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 {
 	bool poison_stat = false;
 	struct amdgpu_device *adev = obj->adev;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
@@ -1584,7 +1583,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	}
 
 	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_poison_handler(adev, &err_data, false);
+		amdgpu_umc_poison_handler(adev, false);
 
 	if (block_obj->hw_ops->handle_poison_consumption)
 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 22995f371eae..481a8d69911e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -224,27 +224,23 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	return AMDGPU_RAS_SUCCESS;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
+	struct ras_err_data err_data = {0, 0, 0, NULL};
 	int ret;
 
 	if (adev->gmc.xgmi.connected_to_cpu) {
-		ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset);
+		ret = amdgpu_umc_poison_handler_mca(adev, &err_data, reset);
 	} else {
-		struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 		struct ras_common_if head = {
 			.block = AMDGPU_RAS_BLOCK__UMC,
 		};
 		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-		ret =
-			amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
-
+		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
 		if (ret == AMDGPU_RAS_SUCCESS && obj) {
-			obj->err_data.ue_count += err_data->ue_count;
-			obj->err_data.ce_count += err_data->ce_count;
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 659a10de29c9..a6951160f13a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -83,9 +83,7 @@ struct amdgpu_umc {
 };
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-10-21  8:22 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-19  8:11 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
2022-10-19  8:11 ` [PATCH 2/4] drm/amdgpu: use page retirement API in MCA notifier Tao Zhou
2022-10-19  8:11 ` [PATCH 3/4] drm/amdgpu: add RAS poison handling for MCA Tao Zhou
2022-10-19  8:11 ` [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler Tao Zhou
2022-10-20  9:29   ` Zhang, Hawking
2022-10-21  2:51     ` Zhou1, Tao
2022-10-20  9:12 ` [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Zhang, Hawking
2022-10-21  2:53   ` Zhou1, Tao
2022-10-21  4:15     ` Zhang, Hawking
2022-10-21  6:27       ` Zhou1, Tao
  -- strict thread matches above, loose matches on Subject: below --
2022-10-21  7:36 Tao Zhou
2022-10-21  7:36 ` [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler Tao Zhou
2022-10-21  8:21   ` Zhang, Hawking
2022-10-18  8:31 [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA Tao Zhou
2022-10-18  8:32 ` [PATCH 4/4] drm/amdgpu: remove ras_error_status parameter for UMC poison handler Tao Zhou

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.