All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu: add poison mode query for UMC
@ 2021-09-18  8:07 Tao Zhou
  2021-09-18  8:07 ` [PATCH 2/3] drm/amdgpu: set poison mode for RAS Tao Zhou
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Tao Zhou @ 2021-09-18  8:07 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, john.clements, stanley.yang; +Cc: Tao Zhou

Add ras poison mode query interface for UMC.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   | 34 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index e5a75fb788dd..1f5fe2315236 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -48,6 +48,7 @@ struct amdgpu_umc_ras_funcs {
 				      void *ras_error_status);
 	void (*query_ras_error_address)(struct amdgpu_device *adev,
 					void *ras_error_status);
+	bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_umc_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index bb30336b1e8d..f7ec3fe134e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -288,9 +288,43 @@ static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
 	}
 }
 
+static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
+						struct amdgpu_device *adev,
+						uint32_t umc_reg_offset)
+{
+	uint32_t ecc_ctrl_addr, ecc_ctrl;
+
+	ecc_ctrl_addr =
+		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
+	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
+					umc_reg_offset) * 4);
+
+	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
+}
+
+static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
+{
+	uint32_t umc_inst        = 0;
+	uint32_t ch_inst         = 0;
+	uint32_t umc_reg_offset  = 0;
+
+	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
+		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
+							umc_inst,
+							ch_inst);
+		/* Enabling fatal error in one channel will be considered
+		   as fatal error mode */
+		if (umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset))
+			return false;
+	}
+
+	return true;
+}
+
 const struct amdgpu_umc_ras_funcs umc_v6_7_ras_funcs = {
 	.ras_late_init = amdgpu_umc_ras_late_init,
 	.ras_fini = amdgpu_umc_ras_fini,
 	.query_ras_error_count = umc_v6_7_query_ras_error_count,
 	.query_ras_error_address = umc_v6_7_query_ras_error_address,
+	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
 };
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/3] drm/amdgpu: set poison mode for RAS
  2021-09-18  8:07 [PATCH 1/3] drm/amdgpu: add poison mode query for UMC Tao Zhou
@ 2021-09-18  8:07 ` Tao Zhou
  2021-09-18  8:59   ` Zhang, Hawking
  2021-09-18  8:07 ` [PATCH 3/3] drm/amdgpu: skip umc ras irq handling in poison mode Tao Zhou
  2021-09-18  8:59 ` [PATCH 1/3] drm/amdgpu: add poison mode query for UMC Zhang, Hawking
  2 siblings, 1 reply; 8+ messages in thread
From: Tao Zhou @ 2021-09-18  8:07 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, john.clements, stanley.yang; +Cc: Tao Zhou

Add RAS poison mode flag and tell PSP RAS TA about the info.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7d09b28889af..140b94da2f5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1442,9 +1442,9 @@ static int psp_ras_initialize(struct psp_context *psp)
 	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
 	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
 
-	if (psp->adev->gmc.xgmi.connected_to_cpu)
+	if (amdgpu_ras_is_poison_enabled(adev))
 		ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
-	else
+	if (!adev->gmc.xgmi.connected_to_cpu)
 		ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
 
 	ret = psp_ras_load(psp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b5332db4d287..7b7e54fdd785 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2180,6 +2180,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	int r;
+	bool df_poison, umc_poison;
 
 	if (con)
 		return 0;
@@ -2249,6 +2250,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			goto release_con;
 	}
 
+	/* Init poison mode, the default value is false */
+	if (adev->df.funcs &&
+	    adev->df.funcs->query_ras_poison_mode &&
+	    adev->umc.ras_funcs &&
+	    adev->umc.ras_funcs->query_ras_poison_mode) {
+		df_poison =
+			adev->df.funcs->query_ras_poison_mode(adev);
+		umc_poison =
+			adev->umc.ras_funcs->query_ras_poison_mode(adev);
+		/* Only poison is set in both DF and UMC, we can enable it */
+		if (df_poison && umc_poison)
+			con->poison_mode_en = true;
+		else if (df_poison != umc_poison)
+			dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+					df_poison, umc_poison);
+	}
+
 	if (amdgpu_ras_fs_init(adev)) {
 		r = -EINVAL;
 		goto release_con;
@@ -2292,6 +2310,16 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
 	return 0;
 }
 
+bool amdgpu_ras_is_poison_enabled(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+       if (!con)
+               return false;
+
+       return con->poison_mode_en;
+}
+
 /* helper function to handle common stuff in ip late init phase */
 int amdgpu_ras_late_init(struct amdgpu_device *adev,
 			 struct ras_common_if *ras_block,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 1670467c2054..044bd19b7cce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -345,6 +345,9 @@ struct amdgpu_ras {
 	/* disable ras error count harvest in recovery */
 	bool disable_ras_err_cnt_harvest;
 
+	/* is poison mode */
+	bool poison_mode_en;
+
 	/* RAS count errors delayed work */
 	struct delayed_work ras_counte_delay_work;
 	atomic_t ras_ue_count;
@@ -640,4 +643,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev);
 
 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);
 
+bool amdgpu_ras_is_poison_enabled(struct amdgpu_device *adev);
+
 #endif
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/3] drm/amdgpu: skip umc ras irq handling in poison mode
  2021-09-18  8:07 [PATCH 1/3] drm/amdgpu: add poison mode query for UMC Tao Zhou
  2021-09-18  8:07 ` [PATCH 2/3] drm/amdgpu: set poison mode for RAS Tao Zhou
@ 2021-09-18  8:07 ` Tao Zhou
  2021-09-18  9:10   ` Zhang, Hawking
  2021-09-18  8:59 ` [PATCH 1/3] drm/amdgpu: add poison mode query for UMC Zhang, Hawking
  2 siblings, 1 reply; 8+ messages in thread
From: Tao Zhou @ 2021-09-18  8:07 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, john.clements, stanley.yang; +Cc: Tao Zhou

In ras poison mode, umc uncorrectable error will be ignored until
the corrupted data consumed by another ras module (such as gfx, sdma).

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7b7e54fdd785..195637725c7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1473,22 +1473,28 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
 		data->rptr = (data->aligned_element_size +
 				data->rptr) % data->ring_size;
 
-		/* Let IP handle its data, maybe we need get the output
-		 * from the callback to udpate the error type/count, etc
-		 */
 		if (data->cb) {
-			ret = data->cb(obj->adev, &err_data, &entry);
-			/* ue will trigger an interrupt, and in that case
-			 * we need do a reset to recovery the whole system.
-			 * But leave IP do that recovery, here we just dispatch
-			 * the error.
-			 */
-			if (ret == AMDGPU_RAS_SUCCESS) {
-				/* these counts could be left as 0 if
-				 * some blocks do not count error number
+			if (amdgpu_ras_is_poison_enabled(obj->adev) &&
+			    obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+				dev_warn(obj->adev->dev, "Poison mode, no need to do page retirement"
+								"in UMC ras handler!\n");
+			else {
+				/* Let IP handle its data, maybe we need get the output
+				 * from the callback to udpate the error type/count, etc
+				 */
+				ret = data->cb(obj->adev, &err_data, &entry);
+				/* ue will trigger an interrupt, and in that case
+				 * we need do a reset to recovery the whole system.
+				 * But leave IP do that recovery, here we just dispatch
+				 * the error.
 				 */
-				obj->err_data.ue_count += err_data.ue_count;
-				obj->err_data.ce_count += err_data.ce_count;
+				if (ret == AMDGPU_RAS_SUCCESS) {
+					/* these counts could be left as 0 if
+					 * some blocks do not count error number
+					 */
+					obj->err_data.ue_count += err_data.ue_count;
+					obj->err_data.ce_count += err_data.ce_count;
+				}
 			}
 		}
 	}
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: set poison mode for RAS
  2021-09-18  8:07 ` [PATCH 2/3] drm/amdgpu: set poison mode for RAS Tao Zhou
@ 2021-09-18  8:59   ` Zhang, Hawking
  2021-09-18  9:32     ` Zhou1, Tao
  0 siblings, 1 reply; 8+ messages in thread
From: Zhang, Hawking @ 2021-09-18  8:59 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Clements, John, Yang, Stanley

[AMD Official Use Only]

+	if (amdgpu_ras_is_poison_enabled(adev))
 		ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
-	else
+	if (!adev->gmc.xgmi.connected_to_cpu)
 		ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;

I'd expect these flags are set in enable_feature command per IP block if needed. Instead of global setting at firmware/TA initialization phase, thoughts?

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Saturday, September 18, 2021 16:08
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 2/3] drm/amdgpu: set poison mode for RAS

Add RAS poison mode flag and tell PSP RAS TA about the info.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  4 ++--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 +++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7d09b28889af..140b94da2f5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1442,9 +1442,9 @@ static int psp_ras_initialize(struct psp_context *psp)
 	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
 	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
 
-	if (psp->adev->gmc.xgmi.connected_to_cpu)
+	if (amdgpu_ras_is_poison_enabled(adev))
 		ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
-	else
+	if (!adev->gmc.xgmi.connected_to_cpu)
 		ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
 
 	ret = psp_ras_load(psp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b5332db4d287..7b7e54fdd785 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2180,6 +2180,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	int r;
+	bool df_poison, umc_poison;
 
 	if (con)
 		return 0;
@@ -2249,6 +2250,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			goto release_con;
 	}
 
+	/* Init poison mode, the default value is false */
+	if (adev->df.funcs &&
+	    adev->df.funcs->query_ras_poison_mode &&
+	    adev->umc.ras_funcs &&
+	    adev->umc.ras_funcs->query_ras_poison_mode) {
+		df_poison =
+			adev->df.funcs->query_ras_poison_mode(adev);
+		umc_poison =
+			adev->umc.ras_funcs->query_ras_poison_mode(adev);
+		/* Only poison is set in both DF and UMC, we can enable it */
+		if (df_poison && umc_poison)
+			con->poison_mode_en = true;
+		else if (df_poison != umc_poison)
+			dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+					df_poison, umc_poison);
+	}
+
 	if (amdgpu_ras_fs_init(adev)) {
 		r = -EINVAL;
 		goto release_con;
@@ -2292,6 +2310,16 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
 	return 0;
 }
 
+bool amdgpu_ras_is_poison_enabled(struct amdgpu_device *adev) {
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+       if (!con)
+               return false;
+
+       return con->poison_mode_en;
+}
+
 /* helper function to handle common stuff in ip late init phase */  int amdgpu_ras_late_init(struct amdgpu_device *adev,
 			 struct ras_common_if *ras_block,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 1670467c2054..044bd19b7cce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -345,6 +345,9 @@ struct amdgpu_ras {
 	/* disable ras error count harvest in recovery */
 	bool disable_ras_err_cnt_harvest;
 
+	/* is poison mode */
+	bool poison_mode_en;
+
 	/* RAS count errors delayed work */
 	struct delayed_work ras_counte_delay_work;
 	atomic_t ras_ue_count;
@@ -640,4 +643,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev);
 
 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);
 
+bool amdgpu_ras_is_poison_enabled(struct amdgpu_device *adev);
+
 #endif
--
2.17.1

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu: add poison mode query for UMC
  2021-09-18  8:07 [PATCH 1/3] drm/amdgpu: add poison mode query for UMC Tao Zhou
  2021-09-18  8:07 ` [PATCH 2/3] drm/amdgpu: set poison mode for RAS Tao Zhou
  2021-09-18  8:07 ` [PATCH 3/3] drm/amdgpu: skip umc ras irq handling in poison mode Tao Zhou
@ 2021-09-18  8:59 ` Zhang, Hawking
  2 siblings, 0 replies; 8+ messages in thread
From: Zhang, Hawking @ 2021-09-18  8:59 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Clements, John, Yang, Stanley

[AMD Official Use Only]

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Saturday, September 18, 2021 16:08
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/3] drm/amdgpu: add poison mode query for UMC

Add ras poison mode query interface for UMC.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   | 34 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index e5a75fb788dd..1f5fe2315236 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -48,6 +48,7 @@ struct amdgpu_umc_ras_funcs {
 				      void *ras_error_status);
 	void (*query_ras_error_address)(struct amdgpu_device *adev,
 					void *ras_error_status);
+	bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_umc_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index bb30336b1e8d..f7ec3fe134e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -288,9 +288,43 @@ static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
 	}
 }
 
+static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
+						struct amdgpu_device *adev,
+						uint32_t umc_reg_offset)
+{
+	uint32_t ecc_ctrl_addr, ecc_ctrl;
+
+	ecc_ctrl_addr =
+		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
+	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
+					umc_reg_offset) * 4);
+
+	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn); }
+
+static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev) 
+{
+	uint32_t umc_inst        = 0;
+	uint32_t ch_inst         = 0;
+	uint32_t umc_reg_offset  = 0;
+
+	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
+		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
+							umc_inst,
+							ch_inst);
+		/* Enabling fatal error in one channel will be considered
+		   as fatal error mode */
+		if (umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset))
+			return false;
+	}
+
+	return true;
+}
+
 const struct amdgpu_umc_ras_funcs umc_v6_7_ras_funcs = {
 	.ras_late_init = amdgpu_umc_ras_late_init,
 	.ras_fini = amdgpu_umc_ras_fini,
 	.query_ras_error_count = umc_v6_7_query_ras_error_count,
 	.query_ras_error_address = umc_v6_7_query_ras_error_address,
+	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
 };
--
2.17.1

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: skip umc ras irq handling in poison mode
  2021-09-18  8:07 ` [PATCH 3/3] drm/amdgpu: skip umc ras irq handling in poison mode Tao Zhou
@ 2021-09-18  9:10   ` Zhang, Hawking
  0 siblings, 0 replies; 8+ messages in thread
From: Zhang, Hawking @ 2021-09-18  9:10 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Clements, John, Yang, Stanley

[AMD Official Use Only]

+				dev_warn(obj->adev->dev, "Poison mode, no need to do page retirement"
+								"in UMC ras handler!\n");

I suggest make it to be dbg message or just drop it. The message more likely introduces confusion to regular user.

In PCIe-connected mode, the df_mca interrupt is used to inform the poison creation, I'm thinking of one the following handling
a). even do not register that interrupt in such case *or*
b). make a new callback just used to inform user poison creation event (i.e. leveraging umc error address look up and convert function to get the physical address, explicitly highlight this in kernel message)

thoughts?

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Saturday, September 18, 2021 16:08
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 3/3] drm/amdgpu: skip umc ras irq handling in poison mode

In ras poison mode, umc uncorrectable error will be ignored until the corrupted data consumed by another ras module (such as gfx, sdma).

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7b7e54fdd785..195637725c7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1473,22 +1473,28 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
 		data->rptr = (data->aligned_element_size +
 				data->rptr) % data->ring_size;
 
-		/* Let IP handle its data, maybe we need get the output
-		 * from the callback to udpate the error type/count, etc
-		 */
 		if (data->cb) {
-			ret = data->cb(obj->adev, &err_data, &entry);
-			/* ue will trigger an interrupt, and in that case
-			 * we need do a reset to recovery the whole system.
-			 * But leave IP do that recovery, here we just dispatch
-			 * the error.
-			 */
-			if (ret == AMDGPU_RAS_SUCCESS) {
-				/* these counts could be left as 0 if
-				 * some blocks do not count error number
+			if (amdgpu_ras_is_poison_enabled(obj->adev) &&
+			    obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+				dev_warn(obj->adev->dev, "Poison mode, no need to do page retirement"
+								"in UMC ras handler!\n");
+			else {
+				/* Let IP handle its data, maybe we need get the output
+				 * from the callback to udpate the error type/count, etc
+				 */
+				ret = data->cb(obj->adev, &err_data, &entry);
+				/* ue will trigger an interrupt, and in that case
+				 * we need do a reset to recovery the whole system.
+				 * But leave IP do that recovery, here we just dispatch
+				 * the error.
 				 */
-				obj->err_data.ue_count += err_data.ue_count;
-				obj->err_data.ce_count += err_data.ce_count;
+				if (ret == AMDGPU_RAS_SUCCESS) {
+					/* these counts could be left as 0 if
+					 * some blocks do not count error number
+					 */
+					obj->err_data.ue_count += err_data.ue_count;
+					obj->err_data.ce_count += err_data.ce_count;
+				}
 			}
 		}
 	}
--
2.17.1

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: set poison mode for RAS
  2021-09-18  8:59   ` Zhang, Hawking
@ 2021-09-18  9:32     ` Zhou1, Tao
  0 siblings, 0 replies; 8+ messages in thread
From: Zhou1, Tao @ 2021-09-18  9:32 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx, Clements, John, Yang, Stanley

[-- Attachment #1: Type: text/plain, Size: 5545 bytes --]

[AMD Official Use Only]

Poison mode is a global setting currently, will we set it per IP block in the future?
For example, set poison mode for GFX but fatal error mode for SDMA?

dgpu_mode is disabled when connected_to_cpu is 1, is irrelevant to IP block.

Regards,
Tao
________________________________
From: Zhang, Hawking <Hawking.Zhang@amd.com>
Sent: Saturday, September 18, 2021 4:59 PM
To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Clements, John <John.Clements@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH 2/3] drm/amdgpu: set poison mode for RAS

[AMD Official Use Only]

+       if (amdgpu_ras_is_poison_enabled(adev))
                 ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
-       else
+       if (!adev->gmc.xgmi.connected_to_cpu)
                 ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;

I'd expect these flags are set in enable_feature command per IP block if needed. Instead of global setting at firmware/TA initialization phase, thoughts?

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Saturday, September 18, 2021 16:08
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 2/3] drm/amdgpu: set poison mode for RAS

Add RAS poison mode flag and tell PSP RAS TA about the info.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  4 ++--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 +++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7d09b28889af..140b94da2f5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1442,9 +1442,9 @@ static int psp_ras_initialize(struct psp_context *psp)
         ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
         memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));

-       if (psp->adev->gmc.xgmi.connected_to_cpu)
+       if (amdgpu_ras_is_poison_enabled(adev))
                 ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
-       else
+       if (!adev->gmc.xgmi.connected_to_cpu)
                 ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;

         ret = psp_ras_load(psp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b5332db4d287..7b7e54fdd785 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2180,6 +2180,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  {
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         int r;
+       bool df_poison, umc_poison;

         if (con)
                 return 0;
@@ -2249,6 +2250,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                         goto release_con;
         }

+       /* Init poison mode, the default value is false */
+       if (adev->df.funcs &&
+           adev->df.funcs->query_ras_poison_mode &&
+           adev->umc.ras_funcs &&
+           adev->umc.ras_funcs->query_ras_poison_mode) {
+               df_poison =
+                       adev->df.funcs->query_ras_poison_mode(adev);
+               umc_poison =
+                       adev->umc.ras_funcs->query_ras_poison_mode(adev);
+               /* Only poison is set in both DF and UMC, we can enable it */
+               if (df_poison && umc_poison)
+                       con->poison_mode_en = true;
+               else if (df_poison != umc_poison)
+                       dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+                                       df_poison, umc_poison);
+       }
+
         if (amdgpu_ras_fs_init(adev)) {
                 r = -EINVAL;
                 goto release_con;
@@ -2292,6 +2310,16 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
         return 0;
 }

+bool amdgpu_ras_is_poison_enabled(struct amdgpu_device *adev) {
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+       if (!con)
+               return false;
+
+       return con->poison_mode_en;
+}
+
 /* helper function to handle common stuff in ip late init phase */  int amdgpu_ras_late_init(struct amdgpu_device *adev,
                          struct ras_common_if *ras_block,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 1670467c2054..044bd19b7cce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -345,6 +345,9 @@ struct amdgpu_ras {
         /* disable ras error count harvest in recovery */
         bool disable_ras_err_cnt_harvest;

+       /* is poison mode */
+       bool poison_mode_en;
+
         /* RAS count errors delayed work */
         struct delayed_work ras_counte_delay_work;
         atomic_t ras_ue_count;
@@ -640,4 +643,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev);

 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);

+bool amdgpu_ras_is_poison_enabled(struct amdgpu_device *adev);
+
 #endif
--
2.17.1

[-- Attachment #2: Type: text/html, Size: 11505 bytes --]

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 1/3] drm/amdgpu: add poison mode query for UMC
@ 2021-09-22 10:32 Tao Zhou
  0 siblings, 0 replies; 8+ messages in thread
From: Tao Zhou @ 2021-09-22 10:32 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, john.clements, stanley.yang; +Cc: Tao Zhou

Add ras poison mode query interface for UMC.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   | 34 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index e5a75fb788dd..1f5fe2315236 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -48,6 +48,7 @@ struct amdgpu_umc_ras_funcs {
 				      void *ras_error_status);
 	void (*query_ras_error_address)(struct amdgpu_device *adev,
 					void *ras_error_status);
+	bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_umc_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index bb30336b1e8d..f7ec3fe134e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -288,9 +288,43 @@ static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
 	}
 }
 
+static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
+						struct amdgpu_device *adev,
+						uint32_t umc_reg_offset)
+{
+	uint32_t ecc_ctrl_addr, ecc_ctrl;
+
+	ecc_ctrl_addr =
+		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
+	ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
+					umc_reg_offset) * 4);
+
+	return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn);
+}
+
+static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
+{
+	uint32_t umc_inst        = 0;
+	uint32_t ch_inst         = 0;
+	uint32_t umc_reg_offset  = 0;
+
+	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
+		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
+							umc_inst,
+							ch_inst);
+		/* Enabling fatal error in one channel will be considered
+		   as fatal error mode */
+		if (umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset))
+			return false;
+	}
+
+	return true;
+}
+
 const struct amdgpu_umc_ras_funcs umc_v6_7_ras_funcs = {
 	.ras_late_init = amdgpu_umc_ras_late_init,
 	.ras_fini = amdgpu_umc_ras_fini,
 	.query_ras_error_count = umc_v6_7_query_ras_error_count,
 	.query_ras_error_address = umc_v6_7_query_ras_error_address,
+	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
 };
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-09-22 10:33 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-18  8:07 [PATCH 1/3] drm/amdgpu: add poison mode query for UMC Tao Zhou
2021-09-18  8:07 ` [PATCH 2/3] drm/amdgpu: set poison mode for RAS Tao Zhou
2021-09-18  8:59   ` Zhang, Hawking
2021-09-18  9:32     ` Zhou1, Tao
2021-09-18  8:07 ` [PATCH 3/3] drm/amdgpu: skip umc ras irq handling in poison mode Tao Zhou
2021-09-18  9:10   ` Zhang, Hawking
2021-09-18  8:59 ` [PATCH 1/3] drm/amdgpu: add poison mode query for UMC Zhang, Hawking
2021-09-22 10:32 Tao Zhou

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.