All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
@ 2022-05-25  6:10 Stanley.Yang
  2022-05-25  6:10 ` [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
  2022-05-25 12:33 ` [PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Lazar, Lijo
  0 siblings, 2 replies; 7+ messages in thread
From: Stanley.Yang @ 2022-05-25  6:10 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, tao.zhou1, evan.quan, lijo.lazar; +Cc: Stanley.Yang

The SMU adds a new variable, mca_ceumc_addr, to record the
UMC correctable error address in the EccInfo table; the
driver side adds EccInfo_V2_t to support this feature.

Changed from V1:
	remove ecc_table_v2 and the unnecessary table id; define a union that
	includes EccInfo_t and EccInfo_V2_t.

Changed from V2:
	sync patch version

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  1 +
 .../inc/pmfw_if/smu13_driver_if_aldebaran.h   | 16 +++++-
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 53 ++++++++++++++-----
 3 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b9a6fac2b8b2..28e603243b67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -328,6 +328,7 @@ struct ecc_info_per_ch {
 	uint16_t ce_count_hi_chip;
 	uint64_t mca_umc_status;
 	uint64_t mca_umc_addr;
+	uint64_t mca_ceumc_addr;
 };
 
 struct umc_ecc_info {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
index 0f67c56c2863..6f92038470ec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
@@ -519,7 +519,21 @@ typedef struct {
 } EccInfo_t;
 
 typedef struct {
-	EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+	uint64_t mca_umc_status;
+	uint64_t mca_umc_addr;
+	uint64_t mca_ceumc_addr;
+
+	uint16_t ce_count_lo_chip;
+	uint16_t ce_count_hi_chip;
+
+	uint32_t eccPadding;
+} EccInfo_V2_t;
+
+typedef struct {
+	union {
+		EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+		EccInfo_V2_t EccInfo_V2[ALDEBARAN_UMC_CHANNEL_NUM];
+	};
 } EccInfoTable_t;
 
 // These defines are used with the following messages:
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 38af648cb857..9cdfeea58085 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -82,6 +82,12 @@
  */
 #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
 
+/*
+ * SMU supports mca_ceumc_addr in ECCTABLE since version 68.55.0;
+ * use this to check whether recording mca_ceumc_addr is supported.
+ */
+#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
+
 /*
  * SMU support BAD CHENNEL info MSG since version 68.51.00,
  * use this to check ECCTALE feature whether support
@@ -1802,7 +1808,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
 	return sizeof(struct gpu_metrics_v1_3);
 }
 
-static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+static int aldebaran_check_ecc_table_support(struct smu_context *smu,
+		int *ecctable_version)
 {
 	uint32_t if_version = 0xff, smu_version = 0xff;
 	int ret = 0;
@@ -1815,6 +1822,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
 
 	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
 		ret = -EOPNOTSUPP;
+	else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
+			smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
+		*ecctable_version = 1;
+	else
+		*ecctable_version = 2;
 
 	return ret;
 }
@@ -1826,9 +1838,10 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 	EccInfoTable_t *ecc_table = NULL;
 	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
 	int i, ret = 0;
+	int table_version = 0;
 	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
 
-	ret = aldebaran_check_ecc_table_support(smu);
+	ret = aldebaran_check_ecc_table_support(smu, &table_version);
 	if (ret)
 		return ret;
 
@@ -1844,16 +1857,32 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 
 	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
 
-	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
-		ecc_info_per_channel = &(eccinfo->ecc[i]);
-		ecc_info_per_channel->ce_count_lo_chip =
-			ecc_table->EccInfo[i].ce_count_lo_chip;
-		ecc_info_per_channel->ce_count_hi_chip =
-			ecc_table->EccInfo[i].ce_count_hi_chip;
-		ecc_info_per_channel->mca_umc_status =
-			ecc_table->EccInfo[i].mca_umc_status;
-		ecc_info_per_channel->mca_umc_addr =
-			ecc_table->EccInfo[i].mca_umc_addr;
+	if (table_version == 1) {
+		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+			ecc_info_per_channel = &(eccinfo->ecc[i]);
+			ecc_info_per_channel->ce_count_lo_chip =
+				ecc_table->EccInfo[i].ce_count_lo_chip;
+			ecc_info_per_channel->ce_count_hi_chip =
+				ecc_table->EccInfo[i].ce_count_hi_chip;
+			ecc_info_per_channel->mca_umc_status =
+				ecc_table->EccInfo[i].mca_umc_status;
+			ecc_info_per_channel->mca_umc_addr =
+				ecc_table->EccInfo[i].mca_umc_addr;
+		}
+	} else if (table_version == 2) {
+		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+			ecc_info_per_channel = &(eccinfo->ecc[i]);
+			ecc_info_per_channel->ce_count_lo_chip =
+				ecc_table->EccInfo_V2[i].ce_count_lo_chip;
+			ecc_info_per_channel->ce_count_hi_chip =
+				ecc_table->EccInfo_V2[i].ce_count_hi_chip;
+			ecc_info_per_channel->mca_umc_status =
+				ecc_table->EccInfo_V2[i].mca_umc_status;
+			ecc_info_per_channel->mca_umc_addr =
+				ecc_table->EccInfo_V2[i].mca_umc_addr;
+			ecc_info_per_channel->mca_ceumc_addr =
+				ecc_table->EccInfo_V2[i].mca_ceumc_addr;
+		}
 	}
 
 	return ret;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address
  2022-05-25  6:10 [PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Stanley.Yang
@ 2022-05-25  6:10 ` Stanley.Yang
  2022-05-25  6:52   ` Wang, Yang(Kevin)
  2022-05-25 12:38   ` Lazar, Lijo
  2022-05-25 12:33 ` [PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Lazar, Lijo
  1 sibling, 2 replies; 7+ messages in thread
From: Stanley.Yang @ 2022-05-25  6:10 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, tao.zhou1, evan.quan, lijo.lazar; +Cc: Stanley.Yang

Changed from V1:
	remove unnecessary same row physical address calculation

Changed from V2:
	move record_ce_addr_supported to umc_ecc_info struct

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  5 ++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c         | 50 ++++++++++++++++++-
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  1 +
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 28e603243b67..bf5a95104ec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -333,6 +333,11 @@ struct ecc_info_per_ch {
 
 struct umc_ecc_info {
 	struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM];
+
+	/* Whether the SMU ecctable supports
+	 * recording the correctable error address
+	 */
+	int record_ce_addr_supported;
 };
 
 struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 606892dbea1c..bf7524f16b66 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -119,6 +119,24 @@ static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device
 		*error_count += 1;
 
 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
+
+		if (ras->umc_ecc.record_ce_addr_supported)	{
+			uint64_t err_addr, soc_pa;
+			uint32_t channel_index =
+				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
+
+			err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
+			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+			/* translate umc channel address to soc pa, 3 parts are included */
+			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
+					ADDR_OF_256B_BLOCK(channel_index) |
+					OFFSET_IN_256B_BLOCK(err_addr);
+
+			/* The umc channel bits are not original values, they are hashed */
+			SET_CHANNEL_HASH(channel_index, soc_pa);
+
+			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
+		}
 	}
 }
 
@@ -251,7 +269,9 @@ static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev
 
 static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
 						   uint32_t umc_reg_offset,
-						   unsigned long *error_count)
+						   unsigned long *error_count,
+						   uint32_t ch_inst,
+						   uint32_t umc_inst)
 {
 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
 	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
@@ -295,6 +315,31 @@ static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
 		*error_count += 1;
 
 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
+
+		{
+			uint64_t err_addr, soc_pa;
+			uint32_t mc_umc_addrt0;
+			uint32_t channel_index;
+
+			mc_umc_addrt0 =
+				SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
+
+			channel_index =
+				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
+
+			err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
+			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+
+			/* translate umc channel address to soc pa, 3 parts are included */
+			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
+					ADDR_OF_256B_BLOCK(channel_index) |
+					OFFSET_IN_256B_BLOCK(err_addr);
+
+			/* The umc channel bits are not original values, they are hashed */
+			SET_CHANNEL_HASH(channel_index, soc_pa);
+
+			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
+		}
 	}
 }
 
@@ -395,7 +440,8 @@ static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
 							 ch_inst);
 		umc_v6_7_query_correctable_error_count(adev,
 						       umc_reg_offset,
-						       &(err_data->ce_count));
+						       &(err_data->ce_count),
+						       ch_inst, umc_inst);
 		umc_v6_7_querry_uncorrectable_error_count(adev,
 							  umc_reg_offset,
 							  &(err_data->ue_count));
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 9cdfeea58085..c7e0fec614ea 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1883,6 +1883,7 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 			ecc_info_per_channel->mca_ceumc_addr =
 				ecc_table->EccInfo_V2[i].mca_ceumc_addr;
 		}
+		eccinfo->record_ce_addr_supported =1;
 	}
 
 	return ret;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address
  2022-05-25  6:10 ` [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
@ 2022-05-25  6:52   ` Wang, Yang(Kevin)
  2022-05-25  7:15     ` 答复: " Yang, Stanley
  2022-05-25 12:38   ` Lazar, Lijo
  1 sibling, 1 reply; 7+ messages in thread
From: Wang, Yang(Kevin) @ 2022-05-25  6:52 UTC (permalink / raw)
  To: Yang, Stanley, amd-gfx, Zhang, Hawking, Zhou1, Tao, Quan, Evan,
	Lazar, Lijo

[-- Attachment #1: Type: text/plain, Size: 6831 bytes --]

[AMD Official Use Only - General]

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Stanley.Yang <Stanley.Yang@amd.com>
Sent: Wednesday, May 25, 2022 2:10 PM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>
Cc: Yang, Stanley <Stanley.Yang@amd.com>
Subject: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address

Changed from V1:
        remove unnecessary same row physical address calculation

Changed from V2:
        move record_ce_addr_supported to umc_ecc_info struct

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  5 ++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c         | 50 ++++++++++++++++++-
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  1 +
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 28e603243b67..bf5a95104ec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -333,6 +333,11 @@ struct ecc_info_per_ch {

 struct umc_ecc_info {
         struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM];
+
+       /* Determine smu ecctable whether support
+        * record correctable error address
+        */
+       int record_ce_addr_supported;
 };

[kevin]:

  1.  the new field record_ce_addr_supported is not set on the sienna_cichlid chip.

  2.  and this field would be better given a more general name, in case this ecc table (pmfw side) is updated again in the future, e.g. ecc_table_version

Best Regards
Kevin

 struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 606892dbea1c..bf7524f16b66 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -119,6 +119,24 @@ static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device
                 *error_count += 1;

                 umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
+
+               if (ras->umc_ecc.record_ce_addr_supported)      {
+                       uint64_t err_addr, soc_pa;
+                       uint32_t channel_index =
+                               adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
+
+                       err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
+                       err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+                       /* translate umc channel address to soc pa, 3 parts are included */
+                       soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
+                                       ADDR_OF_256B_BLOCK(channel_index) |
+                                       OFFSET_IN_256B_BLOCK(err_addr);
+
+                       /* The umc channel bits are not original values, they are hashed */
+                       SET_CHANNEL_HASH(channel_index, soc_pa);
+
+                       dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
+               }
         }
 }

@@ -251,7 +269,9 @@ static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev

 static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
                                                    uint32_t umc_reg_offset,
-                                                  unsigned long *error_count)
+                                                  unsigned long *error_count,
+                                                  uint32_t ch_inst,
+                                                  uint32_t umc_inst)
 {
         uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
         uint32_t ecc_err_cnt, ecc_err_cnt_addr;
@@ -295,6 +315,31 @@ static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
                 *error_count += 1;

                 umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
+
+               {
+                       uint64_t err_addr, soc_pa;
+                       uint32_t mc_umc_addrt0;
+                       uint32_t channel_index;
+
+                       mc_umc_addrt0 =
+                               SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
+
+                       channel_index =
+                               adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
+
+                       err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
+                       err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+
+                       /* translate umc channel address to soc pa, 3 parts are included */
+                       soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
+                                       ADDR_OF_256B_BLOCK(channel_index) |
+                                       OFFSET_IN_256B_BLOCK(err_addr);
+
+                       /* The umc channel bits are not original values, they are hashed */
+                       SET_CHANNEL_HASH(channel_index, soc_pa);
+
+                       dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
+               }
         }
 }

@@ -395,7 +440,8 @@ static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
                                                          ch_inst);
                 umc_v6_7_query_correctable_error_count(adev,
                                                        umc_reg_offset,
-                                                      &(err_data->ce_count));
+                                                      &(err_data->ce_count),
+                                                      ch_inst, umc_inst);
                 umc_v6_7_querry_uncorrectable_error_count(adev,
                                                           umc_reg_offset,
                                                           &(err_data->ue_count));
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 9cdfeea58085..c7e0fec614ea 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1883,6 +1883,7 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
                         ecc_info_per_channel->mca_ceumc_addr =
                                 ecc_table->EccInfo_V2[i].mca_ceumc_addr;
                 }
+               eccinfo->record_ce_addr_supported =1;
         }

         return ret;
--
2.17.1


[-- Attachment #2: Type: text/html, Size: 21729 bytes --]

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* 答复: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address
  2022-05-25  6:52   ` Wang, Yang(Kevin)
@ 2022-05-25  7:15     ` Yang, Stanley
  0 siblings, 0 replies; 7+ messages in thread
From: Yang, Stanley @ 2022-05-25  7:15 UTC (permalink / raw)
  To: Wang, Yang(Kevin),
	amd-gfx, Zhang, Hawking, Zhou1, Tao, Quan, Evan, Lazar, Lijo

[-- Attachment #1: Type: text/plain, Size: 7521 bytes --]

[AMD Official Use Only - General]


[AMD Official Use Only - General]


发件人: Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
日期: 星期三, 2022年5月25日 下午2:52
收件人: Yang, Stanley <Stanley.Yang@amd.com>, amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>, Zhang, Hawking <Hawking.Zhang@amd.com>, Zhou1, Tao <Tao.Zhou1@amd.com>, Quan, Evan <Evan.Quan@amd.com>, Lazar, Lijo <Lijo.Lazar@amd.com>
主题: Re: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address

[AMD Official Use Only - General]

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Stanley.Yang <Stanley.Yang@amd.com>
Sent: Wednesday, May 25, 2022 2:10 PM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>
Cc: Yang, Stanley <Stanley.Yang@amd.com>
Subject: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address

Changed from V1:
        remove unnecessary same row physical address calculation

Changed from V2:
        move record_ce_addr_supported to umc_ecc_info struct

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  5 ++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c         | 50 ++++++++++++++++++-
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  1 +
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 28e603243b67..bf5a95104ec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -333,6 +333,11 @@ struct ecc_info_per_ch {

 struct umc_ecc_info {
         struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM];
+
+       /* Determine smu ecctable whether support
+        * record correctable error address
+        */
+       int record_ce_addr_supported;
 };

[kevin]:

  1.  the new field of record_ce_addr_supported is not set on sienna_cichlid chip.
Stanley: sienna_cichlid does not support this feature, so record_ce_addr_supported is not set there.

  1.  and this field is better to renamed to others when this ecc table(pmfw side) update again in the furture. .e.g: ecc_table_version

Stanley: Naming it record_ce_addr_supported is more intuitive than using ecc_table_version or something else.

Best Regards
Kevin

 struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 606892dbea1c..bf7524f16b66 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -119,6 +119,24 @@ static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device
                 *error_count += 1;

                 umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
+
+               if (ras->umc_ecc.record_ce_addr_supported)      {
+                       uint64_t err_addr, soc_pa;
+                       uint32_t channel_index =
+                               adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
+
+                       err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
+                       err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+                       /* translate umc channel address to soc pa, 3 parts are included */
+                       soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
+                                       ADDR_OF_256B_BLOCK(channel_index) |
+                                       OFFSET_IN_256B_BLOCK(err_addr);
+
+                       /* The umc channel bits are not original values, they are hashed */
+                       SET_CHANNEL_HASH(channel_index, soc_pa);
+
+                       dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
+               }
         }
 }

@@ -251,7 +269,9 @@ static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev

 static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
                                                    uint32_t umc_reg_offset,
-                                                  unsigned long *error_count)
+                                                  unsigned long *error_count,
+                                                  uint32_t ch_inst,
+                                                  uint32_t umc_inst)
 {
         uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
         uint32_t ecc_err_cnt, ecc_err_cnt_addr;
@@ -295,6 +315,31 @@ static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
                 *error_count += 1;

                 umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
+
+               {
+                       uint64_t err_addr, soc_pa;
+                       uint32_t mc_umc_addrt0;
+                       uint32_t channel_index;
+
+                       mc_umc_addrt0 =
+                               SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
+
+                       channel_index =
+                               adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
+
+                       err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
+                       err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+
+                       /* translate umc channel address to soc pa, 3 parts are included */
+                       soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
+                                       ADDR_OF_256B_BLOCK(channel_index) |
+                                       OFFSET_IN_256B_BLOCK(err_addr);
+
+                       /* The umc channel bits are not original values, they are hashed */
+                       SET_CHANNEL_HASH(channel_index, soc_pa);
+
+                       dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
+               }
         }
 }

@@ -395,7 +440,8 @@ static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
                                                          ch_inst);
                 umc_v6_7_query_correctable_error_count(adev,
                                                        umc_reg_offset,
-                                                      &(err_data->ce_count));
+                                                      &(err_data->ce_count),
+                                                      ch_inst, umc_inst);
                 umc_v6_7_querry_uncorrectable_error_count(adev,
                                                           umc_reg_offset,
                                                           &(err_data->ue_count));
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 9cdfeea58085..c7e0fec614ea 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1883,6 +1883,7 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
                         ecc_info_per_channel->mca_ceumc_addr =
                                 ecc_table->EccInfo_V2[i].mca_ceumc_addr;
                 }
+               eccinfo->record_ce_addr_supported =1;
         }

         return ret;
--
2.17.1

[-- Attachment #2: Type: text/html, Size: 32736 bytes --]

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
  2022-05-25  6:10 [PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Stanley.Yang
  2022-05-25  6:10 ` [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
@ 2022-05-25 12:33 ` Lazar, Lijo
  1 sibling, 0 replies; 7+ messages in thread
From: Lazar, Lijo @ 2022-05-25 12:33 UTC (permalink / raw)
  To: Stanley.Yang, amd-gfx, hawking.zhang, tao.zhou1, evan.quan



On 5/25/2022 11:40 AM, Stanley.Yang wrote:
> SMU add a new variable mca_ceumc_addr to record
> umc correctable error address in EccInfo table,
> driver side add EccInfo_V2_t to support this feature
> 
> Changed from V1:
> 	remove ecc_table_v2 and unnecessary table id, define union struct include
> 	EccInfo_t and EccInfo_V2_t.
> 
> Changed from V2:
> 	sync patch verion
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>

Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>

Thanks,
Lijo

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  1 +
>   .../inc/pmfw_if/smu13_driver_if_aldebaran.h   | 16 +++++-
>   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 53 ++++++++++++++-----
>   3 files changed, 57 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index b9a6fac2b8b2..28e603243b67 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -328,6 +328,7 @@ struct ecc_info_per_ch {
>   	uint16_t ce_count_hi_chip;
>   	uint64_t mca_umc_status;
>   	uint64_t mca_umc_addr;
> +	uint64_t mca_ceumc_addr;
>   };
>   
>   struct umc_ecc_info {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> index 0f67c56c2863..6f92038470ec 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> @@ -519,7 +519,21 @@ typedef struct {
>   } EccInfo_t;
>   
>   typedef struct {
> -	EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> +	uint64_t mca_umc_status;
> +	uint64_t mca_umc_addr;
> +	uint64_t mca_ceumc_addr;
> +
> +	uint16_t ce_count_lo_chip;
> +	uint16_t ce_count_hi_chip;
> +
> +	uint32_t eccPadding;
> +} EccInfo_V2_t;
> +
> +typedef struct {
> +	union {
> +		EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> +		EccInfo_V2_t EccInfo_V2[ALDEBARAN_UMC_CHANNEL_NUM];
> +	};
>   } EccInfoTable_t;
>   
>   // These defines are used with the following messages:
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 38af648cb857..9cdfeea58085 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -82,6 +82,12 @@
>    */
>   #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
>   
> +/*
> + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
> + * use this to check mca_ceumc_addr record whether support
> + */
> +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
> +
>   /*
>    * SMU support BAD CHENNEL info MSG since version 68.51.00,
>    * use this to check ECCTALE feature whether support
> @@ -1802,7 +1808,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
>   	return sizeof(struct gpu_metrics_v1_3);
>   }
>   
> -static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> +static int aldebaran_check_ecc_table_support(struct smu_context *smu,
> +		int *ecctable_version)
>   {
>   	uint32_t if_version = 0xff, smu_version = 0xff;
>   	int ret = 0;
> @@ -1815,6 +1822,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
>   
>   	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
>   		ret = -EOPNOTSUPP;
> +	else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
> +			smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
> +		*ecctable_version = 1;
> +	else
> +		*ecctable_version = 2;
>   
>   	return ret;
>   }
> @@ -1826,9 +1838,10 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>   	EccInfoTable_t *ecc_table = NULL;
>   	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
>   	int i, ret = 0;
> +	int table_version = 0;
>   	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>   
> -	ret = aldebaran_check_ecc_table_support(smu);
> +	ret = aldebaran_check_ecc_table_support(smu, &table_version);
>   	if (ret)
>   		return ret;
>   
> @@ -1844,16 +1857,32 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>   
>   	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
>   
> -	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> -		ecc_info_per_channel = &(eccinfo->ecc[i]);
> -		ecc_info_per_channel->ce_count_lo_chip =
> -			ecc_table->EccInfo[i].ce_count_lo_chip;
> -		ecc_info_per_channel->ce_count_hi_chip =
> -			ecc_table->EccInfo[i].ce_count_hi_chip;
> -		ecc_info_per_channel->mca_umc_status =
> -			ecc_table->EccInfo[i].mca_umc_status;
> -		ecc_info_per_channel->mca_umc_addr =
> -			ecc_table->EccInfo[i].mca_umc_addr;
> +	if (table_version == 1) {
> +		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +			ecc_info_per_channel = &(eccinfo->ecc[i]);
> +			ecc_info_per_channel->ce_count_lo_chip =
> +				ecc_table->EccInfo[i].ce_count_lo_chip;
> +			ecc_info_per_channel->ce_count_hi_chip =
> +				ecc_table->EccInfo[i].ce_count_hi_chip;
> +			ecc_info_per_channel->mca_umc_status =
> +				ecc_table->EccInfo[i].mca_umc_status;
> +			ecc_info_per_channel->mca_umc_addr =
> +				ecc_table->EccInfo[i].mca_umc_addr;
> +		}
> +	} else if (table_version == 2) {
> +		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +			ecc_info_per_channel = &(eccinfo->ecc[i]);
> +			ecc_info_per_channel->ce_count_lo_chip =
> +				ecc_table->EccInfo_V2[i].ce_count_lo_chip;
> +			ecc_info_per_channel->ce_count_hi_chip =
> +				ecc_table->EccInfo_V2[i].ce_count_hi_chip;
> +			ecc_info_per_channel->mca_umc_status =
> +				ecc_table->EccInfo_V2[i].mca_umc_status;
> +			ecc_info_per_channel->mca_umc_addr =
> +				ecc_table->EccInfo_V2[i].mca_umc_addr;
> +			ecc_info_per_channel->mca_ceumc_addr =
> +				ecc_table->EccInfo_V2[i].mca_ceumc_addr;
> +		}
>   	}
>   
>   	return ret;
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address
  2022-05-25  6:10 ` [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
  2022-05-25  6:52   ` Wang, Yang(Kevin)
@ 2022-05-25 12:38   ` Lazar, Lijo
  2022-05-25 13:21     ` 答复: " Yang, Stanley
  1 sibling, 1 reply; 7+ messages in thread
From: Lazar, Lijo @ 2022-05-25 12:38 UTC (permalink / raw)
  To: Stanley.Yang, amd-gfx, hawking.zhang, tao.zhou1, evan.quan



On 5/25/2022 11:40 AM, Stanley.Yang wrote:
> Changed from V1:
> 	remove unnecessary same row physical address calculation
> 
> Changed from V2:
> 	move record_ce_addr_supported to umc_ecc_info struct
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  5 ++
>   drivers/gpu/drm/amd/amdgpu/umc_v6_7.c         | 50 ++++++++++++++++++-
>   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  1 +
>   3 files changed, 54 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 28e603243b67..bf5a95104ec1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -333,6 +333,11 @@ struct ecc_info_per_ch {
>   
>   struct umc_ecc_info {
>   	struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM];
> +
> +	/* Determine smu ecctable whether support
> +	 * record correctable error address
> +	 */
> +	int record_ce_addr_supported;
>   };
>   
>   struct amdgpu_ras {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index 606892dbea1c..bf7524f16b66 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -119,6 +119,24 @@ static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device
>   		*error_count += 1;
>   
>   		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
> +
> +		if (ras->umc_ecc.record_ce_addr_supported)	{
> +			uint64_t err_addr, soc_pa;
> +			uint32_t channel_index =
> +				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
> +
> +			err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
> +			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> +			/* translate umc channel address to soc pa, 3 parts are included */
> +			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> +					ADDR_OF_256B_BLOCK(channel_index) |
> +					OFFSET_IN_256B_BLOCK(err_addr);
> +
> +			/* The umc channel bits are not original values, they are hashed */
> +			SET_CHANNEL_HASH(channel_index, soc_pa);
> +

UMC address to PA conversion is common regardless of UE/CE error 
addresses. You may want to pack it in a small function.

Regardless,
	Acked-by: Lijo Lazar <lijo.lazar@amd.com>

Thanks,
Lijo

> +			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
> +		}
>   	}
>   }
>   
> @@ -251,7 +269,9 @@ static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev
>   
>   static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
>   						   uint32_t umc_reg_offset,
> -						   unsigned long *error_count)
> +						   unsigned long *error_count,
> +						   uint32_t ch_inst,
> +						   uint32_t umc_inst)
>   {
>   	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
>   	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
> @@ -295,6 +315,31 @@ static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
>   		*error_count += 1;
>   
>   		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
> +
> +		{
> +			uint64_t err_addr, soc_pa;
> +			uint32_t mc_umc_addrt0;
> +			uint32_t channel_index;
> +
> +			mc_umc_addrt0 =
> +				SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
> +
> +			channel_index =
> +				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
> +
> +			err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
> +			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> +
> +			/* translate umc channel address to soc pa, 3 parts are included */
> +			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> +					ADDR_OF_256B_BLOCK(channel_index) |
> +					OFFSET_IN_256B_BLOCK(err_addr);
> +
> +			/* The umc channel bits are not original values, they are hashed */
> +			SET_CHANNEL_HASH(channel_index, soc_pa);
> +
> +			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
> +		}
>   	}
>   }
>   
> @@ -395,7 +440,8 @@ static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
>   							 ch_inst);
>   		umc_v6_7_query_correctable_error_count(adev,
>   						       umc_reg_offset,
> -						       &(err_data->ce_count));
> +						       &(err_data->ce_count),
> +						       ch_inst, umc_inst);
>   		umc_v6_7_querry_uncorrectable_error_count(adev,
>   							  umc_reg_offset,
>   							  &(err_data->ue_count));
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 9cdfeea58085..c7e0fec614ea 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1883,6 +1883,7 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>   			ecc_info_per_channel->mca_ceumc_addr =
>   				ecc_table->EccInfo_V2[i].mca_ceumc_addr;
>   		}
> +		eccinfo->record_ce_addr_supported =1;
>   	}
>   
>   	return ret;
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* 答复: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address
  2022-05-25 12:38   ` Lazar, Lijo
@ 2022-05-25 13:21     ` Yang, Stanley
  0 siblings, 0 replies; 7+ messages in thread
From: Yang, Stanley @ 2022-05-25 13:21 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx, Zhang, Hawking, Zhou1, Tao, Quan, Evan

[-- Attachment #1: Type: text/plain, Size: 6975 bytes --]

[AMD Official Use Only - General]


[AMD Official Use Only - General]

发件人: Lazar, Lijo <Lijo.Lazar@amd.com>
日期: 星期三, 2022年5月25日 下午8:38
收件人: Yang, Stanley <Stanley.Yang@amd.com>, amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>, Zhang, Hawking <Hawking.Zhang@amd.com>, Zhou1, Tao <Tao.Zhou1@amd.com>, Quan, Evan <Evan.Quan@amd.com>
主题: Re: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address


On 5/25/2022 11:40 AM, Stanley.Yang wrote:
> Changed from V1:
>        remove unnecessary same row physical address calculation
>
> Changed from V2:
>        move record_ce_addr_supported to umc_ecc_info struct
>
> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  5 ++
>   drivers/gpu/drm/amd/amdgpu/umc_v6_7.c         | 50 ++++++++++++++++++-
>   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  1 +
>   3 files changed, 54 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 28e603243b67..bf5a95104ec1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -333,6 +333,11 @@ struct ecc_info_per_ch {
>
>   struct umc_ecc_info {
>        struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM];
> +
> +     /* Determine smu ecctable whether support
> +      * record correctable error address
> +      */
> +     int record_ce_addr_supported;
>   };
>
>   struct amdgpu_ras {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index 606892dbea1c..bf7524f16b66 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -119,6 +119,24 @@ static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device
>                *error_count += 1;
>
>                umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
> +
> +             if (ras->umc_ecc.record_ce_addr_supported)      {
> +                     uint64_t err_addr, soc_pa;
> +                     uint32_t channel_index =
> +                             adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
> +
> +                     err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
> +                     err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> +                     /* translate umc channel address to soc pa, 3 parts are included */
> +                     soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> +                                     ADDR_OF_256B_BLOCK(channel_index) |
> +                                     OFFSET_IN_256B_BLOCK(err_addr);
> +
> +                     /* The umc channel bits are not original values, they are hashed */
> +                     SET_CHANNEL_HASH(channel_index, soc_pa);
> +

UMC address to PA conversion is common regardless of UE/CE error
addresses. You may want to pack it in a small function.

Regardless,
        Acked-by: Lijo Lazar <lijo.lazar@amd.com>

Thanks,
Lijo
Stanley: These lines are indeed redundant. I'll make a patch to simplify them.

Regards,
Stanley


> +                     dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
> +             }
>        }
>   }
>
> @@ -251,7 +269,9 @@ static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev
>
>   static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
>                                                   uint32_t umc_reg_offset,
> -                                                unsigned long *error_count)
> +                                                unsigned long *error_count,
> +                                                uint32_t ch_inst,
> +                                                uint32_t umc_inst)
>   {
>        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
>        uint32_t ecc_err_cnt, ecc_err_cnt_addr;
> @@ -295,6 +315,31 @@ static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
>                *error_count += 1;
>
>                umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
> +
> +             {
> +                     uint64_t err_addr, soc_pa;
> +                     uint32_t mc_umc_addrt0;
> +                     uint32_t channel_index;
> +
> +                     mc_umc_addrt0 =
> +                             SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
> +
> +                     channel_index =
> +                             adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
> +
> +                     err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
> +                     err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> +
> +                     /* translate umc channel address to soc pa, 3 parts are included */
> +                     soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> +                                     ADDR_OF_256B_BLOCK(channel_index) |
> +                                     OFFSET_IN_256B_BLOCK(err_addr);
> +
> +                     /* The umc channel bits are not original values, they are hashed */
> +                     SET_CHANNEL_HASH(channel_index, soc_pa);
> +
> +                     dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
> +             }
>        }
>   }
>
> @@ -395,7 +440,8 @@ static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
>                                                         ch_inst);
>                umc_v6_7_query_correctable_error_count(adev,
>                                                       umc_reg_offset,
> -                                                    &(err_data->ce_count));
> +                                                    &(err_data->ce_count),
> +                                                    ch_inst, umc_inst);
>                umc_v6_7_querry_uncorrectable_error_count(adev,
>                                                          umc_reg_offset,
>                                                          &(err_data->ue_count));
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 9cdfeea58085..c7e0fec614ea 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1883,6 +1883,7 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>                        ecc_info_per_channel->mca_ceumc_addr =
>                                ecc_table->EccInfo_V2[i].mca_ceumc_addr;
>                }
> +             eccinfo->record_ce_addr_supported =1;
>        }
>
>        return ret;
>

[-- Attachment #2: Type: text/html, Size: 20106 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-05-25 13:21 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-25  6:10 [PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Stanley.Yang
2022-05-25  6:10 ` [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
2022-05-25  6:52   ` Wang, Yang(Kevin)
2022-05-25  7:15     ` 答复: " Yang, Stanley
2022-05-25 12:38   ` Lazar, Lijo
2022-05-25 13:21     ` 答复: " Yang, Stanley
2022-05-25 12:33 ` [PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Lazar, Lijo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.