All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
@ 2022-05-23  8:17 Stanley.Yang
  2022-05-23  8:17 ` [PATCH Review 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Stanley.Yang @ 2022-05-23  8:17 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, tao.zhou1, evan.quan, lijo.lazar; +Cc: Stanley.Yang

SMU add a new variable mca_ceumc_addr to record
umc correctable error address in EccInfo table,
driver side add ecctable_v2 to support this feature

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
 .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
 5 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b9a6fac2b8b2..28e603243b67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -328,6 +328,7 @@ struct ecc_info_per_ch {
 	uint16_t ce_count_hi_chip;
 	uint64_t mca_umc_status;
 	uint64_t mca_umc_addr;
+	uint64_t mca_ceumc_addr;
 };
 
 struct umc_ecc_info {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a6a7b6c33683..9f7257ada437 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -322,6 +322,7 @@ enum smu_table_id
 	SMU_TABLE_PACE,
 	SMU_TABLE_ECCINFO,
 	SMU_TABLE_COMBO_PPTABLE,
+	SMU_TABLE_ECCINFO_V2,
 	SMU_TABLE_COUNT,
 };
 
@@ -340,6 +341,7 @@ struct smu_table_context
 	void				*driver_pptable;
 	void				*combo_pptable;
 	void                            *ecc_table;
+	void                            *ecc_table_v2;	// adapt to smu support record mca_ceumc_addr
 	void				*driver_smu_config_table;
 	struct smu_table		tables[SMU_TABLE_COUNT];
 	/*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
index 0f67c56c2863..2868604eff49 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
@@ -522,6 +522,21 @@ typedef struct {
 	EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
 } EccInfoTable_t;
 
+typedef struct {
+	uint64_t mca_umc_status;
+	uint64_t mca_umc_addr;
+	uint64_t mca_ceumc_addr;
+
+	uint16_t ce_count_lo_chip;
+	uint16_t ce_count_hi_chip;
+
+	uint32_t eccPadding;
+} EccInfo_t_v2;
+
+typedef struct {
+	EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+} EccInfoTable_t_v2;
+
 // These defines are used with the following messages:
 // SMC_MSG_TransferTableDram2Smu
 // SMC_MSG_TransferTableSmu2Dram
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 38af648cb857..e58df9490cec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -82,6 +82,12 @@
  */
 #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
 
+/*
+ * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
+ * use this to check mca_ceumc_addr record whether support
+ */
+#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
+
 /*
  * SMU support BAD CHENNEL info MSG since version 68.51.00,
  * use this to check ECCTALE feature whether support
@@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
 	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
 		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
 
+	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2),
+			PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
 	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
 	if (!smu_table->metrics_table)
 		return -ENOMEM;
@@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
 	if (!smu_table->ecc_table)
 		return -ENOMEM;
 
+	smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
+	if (!smu_table->ecc_table_v2)
+		return -ENOMEM;;
+
 	return 0;
 }
 
@@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
 	return sizeof(struct gpu_metrics_v1_3);
 }
 
-static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+static int aldebaran_check_ecc_table_support(struct smu_context *smu,
+		int *ecctable_version)
 {
 	uint32_t if_version = 0xff, smu_version = 0xff;
 	int ret = 0;
@@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
 
 	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
 		ret = -EOPNOTSUPP;
+	else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
+			smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
+		*ecctable_version = 1;
+	else
+		*ecctable_version = 2;
 
 	return ret;
 }
@@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 {
 	struct smu_table_context *smu_table = &smu->smu_table;
 	EccInfoTable_t *ecc_table = NULL;
+	EccInfoTable_t_v2 *ecc_table_v2 = NULL;
 	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
 	int i, ret = 0;
+	int table_version = 0;
 	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
 
-	ret = aldebaran_check_ecc_table_support(smu);
+	ret = aldebaran_check_ecc_table_support(smu, &table_version);
 	if (ret)
 		return ret;
 
-	ret = smu_cmn_update_table(smu,
-			       SMU_TABLE_ECCINFO,
-			       0,
-			       smu_table->ecc_table,
-			       false);
-	if (ret) {
-		dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
-		return ret;
-	}
+	if (table_version == 1) {
+		ret = smu_cmn_update_table(smu,
+				       SMU_TABLE_ECCINFO,
+				       0,
+				       smu_table->ecc_table,
+				       false);
+		if (ret) {
+			dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
+			return ret;
+		}
+
+		ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+			ecc_info_per_channel = &(eccinfo->ecc[i]);
+			ecc_info_per_channel->ce_count_lo_chip =
+				ecc_table->EccInfo[i].ce_count_lo_chip;
+			ecc_info_per_channel->ce_count_hi_chip =
+				ecc_table->EccInfo[i].ce_count_hi_chip;
+			ecc_info_per_channel->mca_umc_status =
+				ecc_table->EccInfo[i].mca_umc_status;
+			ecc_info_per_channel->mca_umc_addr =
+				ecc_table->EccInfo[i].mca_umc_addr;
+		}
+	} else if (table_version == 2) {
+		/* still use SMU_TABLE_ECC_INFO index,
+		 * smu 68.55.0 add mca_ceumc_addr variable
+		 * in EccInfo_t struct to report correctable
+		 * error address and the table_id is not changed
+		 */
+		ret = smu_cmn_update_table(smu,
+				       SMU_TABLE_ECCINFO,
+				       0,
+				       smu_table->ecc_table_v2,
+					   false);
 
-	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
-
-	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
-		ecc_info_per_channel = &(eccinfo->ecc[i]);
-		ecc_info_per_channel->ce_count_lo_chip =
-			ecc_table->EccInfo[i].ce_count_lo_chip;
-		ecc_info_per_channel->ce_count_hi_chip =
-			ecc_table->EccInfo[i].ce_count_hi_chip;
-		ecc_info_per_channel->mca_umc_status =
-			ecc_table->EccInfo[i].mca_umc_status;
-		ecc_info_per_channel->mca_umc_addr =
-			ecc_table->EccInfo[i].mca_umc_addr;
+		if (ret) {
+			dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n");
+			return ret;
+		}
+
+		ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
+
+		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+			ecc_info_per_channel = &(eccinfo->ecc[i]);
+			ecc_info_per_channel->ce_count_lo_chip =
+				ecc_table_v2->EccInfo[i].ce_count_lo_chip;
+			ecc_info_per_channel->ce_count_hi_chip =
+				ecc_table_v2->EccInfo[i].ce_count_hi_chip;
+			ecc_info_per_channel->mca_umc_status =
+				ecc_table_v2->EccInfo[i].mca_umc_status;
+			ecc_info_per_channel->mca_umc_addr =
+				ecc_table_v2->EccInfo[i].mca_umc_addr;
+			ecc_info_per_channel->mca_ceumc_addr =
+				ecc_table_v2->EccInfo[i].mca_ceumc_addr;
+		}
 	}
 
 	return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index ae6321af9d88..af2d84a16f3e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
 	kfree(smu_table->hardcode_pptable);
 	smu_table->hardcode_pptable = NULL;
 
+	kfree(smu_table->ecc_table_v2);
 	kfree(smu_table->ecc_table);
 	kfree(smu_table->metrics_table);
 	kfree(smu_table->watermarks_table);
+	smu_table->ecc_table_v2 = NULL;
 	smu_table->ecc_table = NULL;
 	smu_table->metrics_table = NULL;
 	smu_table->watermarks_table = NULL;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH Review 2/2] drm/amdgpu: print umc correctable error address
  2022-05-23  8:17 [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Stanley.Yang
@ 2022-05-23  8:17 ` Stanley.Yang
  2022-05-23 10:22   ` Zhou1, Tao
  2022-05-23  8:48 ` [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Wang, Yang(Kevin)
  2022-05-23  9:12 ` Lazar, Lijo
  2 siblings, 1 reply; 9+ messages in thread
From: Stanley.Yang @ 2022-05-23  8:17 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, tao.zhou1, evan.quan, lijo.lazar; +Cc: Stanley.Yang

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  5 ++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c         | 55 ++++++++++++++++++-
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  2 +
 3 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 3f23f9ad3249..985b8cddb5a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1108,6 +1108,11 @@ struct amdgpu_device {
 
 	bool                            scpm_enabled;
 	uint32_t                        scpm_status;
+
+	/* Determine smu ecctable whether support
+	 * record correctable error address
+	 */
+	int record_ce_addr_supported;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 606892dbea1c..47bd39e52e9b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -119,6 +119,27 @@ static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device
 		*error_count += 1;
 
 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
+
+		if (adev->record_ce_addr_supported)	{
+			uint64_t err_addr, soc_pa;
+			uint32_t channel_index =
+				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
+
+			err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
+			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+			/* translate umc channel address to soc pa, 3 parts are included */
+			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
+					ADDR_OF_256B_BLOCK(channel_index) |
+					OFFSET_IN_256B_BLOCK(err_addr);
+
+			/* The umc channel bits are not original values, they are hashed */
+			SET_CHANNEL_HASH(channel_index, soc_pa);
+
+			/* clear [C4 C3 C2] in soc physical address */
+			soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
+
+			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
+		}
 	}
 }
 
@@ -251,7 +272,9 @@ static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev
 
 static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
 						   uint32_t umc_reg_offset,
-						   unsigned long *error_count)
+						   unsigned long *error_count,
+						   uint32_t ch_inst,
+						   uint32_t umc_inst)
 {
 	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
 	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
@@ -295,6 +318,33 @@ static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
 		*error_count += 1;
 
 		umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);
+
+		{
+			uint64_t err_addr, soc_pa;
+			uint32_t mc_umc_addrt0;
+			uint32_t channel_index;
+
+			mc_umc_addrt0 =
+				SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
+
+			channel_index =
+				adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
+
+			err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
+			err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+
+			/* translate umc channel address to soc pa, 3 parts are included */
+			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
+					ADDR_OF_256B_BLOCK(channel_index) |
+					OFFSET_IN_256B_BLOCK(err_addr);
+
+			/* The umc channel bits are not original values, they are hashed */
+			SET_CHANNEL_HASH(channel_index, soc_pa);
+
+			/* clear [C4 C3 C2] in soc physical address */
+			soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
+			dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
+		}
 	}
 }
 
@@ -395,7 +445,8 @@ static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
 							 ch_inst);
 		umc_v6_7_query_correctable_error_count(adev,
 						       umc_reg_offset,
-						       &(err_data->ce_count));
+						       &(err_data->ce_count),
+						       ch_inst, umc_inst);
 		umc_v6_7_querry_uncorrectable_error_count(adev,
 							  umc_reg_offset,
 							  &(err_data->ue_count));
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index e58df9490cec..e182088b4ac8 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1908,6 +1908,8 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 				ecc_table_v2->EccInfo[i].mca_umc_addr;
 			ecc_info_per_channel->mca_ceumc_addr =
 				ecc_table_v2->EccInfo[i].mca_ceumc_addr;
+			if (!smu->adev->record_ce_addr_supported)
+				smu->adev->record_ce_addr_supported =1;
 		}
 	}
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
  2022-05-23  8:17 [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Stanley.Yang
  2022-05-23  8:17 ` [PATCH Review 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
@ 2022-05-23  8:48 ` Wang, Yang(Kevin)
  2022-05-23 10:16   ` 回复: " Yang, Stanley
  2022-05-23  9:12 ` Lazar, Lijo
  2 siblings, 1 reply; 9+ messages in thread
From: Wang, Yang(Kevin) @ 2022-05-23  8:48 UTC (permalink / raw)
  To: Yang, Stanley, amd-gfx, Zhang, Hawking, Zhou1, Tao, Quan, Evan,
	Lazar, Lijo

[-- Attachment #1: Type: text/plain, Size: 11620 bytes --]

[AMD Official Use Only - General]


________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Stanley.Yang <Stanley.Yang@amd.com>
Sent: Monday, May 23, 2022 4:17 PM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>
Cc: Yang, Stanley <Stanley.Yang@amd.com>
Subject: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable

SMU add a new variable mca_ceumc_addr to record
umc correctable error address in EccInfo table,
driver side add ecctable_v2 to support this feature

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
 .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
 5 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b9a6fac2b8b2..28e603243b67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -328,6 +328,7 @@ struct ecc_info_per_ch {
         uint16_t ce_count_hi_chip;
         uint64_t mca_umc_status;
         uint64_t mca_umc_addr;
+       uint64_t mca_ceumc_addr;
 };

 struct umc_ecc_info {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a6a7b6c33683..9f7257ada437 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -322,6 +322,7 @@ enum smu_table_id
         SMU_TABLE_PACE,
         SMU_TABLE_ECCINFO,
         SMU_TABLE_COMBO_PPTABLE,
+       SMU_TABLE_ECCINFO_V2,
         SMU_TABLE_COUNT,
 };

@@ -340,6 +341,7 @@ struct smu_table_context
         void                            *driver_pptable;
         void                            *combo_pptable;
         void                            *ecc_table;
+       void                            *ecc_table_v2;  // adapt to smu support record mca_ceumc_addr
         void                            *driver_smu_config_table;
         struct smu_table                tables[SMU_TABLE_COUNT];
         /*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
index 0f67c56c2863..2868604eff49 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
@@ -522,6 +522,21 @@ typedef struct {
         EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
 } EccInfoTable_t;

+typedef struct {
+       uint64_t mca_umc_status;
+       uint64_t mca_umc_addr;
+       uint64_t mca_ceumc_addr;
+
+       uint16_t ce_count_lo_chip;
+       uint16_t ce_count_hi_chip;
+
+       uint32_t eccPadding;
+} EccInfo_t_v2;
+
+typedef struct {
+       EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+} EccInfoTable_t_v2;
+
 // These defines are used with the following messages:
 // SMC_MSG_TransferTableDram2Smu
 // SMC_MSG_TransferTableSmu2Dram
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 38af648cb857..e58df9490cec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -82,6 +82,12 @@
  */
 #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00

+/*
+ * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
+ * use this to check mca_ceumc_addr record whether support
+ */
+#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
+
 /*
  * SMU support BAD CHENNEL info MSG since version 68.51.00,
  * use this to check ECCTALE feature whether support
@@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
         SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);

+       SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2),
+                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
[kevin]:
this table mapping is not needed, the reason as below.

         smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
         if (!smu_table->metrics_table)
                 return -ENOMEM;
@@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
         if (!smu_table->ecc_table)
                 return -ENOMEM;

+       smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
+       if (!smu_table->ecc_table_v2)
+               return -ENOMEM;;
+

[kevin]:

add eccinfo table v2 is not needed for this case, this table is only used store table data from pmfw to driver,
you can create a large enough table which can save ecc table data directly.

e.g:
size = max(sizeof(EccInfoTable_t_v2), sizeof(EccInfoTable_t));
smu_table->ecc_table = kzalloc(size, GFP_KERNEL);

Best Regards,
Kevin

         return 0;
 }

@@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
         return sizeof(struct gpu_metrics_v1_3);
 }

-static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+static int aldebaran_check_ecc_table_support(struct smu_context *smu,
+               int *ecctable_version)
 {
         uint32_t if_version = 0xff, smu_version = 0xff;
         int ret = 0;
@@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)

         if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
                 ret = -EOPNOTSUPP;
+       else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
+                       smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
+               *ecctable_version = 1;
+       else
+               *ecctable_version = 2;

         return ret;
 }
@@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 {
         struct smu_table_context *smu_table = &smu->smu_table;
         EccInfoTable_t *ecc_table = NULL;
+       EccInfoTable_t_v2 *ecc_table_v2 = NULL;
         struct ecc_info_per_ch *ecc_info_per_channel = NULL;
         int i, ret = 0;
+       int table_version = 0;
         struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;

-       ret = aldebaran_check_ecc_table_support(smu);
+       ret = aldebaran_check_ecc_table_support(smu, &table_version);
         if (ret)
                 return ret;

-       ret = smu_cmn_update_table(smu,
-                              SMU_TABLE_ECCINFO,
-                              0,
-                              smu_table->ecc_table,
-                              false);
-       if (ret) {
-               dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
-               return ret;
-       }
+       if (table_version == 1) {
+               ret = smu_cmn_update_table(smu,
+                                      SMU_TABLE_ECCINFO,
+                                      0,
+                                      smu_table->ecc_table,
+                                      false);
+               if (ret) {
+                       dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
+                       return ret;
+               }
+
+               ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+               for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+                       ecc_info_per_channel = &(eccinfo->ecc[i]);
+                       ecc_info_per_channel->ce_count_lo_chip =
+                               ecc_table->EccInfo[i].ce_count_lo_chip;
+                       ecc_info_per_channel->ce_count_hi_chip =
+                               ecc_table->EccInfo[i].ce_count_hi_chip;
+                       ecc_info_per_channel->mca_umc_status =
+                               ecc_table->EccInfo[i].mca_umc_status;
+                       ecc_info_per_channel->mca_umc_addr =
+                               ecc_table->EccInfo[i].mca_umc_addr;
+               }
+       } else if (table_version == 2) {
+               /* still use SMU_TABLE_ECC_INFO index,
+                * smu 68.55.0 add mca_ceumc_addr variable
+                * in EccInfo_t struct to report correctable
+                * error address and the table_id is not changed
+                */
+               ret = smu_cmn_update_table(smu,
+                                      SMU_TABLE_ECCINFO,
+                                      0,
+                                      smu_table->ecc_table_v2,
+                                          false);

-       ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
-
-       for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
-               ecc_info_per_channel = &(eccinfo->ecc[i]);
-               ecc_info_per_channel->ce_count_lo_chip =
-                       ecc_table->EccInfo[i].ce_count_lo_chip;
-               ecc_info_per_channel->ce_count_hi_chip =
-                       ecc_table->EccInfo[i].ce_count_hi_chip;
-               ecc_info_per_channel->mca_umc_status =
-                       ecc_table->EccInfo[i].mca_umc_status;
-               ecc_info_per_channel->mca_umc_addr =
-                       ecc_table->EccInfo[i].mca_umc_addr;
+               if (ret) {
+                       dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n");
+                       return ret;
+               }
+
+               ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
+
+               for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+                       ecc_info_per_channel = &(eccinfo->ecc[i]);
+                       ecc_info_per_channel->ce_count_lo_chip =
+                               ecc_table_v2->EccInfo[i].ce_count_lo_chip;
+                       ecc_info_per_channel->ce_count_hi_chip =
+                               ecc_table_v2->EccInfo[i].ce_count_hi_chip;
+                       ecc_info_per_channel->mca_umc_status =
+                               ecc_table_v2->EccInfo[i].mca_umc_status;
+                       ecc_info_per_channel->mca_umc_addr =
+                               ecc_table_v2->EccInfo[i].mca_umc_addr;
+                       ecc_info_per_channel->mca_ceumc_addr =
+                               ecc_table_v2->EccInfo[i].mca_ceumc_addr;
+               }
         }

         return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index ae6321af9d88..af2d84a16f3e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
         kfree(smu_table->hardcode_pptable);
         smu_table->hardcode_pptable = NULL;

+       kfree(smu_table->ecc_table_v2);
         kfree(smu_table->ecc_table);
         kfree(smu_table->metrics_table);
         kfree(smu_table->watermarks_table);
+       smu_table->ecc_table_v2 = NULL;
         smu_table->ecc_table = NULL;
         smu_table->metrics_table = NULL;
         smu_table->watermarks_table = NULL;
--
2.17.1


[-- Attachment #2: Type: text/html, Size: 26849 bytes --]

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
  2022-05-23  8:17 [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Stanley.Yang
  2022-05-23  8:17 ` [PATCH Review 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
  2022-05-23  8:48 ` [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Wang, Yang(Kevin)
@ 2022-05-23  9:12 ` Lazar, Lijo
  2022-05-23 10:11   ` 回复: " Yang, Stanley
  2 siblings, 1 reply; 9+ messages in thread
From: Lazar, Lijo @ 2022-05-23  9:12 UTC (permalink / raw)
  To: Stanley.Yang, amd-gfx, hawking.zhang, tao.zhou1, evan.quan



On 5/23/2022 1:47 PM, Stanley.Yang wrote:
> SMU add a new variable mca_ceumc_addr to record
> umc correctable error address in EccInfo table,
> driver side add ecctable_v2 to support this feature
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
>   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
>   .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
>   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
>   5 files changed, 98 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index b9a6fac2b8b2..28e603243b67 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -328,6 +328,7 @@ struct ecc_info_per_ch {
>   	uint16_t ce_count_hi_chip;
>   	uint64_t mca_umc_status;
>   	uint64_t mca_umc_addr;
> +	uint64_t mca_ceumc_addr;
>   };
>   
>   struct umc_ecc_info {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index a6a7b6c33683..9f7257ada437 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -322,6 +322,7 @@ enum smu_table_id
>   	SMU_TABLE_PACE,
>   	SMU_TABLE_ECCINFO,
>   	SMU_TABLE_COMBO_PPTABLE,
> +	SMU_TABLE_ECCINFO_V2,

Hi Stanley,

This is not the right approach. Need to ask FW team to fix this. There 
shouldn't be any new id with each version of table. You may check Sienna 
Cichlid smu metrics table as an example and ask FW team to follow 
something similar. I don't see 68.55 being released, so it's not late 
anyway. We don't need to keep defining pointers in table context per 
version of ECC table.

Thanks,
Lijo

>   	SMU_TABLE_COUNT,
>   };
>   
> @@ -340,6 +341,7 @@ struct smu_table_context
>   	void				*driver_pptable;
>   	void				*combo_pptable;
>   	void                            *ecc_table;
> +	void                            *ecc_table_v2;	// adapt to smu support record mca_ceumc_addr
>   	void				*driver_smu_config_table;
>   	struct smu_table		tables[SMU_TABLE_COUNT];
>   	/*
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> index 0f67c56c2863..2868604eff49 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> @@ -522,6 +522,21 @@ typedef struct {
>   	EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
>   } EccInfoTable_t;
>   
> +typedef struct {
> +	uint64_t mca_umc_status;
> +	uint64_t mca_umc_addr;
> +	uint64_t mca_ceumc_addr;
> +
> +	uint16_t ce_count_lo_chip;
> +	uint16_t ce_count_hi_chip;
> +
> +	uint32_t eccPadding;
> +} EccInfo_t_v2;
> +
> +typedef struct {
> +	EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> +} EccInfoTable_t_v2;
> +
>   // These defines are used with the following messages:
>   // SMC_MSG_TransferTableDram2Smu
>   // SMC_MSG_TransferTableSmu2Dram
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 38af648cb857..e58df9490cec 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -82,6 +82,12 @@
>    */
>   #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
>   
> +/*
> + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
> + * use this to check mca_ceumc_addr record whether support
> + */
> +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
> +
>   /*
>    * SMU support BAD CHENNEL info MSG since version 68.51.00,
>    * use this to check ECCTALE feature whether support
> @@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
>   	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
>   		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>   
> +	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2),
> +			PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> +
>   	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
>   	if (!smu_table->metrics_table)
>   		return -ENOMEM;
> @@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
>   	if (!smu_table->ecc_table)
>   		return -ENOMEM;
>   
> +	smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
> +	if (!smu_table->ecc_table_v2)
> +		return -ENOMEM;;
> +
>   	return 0;
>   }
>   
> @@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
>   	return sizeof(struct gpu_metrics_v1_3);
>   }
>   
> -static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> +static int aldebaran_check_ecc_table_support(struct smu_context *smu,
> +		int *ecctable_version)
>   {
>   	uint32_t if_version = 0xff, smu_version = 0xff;
>   	int ret = 0;
> @@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
>   
>   	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
>   		ret = -EOPNOTSUPP;
> +	else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
> +			smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
> +		*ecctable_version = 1;
> +	else
> +		*ecctable_version = 2;
>   
>   	return ret;
>   }
> @@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>   {
>   	struct smu_table_context *smu_table = &smu->smu_table;
>   	EccInfoTable_t *ecc_table = NULL;
> +	EccInfoTable_t_v2 *ecc_table_v2 = NULL;
>   	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
>   	int i, ret = 0;
> +	int table_version = 0;
>   	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>   
> -	ret = aldebaran_check_ecc_table_support(smu);
> +	ret = aldebaran_check_ecc_table_support(smu, &table_version);
>   	if (ret)
>   		return ret;
>   
> -	ret = smu_cmn_update_table(smu,
> -			       SMU_TABLE_ECCINFO,
> -			       0,
> -			       smu_table->ecc_table,
> -			       false);
> -	if (ret) {
> -		dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> -		return ret;
> -	}
> +	if (table_version == 1) {
> +		ret = smu_cmn_update_table(smu,
> +				       SMU_TABLE_ECCINFO,
> +				       0,
> +				       smu_table->ecc_table,
> +				       false);
> +		if (ret) {
> +			dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> +			return ret;
> +		}
> +
> +		ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> +
> +		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +			ecc_info_per_channel = &(eccinfo->ecc[i]);
> +			ecc_info_per_channel->ce_count_lo_chip =
> +				ecc_table->EccInfo[i].ce_count_lo_chip;
> +			ecc_info_per_channel->ce_count_hi_chip =
> +				ecc_table->EccInfo[i].ce_count_hi_chip;
> +			ecc_info_per_channel->mca_umc_status =
> +				ecc_table->EccInfo[i].mca_umc_status;
> +			ecc_info_per_channel->mca_umc_addr =
> +				ecc_table->EccInfo[i].mca_umc_addr;
> +		}
> +	} else if (table_version == 2) {
> +		/* still use SMU_TABLE_ECC_INFO index,
> +		 * smu 68.55.0 add mca_ceumc_addr variable
> +		 * in EccInfo_t struct to report correctable
> +		 * error address and the table_id is not changed
> +		 */
> +		ret = smu_cmn_update_table(smu,
> +				       SMU_TABLE_ECCINFO,
> +				       0,
> +				       smu_table->ecc_table_v2,
> +					   false);
>   
> -	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> -
> -	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> -		ecc_info_per_channel = &(eccinfo->ecc[i]);
> -		ecc_info_per_channel->ce_count_lo_chip =
> -			ecc_table->EccInfo[i].ce_count_lo_chip;
> -		ecc_info_per_channel->ce_count_hi_chip =
> -			ecc_table->EccInfo[i].ce_count_hi_chip;
> -		ecc_info_per_channel->mca_umc_status =
> -			ecc_table->EccInfo[i].mca_umc_status;
> -		ecc_info_per_channel->mca_umc_addr =
> -			ecc_table->EccInfo[i].mca_umc_addr;
> +		if (ret) {
> +			dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n");
> +			return ret;
> +		}
> +
> +		ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
> +
> +		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +			ecc_info_per_channel = &(eccinfo->ecc[i]);
> +			ecc_info_per_channel->ce_count_lo_chip =
> +				ecc_table_v2->EccInfo[i].ce_count_lo_chip;
> +			ecc_info_per_channel->ce_count_hi_chip =
> +				ecc_table_v2->EccInfo[i].ce_count_hi_chip;
> +			ecc_info_per_channel->mca_umc_status =
> +				ecc_table_v2->EccInfo[i].mca_umc_status;
> +			ecc_info_per_channel->mca_umc_addr =
> +				ecc_table_v2->EccInfo[i].mca_umc_addr;
> +			ecc_info_per_channel->mca_ceumc_addr =
> +				ecc_table_v2->EccInfo[i].mca_ceumc_addr;
> +		}
>   	}
>   
>   	return ret;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index ae6321af9d88..af2d84a16f3e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
>   	kfree(smu_table->hardcode_pptable);
>   	smu_table->hardcode_pptable = NULL;
>   
> +	kfree(smu_table->ecc_table_v2);
>   	kfree(smu_table->ecc_table);
>   	kfree(smu_table->metrics_table);
>   	kfree(smu_table->watermarks_table);
> +	smu_table->ecc_table_v2 = NULL;
>   	smu_table->ecc_table = NULL;
>   	smu_table->metrics_table = NULL;
>   	smu_table->watermarks_table = NULL;
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* 回复: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
  2022-05-23  9:12 ` Lazar, Lijo
@ 2022-05-23 10:11   ` Yang, Stanley
  0 siblings, 0 replies; 9+ messages in thread
From: Yang, Stanley @ 2022-05-23 10:11 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx, Zhang, Hawking, Zhou1, Tao, Quan, Evan, Joo, Maria

[AMD Official Use Only - General]

Hi Lijo,
+@Joo, Maria

> -----邮件原件-----
> 发件人: Lazar, Lijo <Lijo.Lazar@amd.com>
> 发送时间: Monday, May 23, 2022 5:12 PM
> 收件人: Yang, Stanley <Stanley.Yang@amd.com>; amd-
> gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1,
> Tao <Tao.Zhou1@amd.com>; Quan, Evan <Evan.Quan@amd.com>
> 主题: Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in
> ecctable
>
>
>
> On 5/23/2022 1:47 PM, Stanley.Yang wrote:
> > SMU add a new variable mca_ceumc_addr to record umc correctable error
> > address in EccInfo table, driver side add ecctable_v2 to support this
> > feature
> >
> > Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
> >   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
> >   .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
> >   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
> >   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
> >   5 files changed, 98 insertions(+), 23 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index b9a6fac2b8b2..28e603243b67 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -328,6 +328,7 @@ struct ecc_info_per_ch {
> >     uint16_t ce_count_hi_chip;
> >     uint64_t mca_umc_status;
> >     uint64_t mca_umc_addr;
> > +   uint64_t mca_ceumc_addr;
> >   };
> >
> >   struct umc_ecc_info {
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > index a6a7b6c33683..9f7257ada437 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > @@ -322,6 +322,7 @@ enum smu_table_id
> >     SMU_TABLE_PACE,
> >     SMU_TABLE_ECCINFO,
> >     SMU_TABLE_COMBO_PPTABLE,
> > +   SMU_TABLE_ECCINFO_V2,
>
> Hi Stanley,
>
> This is not the right approach. Need to ask FW team to fix this. There shouldn't
> be any new id with each version of table. You may check Sienna Cichlid smu
> metrics table as an example and ask FW team to follow something similar. I
> don't see 68.55 being released, so it's not late anyway. We don't need to keep
> defining pointers in table context per version of ECC table.
>
> Thanks,
> Lijo
[Yang, Stanley] : There is not enough padding space in ecc_table, you can check below struct,
the new added variable is uint32_t type, I think smu can't add uint32_t type in ecc_table directly without change ecc_tabe size.
If you have any better approach, we can discuss a better method to complete it.
512 typedef struct {
513     uint64_t mca_umc_status;
514     uint64_t mca_umc_addr;
515     uint16_t ce_count_lo_chip;
516     uint16_t ce_count_hi_chip;
517
518     uint32_t eccPadding;
519 } EccInfo_t;

Thanks,
Stanley
>
> >     SMU_TABLE_COUNT,
> >   };
> >
> > @@ -340,6 +341,7 @@ struct smu_table_context
> >     void                            *driver_pptable;
> >     void                            *combo_pptable;
> >     void                            *ecc_table;
> > +   void                            *ecc_table_v2;  // adapt to smu support record
> mca_ceumc_addr
> >     void                            *driver_smu_config_table;
> >     struct smu_table                tables[SMU_TABLE_COUNT];
> >     /*
> > diff --git
> >
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> >
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> > index 0f67c56c2863..2868604eff49 100644
> > ---
> >
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> > +++
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebar
> > +++ an.h
> > @@ -522,6 +522,21 @@ typedef struct {
> >     EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> >   } EccInfoTable_t;
> >
> > +typedef struct {
> > +   uint64_t mca_umc_status;
> > +   uint64_t mca_umc_addr;
> > +   uint64_t mca_ceumc_addr;
> > +
> > +   uint16_t ce_count_lo_chip;
> > +   uint16_t ce_count_hi_chip;
> > +
> > +   uint32_t eccPadding;
> > +} EccInfo_t_v2;
> > +
> > +typedef struct {
> > +   EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> > +} EccInfoTable_t_v2;
> > +
> >   // These defines are used with the following messages:
> >   // SMC_MSG_TransferTableDram2Smu
> >   // SMC_MSG_TransferTableSmu2Dram
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > index 38af648cb857..e58df9490cec 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > @@ -82,6 +82,12 @@
> >    */
> >   #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> >
> > +/*
> > + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
> > + * use this to check mca_ceumc_addr record whether support  */
> > +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
> > +
> >   /*
> >    * SMU support BAD CHENNEL info MSG since version 68.51.00,
> >    * use this to check ECCTALE feature whether support @@ -239,6
> > +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
> >     SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
> >                    PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> >
> > +   SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2,
> sizeof(EccInfoTable_t_v2),
> > +                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> > +
> >     smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
> >     if (!smu_table->metrics_table)
> >             return -ENOMEM;
> > @@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context
> *smu)
> >     if (!smu_table->ecc_table)
> >             return -ENOMEM;
> >
> > +   smu_table->ecc_table_v2 =
> kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
> > +   if (!smu_table->ecc_table_v2)
> > +           return -ENOMEM;;
> > +
> >     return 0;
> >   }
> >
> > @@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct
> smu_context *smu,
> >     return sizeof(struct gpu_metrics_v1_3);
> >   }
> >
> > -static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> > +static int aldebaran_check_ecc_table_support(struct smu_context *smu,
> > +           int *ecctable_version)
> >   {
> >     uint32_t if_version = 0xff, smu_version = 0xff;
> >     int ret = 0;
> > @@ -1815,6 +1829,11 @@ static int
> > aldebaran_check_ecc_table_support(struct smu_context *smu)
> >
> >     if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
> >             ret = -EOPNOTSUPP;
> > +   else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
> > +                   smu_version <
> SUPPORT_ECCTABLE_V2_SMU_VERSION)
> > +           *ecctable_version = 1;
> > +   else
> > +           *ecctable_version = 2;
> >
> >     return ret;
> >   }
> > @@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct
> smu_context *smu,
> >   {
> >     struct smu_table_context *smu_table = &smu->smu_table;
> >     EccInfoTable_t *ecc_table = NULL;
> > +   EccInfoTable_t_v2 *ecc_table_v2 = NULL;
> >     struct ecc_info_per_ch *ecc_info_per_channel = NULL;
> >     int i, ret = 0;
> > +   int table_version = 0;
> >     struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
> >
> > -   ret = aldebaran_check_ecc_table_support(smu);
> > +   ret = aldebaran_check_ecc_table_support(smu, &table_version);
> >     if (ret)
> >             return ret;
> >
> > -   ret = smu_cmn_update_table(smu,
> > -                          SMU_TABLE_ECCINFO,
> > -                          0,
> > -                          smu_table->ecc_table,
> > -                          false);
> > -   if (ret) {
> > -           dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> > -           return ret;
> > -   }
> > +   if (table_version == 1) {
> > +           ret = smu_cmn_update_table(smu,
> > +                                  SMU_TABLE_ECCINFO,
> > +                                  0,
> > +                                  smu_table->ecc_table,
> > +                                  false);
> > +           if (ret) {
> > +                   dev_info(smu->adev->dev, "Failed to export SMU ecc
> table!\n");
> > +                   return ret;
> > +           }
> > +
> > +           ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> > +
> > +           for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > +                   ecc_info_per_channel = &(eccinfo->ecc[i]);
> > +                   ecc_info_per_channel->ce_count_lo_chip =
> > +                           ecc_table->EccInfo[i].ce_count_lo_chip;
> > +                   ecc_info_per_channel->ce_count_hi_chip =
> > +                           ecc_table->EccInfo[i].ce_count_hi_chip;
> > +                   ecc_info_per_channel->mca_umc_status =
> > +                           ecc_table->EccInfo[i].mca_umc_status;
> > +                   ecc_info_per_channel->mca_umc_addr =
> > +                           ecc_table->EccInfo[i].mca_umc_addr;
> > +           }
> > +   } else if (table_version == 2) {
> > +           /* still use SMU_TABLE_ECC_INFO index,
> > +            * smu 68.55.0 add mca_ceumc_addr variable
> > +            * in EccInfo_t struct to report correctable
> > +            * error address and the table_id is not changed
> > +            */
> > +           ret = smu_cmn_update_table(smu,
> > +                                  SMU_TABLE_ECCINFO,
> > +                                  0,
> > +                                  smu_table->ecc_table_v2,
> > +                                      false);
> >
> > -   ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> > -
> > -   for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > -           ecc_info_per_channel = &(eccinfo->ecc[i]);
> > -           ecc_info_per_channel->ce_count_lo_chip =
> > -                   ecc_table->EccInfo[i].ce_count_lo_chip;
> > -           ecc_info_per_channel->ce_count_hi_chip =
> > -                   ecc_table->EccInfo[i].ce_count_hi_chip;
> > -           ecc_info_per_channel->mca_umc_status =
> > -                   ecc_table->EccInfo[i].mca_umc_status;
> > -           ecc_info_per_channel->mca_umc_addr =
> > -                   ecc_table->EccInfo[i].mca_umc_addr;
> > +           if (ret) {
> > +                   dev_info(smu->adev->dev, "Failed to export SMU ecc
> table_v2!\n");
> > +                   return ret;
> > +           }
> > +
> > +           ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
> > +
> > +           for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > +                   ecc_info_per_channel = &(eccinfo->ecc[i]);
> > +                   ecc_info_per_channel->ce_count_lo_chip =
> > +                           ecc_table_v2->EccInfo[i].ce_count_lo_chip;
> > +                   ecc_info_per_channel->ce_count_hi_chip =
> > +                           ecc_table_v2->EccInfo[i].ce_count_hi_chip;
> > +                   ecc_info_per_channel->mca_umc_status =
> > +                           ecc_table_v2->EccInfo[i].mca_umc_status;
> > +                   ecc_info_per_channel->mca_umc_addr =
> > +                           ecc_table_v2->EccInfo[i].mca_umc_addr;
> > +                   ecc_info_per_channel->mca_ceumc_addr =
> > +                           ecc_table_v2->EccInfo[i].mca_ceumc_addr;
> > +           }
> >     }
> >
> >     return ret;
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > index ae6321af9d88..af2d84a16f3e 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > @@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context
> *smu)
> >     kfree(smu_table->hardcode_pptable);
> >     smu_table->hardcode_pptable = NULL;
> >
> > +   kfree(smu_table->ecc_table_v2);
> >     kfree(smu_table->ecc_table);
> >     kfree(smu_table->metrics_table);
> >     kfree(smu_table->watermarks_table);
> > +   smu_table->ecc_table_v2 = NULL;
> >     smu_table->ecc_table = NULL;
> >     smu_table->metrics_table = NULL;
> >     smu_table->watermarks_table = NULL;
> >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* 回复: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
  2022-05-23  8:48 ` [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Wang, Yang(Kevin)
@ 2022-05-23 10:16   ` Yang, Stanley
  2022-05-23 10:37     ` 答复: " Yang, Stanley
  0 siblings, 1 reply; 9+ messages in thread
From: Yang, Stanley @ 2022-05-23 10:16 UTC (permalink / raw)
  To: Wang, Yang(Kevin),
	amd-gfx, Zhang, Hawking, Zhou1, Tao, Quan, Evan, Lazar, Lijo

[-- Attachment #1: Type: text/plain, Size: 12536 bytes --]

[AMD Official Use Only - General]

Hi Kevin,

发件人: Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
发送时间: Monday, May 23, 2022 4:49 PM
收件人: Yang, Stanley <Stanley.Yang@amd.com>; amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>
主题: Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable


[AMD Official Use Only - General]


________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org<mailto:amd-gfx-bounces@lists.freedesktop.org>> on behalf of Stanley.Yang <Stanley.Yang@amd.com<mailto:Stanley.Yang@amd.com>>
Sent: Monday, May 23, 2022 4:17 PM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>; Zhang, Hawking <Hawking.Zhang@amd.com<mailto:Hawking.Zhang@amd.com>>; Zhou1, Tao <Tao.Zhou1@amd.com<mailto:Tao.Zhou1@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; Lazar, Lijo <Lijo.Lazar@amd.com<mailto:Lijo.Lazar@amd.com>>
Cc: Yang, Stanley <Stanley.Yang@amd.com<mailto:Stanley.Yang@amd.com>>
Subject: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable

SMU add a new variable mca_ceumc_addr to record
umc correctable error address in EccInfo table,
driver side add ecctable_v2 to support this feature

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com<mailto:Stanley.Yang@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
 .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
 5 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b9a6fac2b8b2..28e603243b67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -328,6 +328,7 @@ struct ecc_info_per_ch {
         uint16_t ce_count_hi_chip;
         uint64_t mca_umc_status;
         uint64_t mca_umc_addr;
+       uint64_t mca_ceumc_addr;
 };

 struct umc_ecc_info {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a6a7b6c33683..9f7257ada437 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -322,6 +322,7 @@ enum smu_table_id
         SMU_TABLE_PACE,
         SMU_TABLE_ECCINFO,
         SMU_TABLE_COMBO_PPTABLE,
+       SMU_TABLE_ECCINFO_V2,
         SMU_TABLE_COUNT,
 };

@@ -340,6 +341,7 @@ struct smu_table_context
         void                            *driver_pptable;
         void                            *combo_pptable;
         void                            *ecc_table;
+       void                            *ecc_table_v2;  // adapt to smu support record mca_ceumc_addr
         void                            *driver_smu_config_table;
         struct smu_table                tables[SMU_TABLE_COUNT];
         /*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
index 0f67c56c2863..2868604eff49 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
@@ -522,6 +522,21 @@ typedef struct {
         EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
 } EccInfoTable_t;

+typedef struct {
+       uint64_t mca_umc_status;
+       uint64_t mca_umc_addr;
+       uint64_t mca_ceumc_addr;
+
+       uint16_t ce_count_lo_chip;
+       uint16_t ce_count_hi_chip;
+
+       uint32_t eccPadding;
+} EccInfo_t_v2;
+
+typedef struct {
+       EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+} EccInfoTable_t_v2;
+
 // These defines are used with the following messages:
 // SMC_MSG_TransferTableDram2Smu
 // SMC_MSG_TransferTableSmu2Dram
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 38af648cb857..e58df9490cec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -82,6 +82,12 @@
  */
 #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00

+/*
+ * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
+ * use this to check mca_ceumc_addr record whether support
+ */
+#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
+
 /*
  * SMU support BAD CHENNEL info MSG since version 68.51.00,
  * use this to check ECCTALE feature whether support
@@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
         SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);

+       SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2),
+                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
[kevin]:
this table mapping is not needed, the reason as below.

         smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
         if (!smu_table->metrics_table)
                 return -ENOMEM;
@@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
         if (!smu_table->ecc_table)
                 return -ENOMEM;

+       smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
+       if (!smu_table->ecc_table_v2)
+               return -ENOMEM;;
+

[kevin]:

add eccinfo table v2 is not needed for this case, this table is only used store table data from pmfw to driver,
you can create a large enough table which can save ecc table data directly.

e.g:
size = max(sizeof(EccInfoTable_t_v2), sizeof(EccInfoTable_t));
smu_table->ecc_table = kzalloc(size, GFP_KERNEL);

Best Regards,
Kevin
[Yang, Stanley] :  this method is not forward compatible, or driver need complex convert to get the correct value, if new driver use an old pmfw.

         return 0;
 }

@@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
         return sizeof(struct gpu_metrics_v1_3);
 }

-static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+static int aldebaran_check_ecc_table_support(struct smu_context *smu,
+               int *ecctable_version)
 {
         uint32_t if_version = 0xff, smu_version = 0xff;
         int ret = 0;
@@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)

         if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
                 ret = -EOPNOTSUPP;
+       else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
+                       smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
+               *ecctable_version = 1;
+       else
+               *ecctable_version = 2;

         return ret;
 }
@@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 {
         struct smu_table_context *smu_table = &smu->smu_table;
         EccInfoTable_t *ecc_table = NULL;
+       EccInfoTable_t_v2 *ecc_table_v2 = NULL;
         struct ecc_info_per_ch *ecc_info_per_channel = NULL;
         int i, ret = 0;
+       int table_version = 0;
         struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;

-       ret = aldebaran_check_ecc_table_support(smu);
+       ret = aldebaran_check_ecc_table_support(smu, &table_version);
         if (ret)
                 return ret;

-       ret = smu_cmn_update_table(smu,
-                              SMU_TABLE_ECCINFO,
-                              0,
-                              smu_table->ecc_table,
-                              false);
-       if (ret) {
-               dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
-               return ret;
-       }
+       if (table_version == 1) {
+               ret = smu_cmn_update_table(smu,
+                                      SMU_TABLE_ECCINFO,
+                                      0,
+                                      smu_table->ecc_table,
+                                      false);
+               if (ret) {
+                       dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
+                       return ret;
+               }
+
+               ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+               for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+                       ecc_info_per_channel = &(eccinfo->ecc[i]);
+                       ecc_info_per_channel->ce_count_lo_chip =
+                               ecc_table->EccInfo[i].ce_count_lo_chip;
+                       ecc_info_per_channel->ce_count_hi_chip =
+                               ecc_table->EccInfo[i].ce_count_hi_chip;
+                       ecc_info_per_channel->mca_umc_status =
+                               ecc_table->EccInfo[i].mca_umc_status;
+                       ecc_info_per_channel->mca_umc_addr =
+                               ecc_table->EccInfo[i].mca_umc_addr;
+               }
+       } else if (table_version == 2) {
+               /* still use SMU_TABLE_ECC_INFO index,
+                * smu 68.55.0 add mca_ceumc_addr variable
+                * in EccInfo_t struct to report correctable
+                * error address and the table_id is not changed
+                */
+               ret = smu_cmn_update_table(smu,
+                                      SMU_TABLE_ECCINFO,
+                                      0,
+                                      smu_table->ecc_table_v2,
+                                          false);

-       ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
-
-       for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
-               ecc_info_per_channel = &(eccinfo->ecc[i]);
-               ecc_info_per_channel->ce_count_lo_chip =
-                       ecc_table->EccInfo[i].ce_count_lo_chip;
-               ecc_info_per_channel->ce_count_hi_chip =
-                       ecc_table->EccInfo[i].ce_count_hi_chip;
-               ecc_info_per_channel->mca_umc_status =
-                       ecc_table->EccInfo[i].mca_umc_status;
-               ecc_info_per_channel->mca_umc_addr =
-                       ecc_table->EccInfo[i].mca_umc_addr;
+               if (ret) {
+                       dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n");
+                       return ret;
+               }
+
+               ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
+
+               for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+                       ecc_info_per_channel = &(eccinfo->ecc[i]);
+                       ecc_info_per_channel->ce_count_lo_chip =
+                               ecc_table_v2->EccInfo[i].ce_count_lo_chip;
+                       ecc_info_per_channel->ce_count_hi_chip =
+                               ecc_table_v2->EccInfo[i].ce_count_hi_chip;
+                       ecc_info_per_channel->mca_umc_status =
+                               ecc_table_v2->EccInfo[i].mca_umc_status;
+                       ecc_info_per_channel->mca_umc_addr =
+                               ecc_table_v2->EccInfo[i].mca_umc_addr;
+                       ecc_info_per_channel->mca_ceumc_addr =
+                               ecc_table_v2->EccInfo[i].mca_ceumc_addr;
+               }
         }

         return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index ae6321af9d88..af2d84a16f3e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
         kfree(smu_table->hardcode_pptable);
         smu_table->hardcode_pptable = NULL;

+       kfree(smu_table->ecc_table_v2);
         kfree(smu_table->ecc_table);
         kfree(smu_table->metrics_table);
         kfree(smu_table->watermarks_table);
+       smu_table->ecc_table_v2 = NULL;
         smu_table->ecc_table = NULL;
         smu_table->metrics_table = NULL;
         smu_table->watermarks_table = NULL;
--
2.17.1

[-- Attachment #2: Type: text/html, Size: 30206 bytes --]

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* RE: [PATCH Review 2/2] drm/amdgpu: print umc correctable error address
  2022-05-23  8:17 ` [PATCH Review 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
@ 2022-05-23 10:22   ` Zhou1, Tao
  2022-05-23 10:26     ` 回复: " Yang, Stanley
  0 siblings, 1 reply; 9+ messages in thread
From: Zhou1, Tao @ 2022-05-23 10:22 UTC (permalink / raw)
  To: Yang, Stanley, amd-gfx, Zhang, Hawking, Quan, Evan, Lazar, Lijo
  Cc: Yang, Stanley

[AMD Official Use Only - General]



> -----Original Message-----
> From: Stanley.Yang <Stanley.Yang@amd.com>
> Sent: Monday, May 23, 2022 4:17 PM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Quan,
> Evan <Evan.Quan@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>
> Cc: Yang, Stanley <Stanley.Yang@amd.com>
> Subject: [PATCH Review 2/2] drm/amdgpu: print umc correctable error
> address
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  5 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c         | 55
> ++++++++++++++++++-
>  .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  2 +
>  3 files changed, 60 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 3f23f9ad3249..985b8cddb5a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1108,6 +1108,11 @@ struct amdgpu_device {
> 
>  	bool                            scpm_enabled;
>  	uint32_t                        scpm_status;
> +
> +	/* Determine smu ecctable whether support
> +	 * record correctable error address
> +	 */
> +	int record_ce_addr_supported;
>  };
> 
>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index 606892dbea1c..47bd39e52e9b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -119,6 +119,27 @@ static void
> umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device
>  		*error_count += 1;
> 
>  		umc_v6_7_query_error_status_helper(adev,
> mc_umc_status, umc_reg_offset);
> +
> +		if (adev->record_ce_addr_supported)	{
> +			uint64_t err_addr, soc_pa;
> +			uint32_t channel_index =
> +				adev->umc.channel_idx_tbl[umc_inst *
> adev->umc.channel_inst_num +
> +ch_inst];
> +
> +			err_addr = ras-
> >umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
> +			err_addr = REG_GET_FIELD(err_addr,
> MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> +			/* translate umc channel address to soc pa, 3 parts
> are included */
> +			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> +
> 	ADDR_OF_256B_BLOCK(channel_index) |
> +					OFFSET_IN_256B_BLOCK(err_addr);
> +
> +			/* The umc channel bits are not original values, they
> are hashed */
> +			SET_CHANNEL_HASH(channel_index, soc_pa);
> +
> +			/* clear [C4 C3 C2] in soc physical address */
> +			soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

[Tao] this clear is the preparation for looping all column bits in same row, you only need physical address of one page, the code can be removed.

> +
> +			dev_info(adev->dev, "Error Address(PA): 0x%llx\n",
> soc_pa);
> +		}
>  	}
>  }
> 
> @@ -251,7 +272,9 @@ static void
> umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev
> 
>  static void umc_v6_7_query_correctable_error_count(struct
> amdgpu_device *adev,
>  						   uint32_t umc_reg_offset,
> -						   unsigned long
> *error_count)
> +						   unsigned long
> *error_count,
> +						   uint32_t ch_inst,
> +						   uint32_t umc_inst)
>  {
>  	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
>  	uint32_t ecc_err_cnt, ecc_err_cnt_addr; @@ -295,6 +318,33 @@
> static void umc_v6_7_query_correctable_error_count(struct
> amdgpu_device *adev,
>  		*error_count += 1;
> 
>  		umc_v6_7_query_error_status_helper(adev,
> mc_umc_status, umc_reg_offset);
> +
> +		{
> +			uint64_t err_addr, soc_pa;
> +			uint32_t mc_umc_addrt0;
> +			uint32_t channel_index;
> +
> +			mc_umc_addrt0 =
> +				SOC15_REG_OFFSET(UMC, 0,
> regMCA_UMC_UMC0_MCUMC_ADDRT0);
> +
> +			channel_index =
> +				adev->umc.channel_idx_tbl[umc_inst *
> adev->umc.channel_inst_num +
> +ch_inst];
> +
> +			err_addr = RREG64_PCIE((mc_umc_addrt0 +
> umc_reg_offset) * 4);
> +			err_addr = REG_GET_FIELD(err_addr,
> MCA_UMC_UMC0_MCUMC_ADDRT0,
> +ErrorAddr);
> +
> +			/* translate umc channel address to soc pa, 3 parts
> are included */
> +			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> +
> 	ADDR_OF_256B_BLOCK(channel_index) |
> +					OFFSET_IN_256B_BLOCK(err_addr);
> +
> +			/* The umc channel bits are not original values, they
> are hashed */
> +			SET_CHANNEL_HASH(channel_index, soc_pa);
> +
> +			/* clear [C4 C3 C2] in soc physical address */
> +			soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
> +			dev_info(adev->dev, "Error Address(PA): 0x%llx\n",
> soc_pa);
> +		}
>  	}
>  }
> 
> @@ -395,7 +445,8 @@ static void umc_v6_7_query_ras_error_count(struct
> amdgpu_device *adev,
>  							 ch_inst);
>  		umc_v6_7_query_correctable_error_count(adev,
>  						       umc_reg_offset,
> -						       &(err_data->ce_count));
> +						       &(err_data->ce_count),
> +						       ch_inst, umc_inst);
>  		umc_v6_7_querry_uncorrectable_error_count(adev,
>  							  umc_reg_offset,
>  							  &(err_data-
> >ue_count));
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index e58df9490cec..e182088b4ac8 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1908,6 +1908,8 @@ static ssize_t aldebaran_get_ecc_info(struct
> smu_context *smu,
>  				ecc_table_v2->EccInfo[i].mca_umc_addr;
>  			ecc_info_per_channel->mca_ceumc_addr =
>  				ecc_table_v2->EccInfo[i].mca_ceumc_addr;
> +			if (!smu->adev->record_ce_addr_supported)

[Tao] it seems the check is unnecessary.

> +				smu->adev->record_ce_addr_supported =1;
>  		}
>  	}
> 
> --
> 2.17.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* 回复: [PATCH Review 2/2] drm/amdgpu: print umc correctable error address
  2022-05-23 10:22   ` Zhou1, Tao
@ 2022-05-23 10:26     ` Yang, Stanley
  0 siblings, 0 replies; 9+ messages in thread
From: Yang, Stanley @ 2022-05-23 10:26 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Zhang, Hawking, Quan, Evan, Lazar, Lijo

Thanks tao, will update.

> -----邮件原件-----
> 发件人: Zhou1, Tao <Tao.Zhou1@amd.com>
> 发送时间: Monday, May 23, 2022 6:22 PM
> 收件人: Yang, Stanley <Stanley.Yang@amd.com>; amd-
> gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan,
> Evan <Evan.Quan@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>
> 抄送: Yang, Stanley <Stanley.Yang@amd.com>
> 主题: RE: [PATCH Review 2/2] drm/amdgpu: print umc correctable error address
> 
> [AMD Official Use Only - General]
> 
> 
> 
> > -----Original Message-----
> > From: Stanley.Yang <Stanley.Yang@amd.com>
> > Sent: Monday, May 23, 2022 4:17 PM
> > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> > <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Quan,
> Evan
> > <Evan.Quan@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>
> > Cc: Yang, Stanley <Stanley.Yang@amd.com>
> > Subject: [PATCH Review 2/2] drm/amdgpu: print umc correctable error
> > address
> >
> > Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  5 ++
> >  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c         | 55
> > ++++++++++++++++++-
> >  .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  2 +
> >  3 files changed, 60 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 3f23f9ad3249..985b8cddb5a1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1108,6 +1108,11 @@ struct amdgpu_device {
> >
> >  	bool                            scpm_enabled;
> >  	uint32_t                        scpm_status;
> > +
> > +	/* Determine smu ecctable whether support
> > +	 * record correctable error address
> > +	 */
> > +	int record_ce_addr_supported;
> >  };
> >
> >  static inline struct amdgpu_device *drm_to_adev(struct drm_device
> > *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> > b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> > index 606892dbea1c..47bd39e52e9b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> > @@ -119,6 +119,27 @@ static void
> > umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device
> >  		*error_count += 1;
> >
> >  		umc_v6_7_query_error_status_helper(adev,
> > mc_umc_status, umc_reg_offset);
> > +
> > +		if (adev->record_ce_addr_supported)	{
> > +			uint64_t err_addr, soc_pa;
> > +			uint32_t channel_index =
> > +				adev->umc.channel_idx_tbl[umc_inst *
> > adev->umc.channel_inst_num +
> > +ch_inst];
> > +
> > +			err_addr = ras-
> > >umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
> > +			err_addr = REG_GET_FIELD(err_addr,
> > MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> > +			/* translate umc channel address to soc pa, 3 parts
> > are included */
> > +			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> > +
> > 	ADDR_OF_256B_BLOCK(channel_index) |
> > +					OFFSET_IN_256B_BLOCK(err_addr);
> > +
> > +			/* The umc channel bits are not original values, they
> > are hashed */
> > +			SET_CHANNEL_HASH(channel_index, soc_pa);
> > +
> > +			/* clear [C4 C3 C2] in soc physical address */
> > +			soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
> 
> [Tao] this clear is the preparation for looping all column bits in same row, you
> only need physical address of one page, the code can be removed.
> 
> > +
> > +			dev_info(adev->dev, "Error Address(PA): 0x%llx\n",
> > soc_pa);
> > +		}
> >  	}
> >  }
> >
> > @@ -251,7 +272,9 @@ static void
> > umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev
> >
> >  static void umc_v6_7_query_correctable_error_count(struct
> > amdgpu_device *adev,
> >  						   uint32_t umc_reg_offset,
> > -						   unsigned long
> > *error_count)
> > +						   unsigned long
> > *error_count,
> > +						   uint32_t ch_inst,
> > +						   uint32_t umc_inst)
> >  {
> >  	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
> >  	uint32_t ecc_err_cnt, ecc_err_cnt_addr; @@ -295,6 +318,33 @@ static
> > void umc_v6_7_query_correctable_error_count(struct
> > amdgpu_device *adev,
> >  		*error_count += 1;
> >
> >  		umc_v6_7_query_error_status_helper(adev,
> > mc_umc_status, umc_reg_offset);
> > +
> > +		{
> > +			uint64_t err_addr, soc_pa;
> > +			uint32_t mc_umc_addrt0;
> > +			uint32_t channel_index;
> > +
> > +			mc_umc_addrt0 =
> > +				SOC15_REG_OFFSET(UMC, 0,
> > regMCA_UMC_UMC0_MCUMC_ADDRT0);
> > +
> > +			channel_index =
> > +				adev->umc.channel_idx_tbl[umc_inst *
> > adev->umc.channel_inst_num +
> > +ch_inst];
> > +
> > +			err_addr = RREG64_PCIE((mc_umc_addrt0 +
> > umc_reg_offset) * 4);
> > +			err_addr = REG_GET_FIELD(err_addr,
> > MCA_UMC_UMC0_MCUMC_ADDRT0,
> > +ErrorAddr);
> > +
> > +			/* translate umc channel address to soc pa, 3 parts
> > are included */
> > +			soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> > +
> > 	ADDR_OF_256B_BLOCK(channel_index) |
> > +					OFFSET_IN_256B_BLOCK(err_addr);
> > +
> > +			/* The umc channel bits are not original values, they
> > are hashed */
> > +			SET_CHANNEL_HASH(channel_index, soc_pa);
> > +
> > +			/* clear [C4 C3 C2] in soc physical address */
> > +			soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
> > +			dev_info(adev->dev, "Error Address(PA): 0x%llx\n",
> > soc_pa);
> > +		}
> >  	}
> >  }
> >
> > @@ -395,7 +445,8 @@ static void umc_v6_7_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  							 ch_inst);
> >  		umc_v6_7_query_correctable_error_count(adev,
> >  						       umc_reg_offset,
> > -						       &(err_data->ce_count));
> > +						       &(err_data->ce_count),
> > +						       ch_inst, umc_inst);
> >  		umc_v6_7_querry_uncorrectable_error_count(adev,
> >  							  umc_reg_offset,
> >  							  &(err_data-
> > >ue_count));
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > index e58df9490cec..e182088b4ac8 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > @@ -1908,6 +1908,8 @@ static ssize_t aldebaran_get_ecc_info(struct
> > smu_context *smu,
> >  				ecc_table_v2->EccInfo[i].mca_umc_addr;
> >  			ecc_info_per_channel->mca_ceumc_addr =
> >  				ecc_table_v2->EccInfo[i].mca_ceumc_addr;
> > +			if (!smu->adev->record_ce_addr_supported)
> 
> [Tao] it seems the check is unnecessary.
> 
> > +				smu->adev->record_ce_addr_supported =1;
> >  		}
> >  	}
> >
> > --
> > 2.17.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* 答复: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
  2022-05-23 10:16   ` 回复: " Yang, Stanley
@ 2022-05-23 10:37     ` Yang, Stanley
  0 siblings, 0 replies; 9+ messages in thread
From: Yang, Stanley @ 2022-05-23 10:37 UTC (permalink / raw)
  To: Wang, Yang(Kevin),
	amd-gfx, Zhang, Hawking, Zhou1, Tao, Quan, Evan, Lazar, Lijo

[-- Attachment #1: Type: text/plain, Size: 13211 bytes --]

[AMD Official Use Only - General]


[AMD Official Use Only - General]
Hi Kevin,

Please ignore above mail, thanks your suggestion, I will try it.

Regards,
Stanley
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Yang, Stanley <Stanley.Yang@amd.com>
日期: 星期一, 2022年5月23日 下午6:16
收件人: Wang, Yang(Kevin) <KevinYang.Wang@amd.com>, amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>, Zhang, Hawking <Hawking.Zhang@amd.com>, Zhou1, Tao <Tao.Zhou1@amd.com>, Quan, Evan <Evan.Quan@amd.com>, Lazar, Lijo <Lijo.Lazar@amd.com>
主题: 回复: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable

[AMD Official Use Only - General]


[AMD Official Use Only - General]

Hi Kevin,

发件人: Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
发送时间: Monday, May 23, 2022 4:49 PM
收件人: Yang, Stanley <Stanley.Yang@amd.com>; amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>
主题: Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable


[AMD Official Use Only - General]


________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org<mailto:amd-gfx-bounces@lists.freedesktop.org>> on behalf of Stanley.Yang <Stanley.Yang@amd.com<mailto:Stanley.Yang@amd.com>>
Sent: Monday, May 23, 2022 4:17 PM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>; Zhang, Hawking <Hawking.Zhang@amd.com<mailto:Hawking.Zhang@amd.com>>; Zhou1, Tao <Tao.Zhou1@amd.com<mailto:Tao.Zhou1@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; Lazar, Lijo <Lijo.Lazar@amd.com<mailto:Lijo.Lazar@amd.com>>
Cc: Yang, Stanley <Stanley.Yang@amd.com<mailto:Stanley.Yang@amd.com>>
Subject: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable

SMU add a new variable mca_ceumc_addr to record
umc correctable error address in EccInfo table,
driver side add ecctable_v2 to support this feature

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com<mailto:Stanley.Yang@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
 .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
 5 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b9a6fac2b8b2..28e603243b67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -328,6 +328,7 @@ struct ecc_info_per_ch {
         uint16_t ce_count_hi_chip;
         uint64_t mca_umc_status;
         uint64_t mca_umc_addr;
+       uint64_t mca_ceumc_addr;
 };

 struct umc_ecc_info {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a6a7b6c33683..9f7257ada437 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -322,6 +322,7 @@ enum smu_table_id
         SMU_TABLE_PACE,
         SMU_TABLE_ECCINFO,
         SMU_TABLE_COMBO_PPTABLE,
+       SMU_TABLE_ECCINFO_V2,
         SMU_TABLE_COUNT,
 };

@@ -340,6 +341,7 @@ struct smu_table_context
         void                            *driver_pptable;
         void                            *combo_pptable;
         void                            *ecc_table;
+       void                            *ecc_table_v2;  // adapt to smu support record mca_ceumc_addr
         void                            *driver_smu_config_table;
         struct smu_table                tables[SMU_TABLE_COUNT];
         /*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
index 0f67c56c2863..2868604eff49 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
@@ -522,6 +522,21 @@ typedef struct {
         EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
 } EccInfoTable_t;

+typedef struct {
+       uint64_t mca_umc_status;
+       uint64_t mca_umc_addr;
+       uint64_t mca_ceumc_addr;
+
+       uint16_t ce_count_lo_chip;
+       uint16_t ce_count_hi_chip;
+
+       uint32_t eccPadding;
+} EccInfo_t_v2;
+
+typedef struct {
+       EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+} EccInfoTable_t_v2;
+
 // These defines are used with the following messages:
 // SMC_MSG_TransferTableDram2Smu
 // SMC_MSG_TransferTableSmu2Dram
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 38af648cb857..e58df9490cec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -82,6 +82,12 @@
  */
 #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00

+/*
+ * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
+ * use this to check mca_ceumc_addr record whether support
+ */
+#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
+
 /*
  * SMU support BAD CHENNEL info MSG since version 68.51.00,
  * use this to check ECCTALE feature whether support
@@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
         SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);

+       SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2),
+                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
[kevin]:
this table mapping is not needed, the reason as below.

         smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
         if (!smu_table->metrics_table)
                 return -ENOMEM;
@@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
         if (!smu_table->ecc_table)
                 return -ENOMEM;

+       smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
+       if (!smu_table->ecc_table_v2)
+               return -ENOMEM;;
+

[kevin]:

add eccinfo table v2 is not needed for this case, this table is only used store table data from pmfw to driver,
you can create a large enough table which can save ecc table data directly.

e.g:
size = max(sizeof(EccInfoTable_t_v2), sizeof(EccInfoTable_t));
smu_table->ecc_table = kzalloc(size, GFP_KERNEL);

Best Regards,
Kevin
[Yang, Stanley] :  this method is not forward compatible, or driver need complex convert to get the correct value, if new driver use an old pmfw.

         return 0;
 }

@@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
         return sizeof(struct gpu_metrics_v1_3);
 }

-static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+static int aldebaran_check_ecc_table_support(struct smu_context *smu,
+               int *ecctable_version)
 {
         uint32_t if_version = 0xff, smu_version = 0xff;
         int ret = 0;
@@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)

         if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
                 ret = -EOPNOTSUPP;
+       else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
+                       smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
+               *ecctable_version = 1;
+       else
+               *ecctable_version = 2;

         return ret;
 }
@@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 {
         struct smu_table_context *smu_table = &smu->smu_table;
         EccInfoTable_t *ecc_table = NULL;
+       EccInfoTable_t_v2 *ecc_table_v2 = NULL;
         struct ecc_info_per_ch *ecc_info_per_channel = NULL;
         int i, ret = 0;
+       int table_version = 0;
         struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;

-       ret = aldebaran_check_ecc_table_support(smu);
+       ret = aldebaran_check_ecc_table_support(smu, &table_version);
         if (ret)
                 return ret;

-       ret = smu_cmn_update_table(smu,
-                              SMU_TABLE_ECCINFO,
-                              0,
-                              smu_table->ecc_table,
-                              false);
-       if (ret) {
-               dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
-               return ret;
-       }
+       if (table_version == 1) {
+               ret = smu_cmn_update_table(smu,
+                                      SMU_TABLE_ECCINFO,
+                                      0,
+                                      smu_table->ecc_table,
+                                      false);
+               if (ret) {
+                       dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
+                       return ret;
+               }
+
+               ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+               for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+                       ecc_info_per_channel = &(eccinfo->ecc[i]);
+                       ecc_info_per_channel->ce_count_lo_chip =
+                               ecc_table->EccInfo[i].ce_count_lo_chip;
+                       ecc_info_per_channel->ce_count_hi_chip =
+                               ecc_table->EccInfo[i].ce_count_hi_chip;
+                       ecc_info_per_channel->mca_umc_status =
+                               ecc_table->EccInfo[i].mca_umc_status;
+                       ecc_info_per_channel->mca_umc_addr =
+                               ecc_table->EccInfo[i].mca_umc_addr;
+               }
+       } else if (table_version == 2) {
+               /* still use SMU_TABLE_ECC_INFO index,
+                * smu 68.55.0 add mca_ceumc_addr variable
+                * in EccInfo_t struct to report correctable
+                * error address and the table_id is not changed
+                */
+               ret = smu_cmn_update_table(smu,
+                                      SMU_TABLE_ECCINFO,
+                                      0,
+                                      smu_table->ecc_table_v2,
+                                          false);

-       ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
-
-       for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
-               ecc_info_per_channel = &(eccinfo->ecc[i]);
-               ecc_info_per_channel->ce_count_lo_chip =
-                       ecc_table->EccInfo[i].ce_count_lo_chip;
-               ecc_info_per_channel->ce_count_hi_chip =
-                       ecc_table->EccInfo[i].ce_count_hi_chip;
-               ecc_info_per_channel->mca_umc_status =
-                       ecc_table->EccInfo[i].mca_umc_status;
-               ecc_info_per_channel->mca_umc_addr =
-                       ecc_table->EccInfo[i].mca_umc_addr;
+               if (ret) {
+                       dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n");
+                       return ret;
+               }
+
+               ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
+
+               for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+                       ecc_info_per_channel = &(eccinfo->ecc[i]);
+                       ecc_info_per_channel->ce_count_lo_chip =
+                               ecc_table_v2->EccInfo[i].ce_count_lo_chip;
+                       ecc_info_per_channel->ce_count_hi_chip =
+                               ecc_table_v2->EccInfo[i].ce_count_hi_chip;
+                       ecc_info_per_channel->mca_umc_status =
+                               ecc_table_v2->EccInfo[i].mca_umc_status;
+                       ecc_info_per_channel->mca_umc_addr =
+                               ecc_table_v2->EccInfo[i].mca_umc_addr;
+                       ecc_info_per_channel->mca_ceumc_addr =
+                               ecc_table_v2->EccInfo[i].mca_ceumc_addr;
+               }
         }

         return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index ae6321af9d88..af2d84a16f3e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
         kfree(smu_table->hardcode_pptable);
         smu_table->hardcode_pptable = NULL;

+       kfree(smu_table->ecc_table_v2);
         kfree(smu_table->ecc_table);
         kfree(smu_table->metrics_table);
         kfree(smu_table->watermarks_table);
+       smu_table->ecc_table_v2 = NULL;
         smu_table->ecc_table = NULL;
         smu_table->metrics_table = NULL;
         smu_table->watermarks_table = NULL;
--
2.17.1

[-- Attachment #2: Type: text/html, Size: 35258 bytes --]

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-05-23 10:37 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-23  8:17 [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Stanley.Yang
2022-05-23  8:17 ` [PATCH Review 2/2] drm/amdgpu: print umc correctable error address Stanley.Yang
2022-05-23 10:22   ` Zhou1, Tao
2022-05-23 10:26     ` 回复: " Yang, Stanley
2022-05-23  8:48 ` [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable Wang, Yang(Kevin)
2022-05-23 10:16   ` 回复: " Yang, Stanley
2022-05-23 10:37     ` 答复: " Yang, Stanley
2022-05-23  9:12 ` Lazar, Lijo
2022-05-23 10:11   ` 回复: " Yang, Stanley

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.