All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] add SDMA ras error reporting support
@ 2020-01-08 16:17 Hawking Zhang
  2020-01-08 16:17 ` [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4 Hawking Zhang
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Hawking Zhang @ 2020-01-08 16:17 UTC (permalink / raw)
  To: amd-gfx, Alex Deucher, Dennis Li, John Clements, Guchun Chen,
	Tao Zhou, Candice Li, Gang Long
  Cc: Hawking Zhang

Currently, sdma edc counters are grouped in the gfx edc counter
register array (sec_ded_counter_registers), which results
in several issues, including:
1). sdma ras errors are counted into the gfx ip block when querying
the gfx error counter (i.e. through the sysfs gfx_error_count node).
2). kernel crash (NULL pointer access) when querying the gfx error
counter on vega20. vega20 has only 2 sdma instances, while the
gfx edc counter register array is unified for the arcturus and vega20
cases, so the driver is forced to read the sdma2 ~ 7 edc counter
registers even though their ip base addresses are not initialized.
3). unnecessary/wrong grbm switch even when reading the sdma edc counters.

To fix the above issues, this series separates the sdma ras query
functions from the gfx ones: it checks the sdma edc counters and reports
back the error count and the error type as well.

Hawking Zhang (4):
  drm/amdgpu: add query_ras_error_count function for sdma v4
  drm/amdgpu: support error reporting for sdma ip block
  drm/amdgpu: add ras_late_init and ras_fini for sdma v4
  drm/amdgpu: read sdma edc counter to clear the counters

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |   7 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |   9 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  11 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 176 ++++++++++++++++++++++-
 4 files changed, 191 insertions(+), 12 deletions(-)

-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4
  2020-01-08 16:17 [PATCH 0/4] add SDMA ras error reporting support Hawking Zhang
@ 2020-01-08 16:17 ` Hawking Zhang
  2020-01-08 16:26   ` Alex Deucher
  2020-01-09  0:59   ` Chen, Guchun
  2020-01-08 16:17 ` [PATCH 2/4] drm/amdgpu: support error reporting for sdma ip block Hawking Zhang
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 12+ messages in thread
From: Hawking Zhang @ 2020-01-08 16:17 UTC (permalink / raw)
  To: amd-gfx, Alex Deucher, Dennis Li, John Clements, Guchun Chen,
	Tao Zhou, Candice Li, Gang Long
  Cc: Hawking Zhang

The query_ras_error_count function will be invoked to query the
single-bit error count detected in the sdma ip block.

Change-Id: I1b17df7c66e71739ae4c31900bd96c5359af2240
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |   6 +
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 163 +++++++++++++++++++++++
 2 files changed, 169 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 957791673fcd..9e87a97f81fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -50,6 +50,11 @@ struct amdgpu_sdma_instance {
 	bool			burst_nop;
 };
 
+struct amdgpu_sdma_ras_funcs {
+	int (*query_ras_error_count)(struct amdgpu_device *adev,
+			uint32_t instance, void *ras_error_status);
+};
+
 struct amdgpu_sdma {
 	struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
 	struct drm_gpu_scheduler    *sdma_sched[AMDGPU_MAX_SDMA_INSTANCES];
@@ -61,6 +66,7 @@ struct amdgpu_sdma {
 	uint32_t                    srbm_soft_reset;
 	bool			has_page_queue;
 	struct ras_common_if	*ras_if;
+	const struct amdgpu_sdma_ras_funcs	*funcs;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 4074314695c3..a00b85934b04 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -82,6 +82,7 @@ static void sdma_v4_0_set_ring_funcs(struct amdgpu_device *adev);
 static void sdma_v4_0_set_buffer_funcs(struct amdgpu_device *adev);
 static void sdma_v4_0_set_vm_pte_funcs(struct amdgpu_device *adev);
 static void sdma_v4_0_set_irq_funcs(struct amdgpu_device *adev);
+static void sdma_v4_0_set_ras_funcs(struct amdgpu_device *adev);
 
 static const struct soc15_reg_golden golden_settings_sdma_4[] = {
 	SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_CHICKEN_BITS, 0xfe931f07, 0x02831d07),
@@ -257,6 +258,105 @@ static const struct soc15_reg_golden golden_settings_sdma_4_3[] = {
 	SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_UTCL1_WATERMK, 0xfc000000, 0x00000000)
 };
 
+static const struct soc15_ras_field_entry sdma_v4_0_ras_fields[] = {
+	{ "SDMA_UCODE_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UCODE_BUF_SED),
+	0, 0,
+	},
+	{ "SDMA_RB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_RB_CMD_BUF_SED),
+	0, 0,
+	},
+	{ "SDMA_IB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_IB_CMD_BUF_SED),
+	0, 0,
+	},
+	{ "SDMA_UTCL1_RD_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+        SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UTCL1_RD_FIFO_SED),
+	0, 0,
+        },
+	{ "SDMA_UTCL1_RDBST_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UTCL1_RDBST_FIFO_SED),
+	0, 0,
+	},
+	{ "SDMA_DATA_LUT_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_DATA_LUT_FIFO_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF0_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF0_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF1_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF1_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF2_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF2_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF3_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF3_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF4_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF4_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF5_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF5_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF6_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF6_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF7_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF7_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF8_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF8_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF9_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF9_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF10_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF10_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF11_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF11_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF12_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF12_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF13_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF13_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF14_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF14_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF15_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF15_SED),
+	0, 0,
+	},
+	{ "SDMA_SPLIT_DAT_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_SPLIT_DAT_BUF_SED),
+	0, 0,
+	},
+	{ "SDMA_MC_WR_ADDR_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MC_WR_ADDR_FIFO_SED),
+	0, 0,
+	},
+};
+
 static u32 sdma_v4_0_get_reg_offset(struct amdgpu_device *adev,
 		u32 instance, u32 offset)
 {
@@ -1687,6 +1787,7 @@ static int sdma_v4_0_early_init(void *handle)
 	sdma_v4_0_set_buffer_funcs(adev);
 	sdma_v4_0_set_vm_pte_funcs(adev);
 	sdma_v4_0_set_irq_funcs(adev);
+	sdma_v4_0_set_ras_funcs(adev);
 
 	return 0;
 }
@@ -2417,6 +2518,68 @@ static void sdma_v4_0_set_vm_pte_funcs(struct amdgpu_device *adev)
 	adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
 }
 
+static void sdma_v4_0_get_ras_error_count(uint32_t value,
+					uint32_t instance,
+					uint32_t *sec_count)
+{
+	uint32_t i;
+	uint32_t sec_cnt;
+
+	/* double bits error (multiple bits) error detection is not supported */
+	for (i = 0; i < ARRAY_SIZE(sdma_v4_0_ras_fields); i++) {
+		/* the SDMA_EDC_COUNTER register in each sdma instance
+		 * shares the same sed shift_mask
+		 * */
+		sec_cnt = (value &
+			sdma_v4_0_ras_fields[i].sec_count_mask) >>
+			sdma_v4_0_ras_fields[i].sec_count_shift;
+		if (sec_cnt) {
+			DRM_INFO("Detected %s in SDMA%d, SED %d\n",
+				sdma_v4_0_ras_fields[i].name,
+				instance, sec_cnt);
+			*sec_count += sec_cnt;
+		}
+	}
+}
+
+static int sdma_v4_0_query_ras_error_count(struct amdgpu_device *adev,
+			uint32_t instance, void *ras_error_status)
+{
+	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+	uint32_t sec_count = 0;
+	uint32_t reg_value = 0;
+
+	reg_value = RREG32_SDMA(instance, mmSDMA0_EDC_COUNTER);
+	/* double bit error is not supported */
+	if (reg_value)
+		sdma_v4_0_get_ras_error_count(reg_value,
+				instance, &sec_count);
+	/* err_data->ce_count should be initialized to 0
+	 * before calling into this function */
+	err_data->ce_count += sec_count;
+	/* double bit error is not supported
+	 * set ue count to 0 */
+	err_data->ue_count = 0;
+
+	return 0;
+};
+
+static const struct amdgpu_sdma_ras_funcs sdma_v4_0_ras_funcs = {
+	.query_ras_error_count = sdma_v4_0_query_ras_error_count,
+};
+
+static void sdma_v4_0_set_ras_funcs(struct amdgpu_device *adev)
+{
+	switch (adev->asic_type) {
+	case CHIP_VEGA20:
+	case CHIP_ARCTURUS:
+		adev->sdma.funcs = &sdma_v4_0_ras_funcs;
+		break;
+	default:
+		break;
+	}
+}
+
 const struct amdgpu_ip_block_version sdma_v4_0_ip_block = {
 	.type = AMD_IP_BLOCK_TYPE_SDMA,
 	.major = 4,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/4] drm/amdgpu: support error reporting for sdma ip block
  2020-01-08 16:17 [PATCH 0/4] add SDMA ras error reporting support Hawking Zhang
  2020-01-08 16:17 ` [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4 Hawking Zhang
@ 2020-01-08 16:17 ` Hawking Zhang
  2020-01-08 16:25   ` Alex Deucher
  2020-01-08 16:17 ` [PATCH 3/4] drm/amdgpu: add ras_late_init and ras_fini for sdma v4 Hawking Zhang
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Hawking Zhang @ 2020-01-08 16:17 UTC (permalink / raw)
  To: amd-gfx, Alex Deucher, Dennis Li, John Clements, Guchun Chen,
	Tao Zhou, Candice Li, Gang Long
  Cc: Hawking Zhang

invoke sdma query_ras_error_count to get sdma single
bit error count

Change-Id: Iaaa86bb79dc28fe714937ca832da8a1cb5541930
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c7cee9716bdb..ac9926b3f9fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -686,6 +686,7 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
 {
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int i;
 
 	if (!obj)
 		return -EINVAL;
@@ -700,6 +701,12 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
 		if (adev->umc.funcs->query_ras_error_address)
 			adev->umc.funcs->query_ras_error_address(adev, &err_data);
 		break;
+	case AMDGPU_RAS_BLOCK__SDMA:
+		if (adev->sdma.funcs->query_ras_error_count) {
+			for (i = 0; i < adev->sdma.num_instances; i++)
+			adev->sdma.funcs->query_ras_error_count(adev, i, &err_data);
+		}
+		break;
 	case AMDGPU_RAS_BLOCK__GFX:
 		if (adev->gfx.funcs->query_ras_error_count)
 			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/4] drm/amdgpu: add ras_late_init and ras_fini for sdma v4
  2020-01-08 16:17 [PATCH 0/4] add SDMA ras error reporting support Hawking Zhang
  2020-01-08 16:17 ` [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4 Hawking Zhang
  2020-01-08 16:17 ` [PATCH 2/4] drm/amdgpu: support error reporting for sdma ip block Hawking Zhang
@ 2020-01-08 16:17 ` Hawking Zhang
  2020-01-08 16:26   ` Alex Deucher
  2020-01-08 16:17 ` [PATCH 4/4] drm/amdgpu: read sdma edc counter to clear the counters Hawking Zhang
  2020-01-09  1:03 ` [PATCH 0/4] add SDMA ras error reporting support Chen, Guchun
  4 siblings, 1 reply; 12+ messages in thread
From: Hawking Zhang @ 2020-01-08 16:17 UTC (permalink / raw)
  To: amd-gfx, Alex Deucher, Dennis Li, John Clements, Guchun Chen,
	Tao Zhou, Candice Li, Gang Long
  Cc: Hawking Zhang

move ras_late_init and ras_fini to sdma_ras_funcs table

Change-Id: If3a6c0defde4d23f81d2ff7ff79daa98a732efde
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 3 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 9e87a97f81fb..ee0ca996da0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -51,6 +51,9 @@ struct amdgpu_sdma_instance {
 };
 
 struct amdgpu_sdma_ras_funcs {
+	int (*ras_late_init)(struct amdgpu_device *adev,
+			void *ras_ih_info);
+	void (*ras_fini)(struct amdgpu_device *adev);
 	int (*query_ras_error_count)(struct amdgpu_device *adev,
 			uint32_t instance, void *ras_error_status);
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index a00b85934b04..fd20594b6d6e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1803,7 +1803,7 @@ static int sdma_v4_0_late_init(void *handle)
 		.cb = sdma_v4_0_process_ras_data_cb,
 	};
 
-	return amdgpu_sdma_ras_late_init(adev, &ih_info);
+	return adev->sdma.funcs->ras_late_init(adev, &ih_info);
 }
 
 static int sdma_v4_0_sw_init(void *handle)
@@ -1875,7 +1875,7 @@ static int sdma_v4_0_sw_fini(void *handle)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int i;
 
-	amdgpu_sdma_ras_fini(adev);
+	adev->sdma.funcs->ras_fini(adev);
 
 	for (i = 0; i < adev->sdma.num_instances; i++) {
 		amdgpu_ring_fini(&adev->sdma.instance[i].ring);
@@ -2565,6 +2565,8 @@ static int sdma_v4_0_query_ras_error_count(struct amdgpu_device *adev,
 };
 
 static const struct amdgpu_sdma_ras_funcs sdma_v4_0_ras_funcs = {
+	.ras_late_init = amdgpu_sdma_ras_late_init,
+	.ras_fini = amdgpu_sdma_ras_fini,
 	.query_ras_error_count = sdma_v4_0_query_ras_error_count,
 };
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 4/4] drm/amdgpu: read sdma edc counter to clear the counters
  2020-01-08 16:17 [PATCH 0/4] add SDMA ras error reporting support Hawking Zhang
                   ` (2 preceding siblings ...)
  2020-01-08 16:17 ` [PATCH 3/4] drm/amdgpu: add ras_late_init and ras_fini for sdma v4 Hawking Zhang
@ 2020-01-08 16:17 ` Hawking Zhang
  2020-01-08 16:27   ` Alex Deucher
  2020-01-09  1:03 ` [PATCH 0/4] add SDMA ras error reporting support Chen, Guchun
  4 siblings, 1 reply; 12+ messages in thread
From: Hawking Zhang @ 2020-01-08 16:17 UTC (permalink / raw)
  To: amd-gfx, Alex Deucher, Dennis Li, John Clements, Guchun Chen,
	Tao Zhou, Candice Li, Gang Long
  Cc: Hawking Zhang

SDMA edc counter registers were added to the gfx edc counter
array. When querying the gfx error counters in that array, there
is no way to tell how many sdma instances a given asic has,
which results in a NULL pointer access when trying to
read the sdma register base address for instances beyond the
first 2 on Vega20.
In addition, this also results in wrong gfx error counters,
since the sdma edc counters are actually added to them.
Therefore, the sdma edc counter registers should be separated
from the gfx edc counter register array and only get initialized
when the driver tries to enable sdma ras.

Change-Id: I206917f9d7b81670a8fed84dc749085ce5a6f678
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 11 +----------
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  7 +++++++
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 33d1c57aaaf1..c9ade16bbcc3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4038,14 +4038,6 @@ static const struct soc15_reg_entry sec_ded_counter_registers[] = {
    { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
    { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
    { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
-   { SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER), 0, 1, 1},
-   { SOC15_REG_ENTRY(SDMA1, 0, mmSDMA1_EDC_COUNTER), 0, 1, 1},
-   { SOC15_REG_ENTRY(SDMA2, 0, mmSDMA2_EDC_COUNTER), 0, 1, 1},
-   { SOC15_REG_ENTRY(SDMA3, 0, mmSDMA3_EDC_COUNTER), 0, 1, 1},
-   { SOC15_REG_ENTRY(SDMA4, 0, mmSDMA4_EDC_COUNTER), 0, 1, 1},
-   { SOC15_REG_ENTRY(SDMA5, 0, mmSDMA5_EDC_COUNTER), 0, 1, 1},
-   { SOC15_REG_ENTRY(SDMA6, 0, mmSDMA6_EDC_COUNTER), 0, 1, 1},
-   { SOC15_REG_ENTRY(SDMA7, 0, mmSDMA7_EDC_COUNTER), 0, 1, 1},
 };
 
 static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
@@ -4109,7 +4101,6 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 						adev->gfx.config.max_sh_per_se;
 	int sgpr_work_group_size = 5;
 	int gpr_reg_size = compute_dim_x / 16 + 6;
-	int sec_ded_counter_reg_size = adev->sdma.num_instances + 34;
 
 	/* only support when RAS is enabled */
 	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
@@ -4249,7 +4240,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 
 	/* read back registers to clear the counters */
 	mutex_lock(&adev->grbm_idx_mutex);
-	for (i = 0; i < sec_ded_counter_reg_size; i++) {
+	for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++) {
 		for (j = 0; j < sec_ded_counter_registers[i].se_num; j++) {
 			for (k = 0; k < sec_ded_counter_registers[i].instance; k++) {
 				gfx_v9_0_select_se_sh(adev, j, 0x0, k);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index fd20594b6d6e..f4107f9b75f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1802,6 +1802,13 @@ static int sdma_v4_0_late_init(void *handle)
 	struct ras_ih_if ih_info = {
 		.cb = sdma_v4_0_process_ras_data_cb,
 	};
+	int i;
+
+	/* read back edc counter registers to clear the counters */
+	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+		for (i = 0; i < adev->sdma.num_instances; i++)
+			RREG32_SDMA(i, mmSDMA0_EDC_COUNTER);
+	}
 
 	return adev->sdma.funcs->ras_late_init(adev, &ih_info);
 }
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/4] drm/amdgpu: support error reporting for sdma ip block
  2020-01-08 16:17 ` [PATCH 2/4] drm/amdgpu: support error reporting for sdma ip block Hawking Zhang
@ 2020-01-08 16:25   ` Alex Deucher
  0 siblings, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2020-01-08 16:25 UTC (permalink / raw)
  To: Hawking Zhang
  Cc: Gang Long, Guchun Chen, Tao Zhou, amd-gfx list, Alex Deucher,
	Candice Li, John Clements, Dennis Li

On Wed, Jan 8, 2020 at 11:17 AM Hawking Zhang <Hawking.Zhang@amd.com> wrote:
>
> invoke sdma query_ras_error_count to get sdma single
> bit error count
>
> Change-Id: Iaaa86bb79dc28fe714937ca832da8a1cb5541930
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++++
>  1 file changed, 7 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index c7cee9716bdb..ac9926b3f9fe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -686,6 +686,7 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
>  {
>         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>         struct ras_err_data err_data = {0, 0, 0, NULL};
> +       int i;
>
>         if (!obj)
>                 return -EINVAL;
> @@ -700,6 +701,12 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
>                 if (adev->umc.funcs->query_ras_error_address)
>                         adev->umc.funcs->query_ras_error_address(adev, &err_data);
>                 break;
> +       case AMDGPU_RAS_BLOCK__SDMA:
> +               if (adev->sdma.funcs->query_ras_error_count) {
> +                       for (i = 0; i < adev->sdma.num_instances; i++)
> +                       adev->sdma.funcs->query_ras_error_count(adev, i, &err_data);

Looks like the indentation is off here.  With that fixed:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> +               }
> +               break;
>         case AMDGPU_RAS_BLOCK__GFX:
>                 if (adev->gfx.funcs->query_ras_error_count)
>                         adev->gfx.funcs->query_ras_error_count(adev, &err_data);
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4
  2020-01-08 16:17 ` [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4 Hawking Zhang
@ 2020-01-08 16:26   ` Alex Deucher
  2020-01-09  0:59   ` Chen, Guchun
  1 sibling, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2020-01-08 16:26 UTC (permalink / raw)
  To: Hawking Zhang
  Cc: Gang Long, Guchun Chen, Tao Zhou, amd-gfx list, Alex Deucher,
	Candice Li, John Clements, Dennis Li

On Wed, Jan 8, 2020 at 11:17 AM Hawking Zhang <Hawking.Zhang@amd.com> wrote:
>
> query_ras_error_count function will be invoked to query
> single bit error count detected in sdma ip block
>
> Change-Id: I1b17df7c66e71739ae4c31900bd96c5359af2240
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |   6 +
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 163 +++++++++++++++++++++++
>  2 files changed, 169 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 957791673fcd..9e87a97f81fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -50,6 +50,11 @@ struct amdgpu_sdma_instance {
>         bool                    burst_nop;
>  };
>
> +struct amdgpu_sdma_ras_funcs {
> +       int (*query_ras_error_count)(struct amdgpu_device *adev,
> +                       uint32_t instance, void *ras_error_status);
> +};
> +
>  struct amdgpu_sdma {
>         struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
>         struct drm_gpu_scheduler    *sdma_sched[AMDGPU_MAX_SDMA_INSTANCES];
> @@ -61,6 +66,7 @@ struct amdgpu_sdma {
>         uint32_t                    srbm_soft_reset;
>         bool                    has_page_queue;
>         struct ras_common_if    *ras_if;
> +       const struct amdgpu_sdma_ras_funcs      *funcs;
>  };
>
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 4074314695c3..a00b85934b04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -82,6 +82,7 @@ static void sdma_v4_0_set_ring_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_0_set_buffer_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_0_set_vm_pte_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_0_set_irq_funcs(struct amdgpu_device *adev);
> +static void sdma_v4_0_set_ras_funcs(struct amdgpu_device *adev);
>
>  static const struct soc15_reg_golden golden_settings_sdma_4[] = {
>         SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_CHICKEN_BITS, 0xfe931f07, 0x02831d07),
> @@ -257,6 +258,105 @@ static const struct soc15_reg_golden golden_settings_sdma_4_3[] = {
>         SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_UTCL1_WATERMK, 0xfc000000, 0x00000000)
>  };
>
> +static const struct soc15_ras_field_entry sdma_v4_0_ras_fields[] = {
> +       { "SDMA_UCODE_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UCODE_BUF_SED),
> +       0, 0,
> +       },
> +       { "SDMA_RB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_RB_CMD_BUF_SED),
> +       0, 0,
> +       },
> +       { "SDMA_IB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_IB_CMD_BUF_SED),
> +       0, 0,
> +       },
> +       { "SDMA_UTCL1_RD_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +        SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UTCL1_RD_FIFO_SED),
> +       0, 0,
> +        },
> +       { "SDMA_UTCL1_RDBST_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UTCL1_RDBST_FIFO_SED),
> +       0, 0,
> +       },
> +       { "SDMA_DATA_LUT_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_DATA_LUT_FIFO_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF0_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF0_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF1_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF1_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF2_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF2_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF3_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF3_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF4_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF4_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF5_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF5_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF6_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF6_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF7_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF7_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF8_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF8_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF9_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF9_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF10_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF10_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF11_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF11_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF12_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF12_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF13_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF13_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF14_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF14_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MBANK_DATA_BUF15_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF15_SED),
> +       0, 0,
> +       },
> +       { "SDMA_SPLIT_DAT_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_SPLIT_DAT_BUF_SED),
> +       0, 0,
> +       },
> +       { "SDMA_MC_WR_ADDR_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
> +       SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MC_WR_ADDR_FIFO_SED),
> +       0, 0,
> +       },
> +};
> +
>  static u32 sdma_v4_0_get_reg_offset(struct amdgpu_device *adev,
>                 u32 instance, u32 offset)
>  {
> @@ -1687,6 +1787,7 @@ static int sdma_v4_0_early_init(void *handle)
>         sdma_v4_0_set_buffer_funcs(adev);
>         sdma_v4_0_set_vm_pte_funcs(adev);
>         sdma_v4_0_set_irq_funcs(adev);
> +       sdma_v4_0_set_ras_funcs(adev);
>
>         return 0;
>  }
> @@ -2417,6 +2518,68 @@ static void sdma_v4_0_set_vm_pte_funcs(struct amdgpu_device *adev)
>         adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
>  }
>
> +static void sdma_v4_0_get_ras_error_count(uint32_t value,
> +                                       uint32_t instance,
> +                                       uint32_t *sec_count)
> +{
> +       uint32_t i;
> +       uint32_t sec_cnt;
> +
> +       /* double bits error (multiple bits) error detection is not supported */
> +       for (i = 0; i < ARRAY_SIZE(sdma_v4_0_ras_fields); i++) {
> +               /* the SDMA_EDC_COUNTER register in each sdma instance
> +                * shares the same sed shift_mask
> +                * */
> +               sec_cnt = (value &
> +                       sdma_v4_0_ras_fields[i].sec_count_mask) >>
> +                       sdma_v4_0_ras_fields[i].sec_count_shift;
> +               if (sec_cnt) {
> +                       DRM_INFO("Detected %s in SDMA%d, SED %d\n",
> +                               sdma_v4_0_ras_fields[i].name,
> +                               instance, sec_cnt);
> +                       *sec_count += sec_cnt;
> +               }
> +       }
> +}
> +
> +static int sdma_v4_0_query_ras_error_count(struct amdgpu_device *adev,
> +                       uint32_t instance, void *ras_error_status)
> +{
> +       struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> +       uint32_t sec_count = 0;
> +       uint32_t reg_value = 0;
> +
> +       reg_value = RREG32_SDMA(instance, mmSDMA0_EDC_COUNTER);
> +       /* double bit error is not supported */
> +       if (reg_value)
> +               sdma_v4_0_get_ras_error_count(reg_value,
> +                               instance, &sec_count);
> +       /* err_data->ce_count should be initialized to 0
> +        * before calling into this function */
> +       err_data->ce_count += sec_count;
> +       /* double bit error is not supported
> +        * set ue count to 0 */
> +       err_data->ue_count = 0;
> +
> +       return 0;
> +};
> +
> +static const struct amdgpu_sdma_ras_funcs sdma_v4_0_ras_funcs = {
> +       .query_ras_error_count = sdma_v4_0_query_ras_error_count,
> +};
> +
> +static void sdma_v4_0_set_ras_funcs(struct amdgpu_device *adev)
> +{
> +       switch (adev->asic_type) {
> +       case CHIP_VEGA20:
> +       case CHIP_ARCTURUS:
> +               adev->sdma.funcs = &sdma_v4_0_ras_funcs;
> +               break;
> +       default:
> +               break;
> +       }
> +}
> +
>  const struct amdgpu_ip_block_version sdma_v4_0_ip_block = {
>         .type = AMD_IP_BLOCK_TYPE_SDMA,
>         .major = 4,
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] drm/amdgpu: add ras_late_init and ras_fini for sdma v4
  2020-01-08 16:17 ` [PATCH 3/4] drm/amdgpu: add ras_late_init and ras_fini for sdma v4 Hawking Zhang
@ 2020-01-08 16:26   ` Alex Deucher
  0 siblings, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2020-01-08 16:26 UTC (permalink / raw)
  To: Hawking Zhang
  Cc: Gang Long, Guchun Chen, Tao Zhou, amd-gfx list, Alex Deucher,
	Candice Li, John Clements, Dennis Li

On Wed, Jan 8, 2020 at 11:18 AM Hawking Zhang <Hawking.Zhang@amd.com> wrote:
>
> move ras_late_init and ras_fini to sdma_ras_funcs table
>
> Change-Id: If3a6c0defde4d23f81d2ff7ff79daa98a732efde
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 3 +++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 6 ++++--
>  2 files changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 9e87a97f81fb..ee0ca996da0d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -51,6 +51,9 @@ struct amdgpu_sdma_instance {
>  };
>
>  struct amdgpu_sdma_ras_funcs {
> +       int (*ras_late_init)(struct amdgpu_device *adev,
> +                       void *ras_ih_info);
> +       void (*ras_fini)(struct amdgpu_device *adev);
>         int (*query_ras_error_count)(struct amdgpu_device *adev,
>                         uint32_t instance, void *ras_error_status);
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index a00b85934b04..fd20594b6d6e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -1803,7 +1803,7 @@ static int sdma_v4_0_late_init(void *handle)
>                 .cb = sdma_v4_0_process_ras_data_cb,
>         };
>
> -       return amdgpu_sdma_ras_late_init(adev, &ih_info);
> +       return adev->sdma.funcs->ras_late_init(adev, &ih_info);
>  }
>
>  static int sdma_v4_0_sw_init(void *handle)
> @@ -1875,7 +1875,7 @@ static int sdma_v4_0_sw_fini(void *handle)
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>         int i;
>
> -       amdgpu_sdma_ras_fini(adev);
> +       adev->sdma.funcs->ras_fini(adev);
>
>         for (i = 0; i < adev->sdma.num_instances; i++) {
>                 amdgpu_ring_fini(&adev->sdma.instance[i].ring);
> @@ -2565,6 +2565,8 @@ static int sdma_v4_0_query_ras_error_count(struct amdgpu_device *adev,
>  };
>
>  static const struct amdgpu_sdma_ras_funcs sdma_v4_0_ras_funcs = {
> +       .ras_late_init = amdgpu_sdma_ras_late_init,
> +       .ras_fini = amdgpu_sdma_ras_fini,
>         .query_ras_error_count = sdma_v4_0_query_ras_error_count,
>  };
>
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: read sdma edc counter to clear the counters
  2020-01-08 16:17 ` [PATCH 4/4] drm/amdgpu: read sdma edc counter to clear the counters Hawking Zhang
@ 2020-01-08 16:27   ` Alex Deucher
  0 siblings, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2020-01-08 16:27 UTC (permalink / raw)
  To: Hawking Zhang
  Cc: Gang Long, Guchun Chen, Tao Zhou, amd-gfx list, Alex Deucher,
	Candice Li, John Clements, Dennis Li

On Wed, Jan 8, 2020 at 11:18 AM Hawking Zhang <Hawking.Zhang@amd.com> wrote:
>
> SDMA edc counter registers were added to the gfx edc counter
> register array. When querying the gfx error counters in that
> array, there is no way to tell how many sdma instances a given
> asic has, which results in a NULL pointer access when trying to
> read the sdma register base address for instances that are not
> present on Vega20 (sdma2 and above).
> In addition, this also results in wrong gfx error counters,
> since the sdma edc counters are added into them.
> Therefore, the sdma edc counter registers should be separated
> from the gfx edc counter register array and only get initialized
> when the driver tries to enable sdma ras.
>
> Change-Id: I206917f9d7b81670a8fed84dc749085ce5a6f678
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 11 +----------
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  7 +++++++
>  2 files changed, 8 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 33d1c57aaaf1..c9ade16bbcc3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4038,14 +4038,6 @@ static const struct soc15_reg_entry sec_ded_counter_registers[] = {
>     { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
>     { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
>     { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
> -   { SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER), 0, 1, 1},
> -   { SOC15_REG_ENTRY(SDMA1, 0, mmSDMA1_EDC_COUNTER), 0, 1, 1},
> -   { SOC15_REG_ENTRY(SDMA2, 0, mmSDMA2_EDC_COUNTER), 0, 1, 1},
> -   { SOC15_REG_ENTRY(SDMA3, 0, mmSDMA3_EDC_COUNTER), 0, 1, 1},
> -   { SOC15_REG_ENTRY(SDMA4, 0, mmSDMA4_EDC_COUNTER), 0, 1, 1},
> -   { SOC15_REG_ENTRY(SDMA5, 0, mmSDMA5_EDC_COUNTER), 0, 1, 1},
> -   { SOC15_REG_ENTRY(SDMA6, 0, mmSDMA6_EDC_COUNTER), 0, 1, 1},
> -   { SOC15_REG_ENTRY(SDMA7, 0, mmSDMA7_EDC_COUNTER), 0, 1, 1},
>  };
>
>  static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
> @@ -4109,7 +4101,6 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>                                                 adev->gfx.config.max_sh_per_se;
>         int sgpr_work_group_size = 5;
>         int gpr_reg_size = compute_dim_x / 16 + 6;
> -       int sec_ded_counter_reg_size = adev->sdma.num_instances + 34;
>
>         /* only support when RAS is enabled */
>         if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> @@ -4249,7 +4240,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>         /* read back registers to clear the counters */
>         mutex_lock(&adev->grbm_idx_mutex);
> -       for (i = 0; i < sec_ded_counter_reg_size; i++) {
> +       for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++) {
>                 for (j = 0; j < sec_ded_counter_registers[i].se_num; j++) {
>                         for (k = 0; k < sec_ded_counter_registers[i].instance; k++) {
>                                 gfx_v9_0_select_se_sh(adev, j, 0x0, k);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index fd20594b6d6e..f4107f9b75f3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -1802,6 +1802,13 @@ static int sdma_v4_0_late_init(void *handle)
>         struct ras_ih_if ih_info = {
>                 .cb = sdma_v4_0_process_ras_data_cb,
>         };
> +       int i;
> +
> +       /* read back edc counter registers to clear the counters */
> +       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
> +               for (i = 0; i < adev->sdma.num_instances; i++)
> +                       RREG32_SDMA(i, mmSDMA0_EDC_COUNTER);
> +       }
>
>         return adev->sdma.funcs->ras_late_init(adev, &ih_info);
>  }
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4
  2020-01-08 16:17 ` [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4 Hawking Zhang
  2020-01-08 16:26   ` Alex Deucher
@ 2020-01-09  0:59   ` Chen, Guchun
  1 sibling, 0 replies; 12+ messages in thread
From: Chen, Guchun @ 2020-01-09  0:59 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx, Deucher, Alexander, Li, Dennis,
	Clements, John, Zhou1, Tao, Li, Candice, Long, Gang
  Cc: Zhang, Hawking

[AMD Public Use]



-----Original Message-----
From: Hawking Zhang <Hawking.Zhang@amd.com> 
Sent: Thursday, January 9, 2020 12:17 AM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Clements, John <John.Clements@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Long, Gang <Gang.Long@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4

query_ras_error_count function will be invoked to query single bit error count detected in sdma ip block

Change-Id: I1b17df7c66e71739ae4c31900bd96c5359af2240
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |   6 +
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 163 +++++++++++++++++++++++
 2 files changed, 169 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 957791673fcd..9e87a97f81fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -50,6 +50,11 @@ struct amdgpu_sdma_instance {
 	bool			burst_nop;
 };
 
+struct amdgpu_sdma_ras_funcs {
+	int (*query_ras_error_count)(struct amdgpu_device *adev,
+			uint32_t instance, void *ras_error_status); };
+
 struct amdgpu_sdma {
 	struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
 	struct drm_gpu_scheduler    *sdma_sched[AMDGPU_MAX_SDMA_INSTANCES];
@@ -61,6 +66,7 @@ struct amdgpu_sdma {
 	uint32_t                    srbm_soft_reset;
 	bool			has_page_queue;
 	struct ras_common_if	*ras_if;
+	const struct amdgpu_sdma_ras_funcs	*funcs;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 4074314695c3..a00b85934b04 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -82,6 +82,7 @@ static void sdma_v4_0_set_ring_funcs(struct amdgpu_device *adev);  static void sdma_v4_0_set_buffer_funcs(struct amdgpu_device *adev);  static void sdma_v4_0_set_vm_pte_funcs(struct amdgpu_device *adev);  static void sdma_v4_0_set_irq_funcs(struct amdgpu_device *adev);
+static void sdma_v4_0_set_ras_funcs(struct amdgpu_device *adev);
 
 static const struct soc15_reg_golden golden_settings_sdma_4[] = {
 	SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_CHICKEN_BITS, 0xfe931f07, 0x02831d07), @@ -257,6 +258,105 @@ static const struct soc15_reg_golden golden_settings_sdma_4_3[] = {
 	SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_UTCL1_WATERMK, 0xfc000000, 0x00000000)  };
 
+static const struct soc15_ras_field_entry sdma_v4_0_ras_fields[] = {
+	{ "SDMA_UCODE_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UCODE_BUF_SED),
+	0, 0,
+	},
+	{ "SDMA_RB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_RB_CMD_BUF_SED),
+	0, 0,
+	},
+	{ "SDMA_IB_CMD_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_IB_CMD_BUF_SED),
+	0, 0,
+	},
+	{ "SDMA_UTCL1_RD_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+        SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UTCL1_RD_FIFO_SED),
+	0, 0,
+        },
[Guchun] The indentation does not look right here (this entry is indented with spaces instead of tabs).

+	{ "SDMA_UTCL1_RDBST_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_UTCL1_RDBST_FIFO_SED),
+	0, 0,
+	},
+	{ "SDMA_DATA_LUT_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_DATA_LUT_FIFO_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF0_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF0_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF1_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF1_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF2_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF2_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF3_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF3_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF4_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF4_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF5_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF5_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF6_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF6_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF7_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF7_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF8_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF8_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF9_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF9_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF10_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF10_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF11_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF11_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF12_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF12_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF13_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF13_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF14_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF14_SED),
+	0, 0,
+	},
+	{ "SDMA_MBANK_DATA_BUF15_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF15_SED),
+	0, 0,
+	},
+	{ "SDMA_SPLIT_DAT_BUF_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_SPLIT_DAT_BUF_SED),
+	0, 0,
+	},
+	{ "SDMA_MC_WR_ADDR_FIFO_SED", SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER),
+	SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MC_WR_ADDR_FIFO_SED),
+	0, 0,
+	},
+};
+
 static u32 sdma_v4_0_get_reg_offset(struct amdgpu_device *adev,
 		u32 instance, u32 offset)
 {
@@ -1687,6 +1787,7 @@ static int sdma_v4_0_early_init(void *handle)
 	sdma_v4_0_set_buffer_funcs(adev);
 	sdma_v4_0_set_vm_pte_funcs(adev);
 	sdma_v4_0_set_irq_funcs(adev);
+	sdma_v4_0_set_ras_funcs(adev);
 
 	return 0;
 }
@@ -2417,6 +2518,68 @@ static void sdma_v4_0_set_vm_pte_funcs(struct amdgpu_device *adev)
 	adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;  }
 
+static void sdma_v4_0_get_ras_error_count(uint32_t value,
+					uint32_t instance,
+					uint32_t *sec_count)
+{
+	uint32_t i;
+	uint32_t sec_cnt;
+
+	/* double bits error (multiple bits) error detection is not supported */
+	for (i = 0; i < ARRAY_SIZE(sdma_v4_0_ras_fields); i++) {
+		/* the SDMA_EDC_COUNTER register in each sdma instance
+		 * shares the same sed shift_mask
+		 * */
+		sec_cnt = (value &
+			sdma_v4_0_ras_fields[i].sec_count_mask) >>
+			sdma_v4_0_ras_fields[i].sec_count_shift;
+		if (sec_cnt) {
+			DRM_INFO("Detected %s in SDMA%d, SED %d\n",
+				sdma_v4_0_ras_fields[i].name,
+				instance, sec_cnt);
+			*sec_count += sec_cnt;
+		}
+	}
+}
+
+static int sdma_v4_0_query_ras_error_count(struct amdgpu_device *adev,
+			uint32_t instance, void *ras_error_status) {
+	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+	uint32_t sec_count = 0;
+	uint32_t reg_value = 0;
+
+	reg_value = RREG32_SDMA(instance, mmSDMA0_EDC_COUNTER);
[Guchun] One question: after reading SDMA_EDC_COUNTER, do we need to explicitly clear this counter register to avoid double counting when it is queried multiple times?
Or is the register cleared automatically once it has been read?

+	/* double bit error is not supported */
+	if (reg_value)
+		sdma_v4_0_get_ras_error_count(reg_value,
+				instance, &sec_count);
+	/* err_data->ce_count should be initialized to 0
+	 * before calling into this function */
+	err_data->ce_count += sec_count;
+	/* double bit error is not supported
+	 * set ue count to 0 */
+	err_data->ue_count = 0;
+
+	return 0;
+};
+
+static const struct amdgpu_sdma_ras_funcs sdma_v4_0_ras_funcs = {
+	.query_ras_error_count = sdma_v4_0_query_ras_error_count, };
+
+static void sdma_v4_0_set_ras_funcs(struct amdgpu_device *adev) {
+	switch (adev->asic_type) {
+	case CHIP_VEGA20:
+	case CHIP_ARCTURUS:
+		adev->sdma.funcs = &sdma_v4_0_ras_funcs;
+		break;
+	default:
+		break;
+	}
+}
+
 const struct amdgpu_ip_block_version sdma_v4_0_ip_block = {
 	.type = AMD_IP_BLOCK_TYPE_SDMA,
 	.major = 4,
--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* RE: [PATCH 0/4] add SDMA ras error reporting support
  2020-01-08 16:17 [PATCH 0/4] add SDMA ras error reporting support Hawking Zhang
                   ` (3 preceding siblings ...)
  2020-01-08 16:17 ` [PATCH 4/4] drm/amdgpu: read sdma edc counter to clear the counters Hawking Zhang
@ 2020-01-09  1:03 ` Chen, Guchun
  2020-01-09  3:20   ` Zhang, Hawking
  4 siblings, 1 reply; 12+ messages in thread
From: Chen, Guchun @ 2020-01-09  1:03 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx, Deucher, Alexander, Li, Dennis,
	Clements, John, Zhou1, Tao, Li, Candice, Long, Gang
  Cc: Zhang, Hawking

[AMD Public Use]

Two comments in patch 1.

And one more question about the series: we add an SDMA block case to the ras query path, but there is no such case in ras error injection.
Then how do we get to know what triggers the SDMA ECC counter? Is it still done through GFX injection?

With the above concerns fixed/clarified, the series is:
Reviewed-by: Guchun Chen <guchun.chen@amd.com>

-----Original Message-----
From: Hawking Zhang <Hawking.Zhang@amd.com> 
Sent: Thursday, January 9, 2020 12:17 AM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Clements, John <John.Clements@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Long, Gang <Gang.Long@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: [PATCH 0/4] add SDMA ras error reporting support

Currently, sdma edc counters are grouped in the gfx edc counter registers array (sec_ded_counter_registers), which results in several issues, including:
1). sdma ras errors are counted into the gfx ip block when querying the gfx error counter (i.e. through the sysfs gfx_error_count node).
2). kernel crash (NULL pointer access) when querying the gfx error counter on vega20. There are only 2 sdma instances on vega20, while the gfx edc counter register array unifies the arcturus and vega20 cases,
so the driver is forced to read the sdma2 ~ 7 edc counter registers even though their ip base addresses are not initialized.
3). unnecessary/wrong grbm switch even when reading the sdma edc counters.

To fix the above issues, the series separates the sdma ras query functions from the gfx ones, checks the SDMA_EDC_COUNTER registers, and reports back the error count as well as the error type.

Hawking Zhang (4):
  drm/amdgpu: add query_ras_error_count function for sdma v4
  drm/amdgpu: support error reporting for sdma ip block
  drm/amdgpu: add ras_late_init and ras_fini for sdma v4
  drm/amdgpu: read sdma edc counter to clear the counters

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |   7 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |   9 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  11 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 176 ++++++++++++++++++++++-
 4 files changed, 191 insertions(+), 12 deletions(-)

--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 0/4] add SDMA ras error reporting support
  2020-01-09  1:03 ` [PATCH 0/4] add SDMA ras error reporting support Chen, Guchun
@ 2020-01-09  3:20   ` Zhang, Hawking
  0 siblings, 0 replies; 12+ messages in thread
From: Zhang, Hawking @ 2020-01-09  3:20 UTC (permalink / raw)
  To: Chen, Guchun, amd-gfx, Deucher, Alexander, Li, Dennis, Clements,
	John, Zhou1, Tao, Li, Candice, Long, Gang

[AMD Public Use]

To address your concerns

1). The SDMA_EDC_COUNTER registers are cleared by HW after they are read. They are read-only registers, so neither explicitly clearing them nor programming the EDC_COUNTER_CLEAR register is necessary.
2). Error injection and error reporting are actually separate features. That is to say, users may not be allowed to do error injection to generate errors, but once the hw edc feature is enabled, the driver should be able to collect and report error information.

Regards,
Hawking

-----Original Message-----
From: Chen, Guchun <Guchun.Chen@amd.com> 
Sent: Thursday, January 9, 2020 09:04
To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Clements, John <John.Clements@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Long, Gang <Gang.Long@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH 0/4] add SDMA ras error reporting support

[AMD Public Use]

Two comments in patch 1.

And one more question about the series: we add an SDMA block case to the ras query path, but there is no such case in ras error injection.
Then how do we get to know what triggers the SDMA ECC counter? Is it still done through GFX injection?

With above concerns fixed/clarified, series is:
Reviewed-by: Guchun Chen <guchun.chen@amd.com>

-----Original Message-----
From: Hawking Zhang <Hawking.Zhang@amd.com> 
Sent: Thursday, January 9, 2020 12:17 AM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Clements, John <John.Clements@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Long, Gang <Gang.Long@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: [PATCH 0/4] add SDMA ras error reporting support

Currently, sdma edc counters are grouped in the gfx edc counter registers array (sec_ded_counter_registers), which results in several issues, including:
1). sdma ras errors are counted into the gfx ip block when querying the gfx error counter (i.e. through the sysfs gfx_error_count node).
2). kernel crash (NULL pointer access) when querying the gfx error counter on vega20. There are only 2 sdma instances on vega20, while the gfx edc counter register array unifies the arcturus and vega20 cases,
so the driver is forced to read the sdma2 ~ 7 edc counter registers even though their ip base addresses are not initialized.
3). unnecessary/wrong grbm switch even when reading the sdma edc counters.

To fix the above issues, the series separates the sdma ras query functions from the gfx ones, checks the SDMA_EDC_COUNTER registers, and reports back the error count as well as the error type.

Hawking Zhang (4):
  drm/amdgpu: add query_ras_error_count function for sdma v4
  drm/amdgpu: support error reporting for sdma ip block
  drm/amdgpu: add ras_late_init and ras_fini for sdma v4
  drm/amdgpu: read sdma edc counter to clear the counters

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |   7 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |   9 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  11 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 176 ++++++++++++++++++++++-
 4 files changed, 191 insertions(+), 12 deletions(-)

--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2020-01-09  3:20 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-08 16:17 [PATCH 0/4] add SDMA ras error reporting support Hawking Zhang
2020-01-08 16:17 ` [PATCH 1/4] drm/amdgpu: add query_ras_error_count function for sdma v4 Hawking Zhang
2020-01-08 16:26   ` Alex Deucher
2020-01-09  0:59   ` Chen, Guchun
2020-01-08 16:17 ` [PATCH 2/4] drm/amdgpu: support error reporting for sdma ip block Hawking Zhang
2020-01-08 16:25   ` Alex Deucher
2020-01-08 16:17 ` [PATCH 3/4] drm/amdgpu: add ras_late_init and ras_fini for sdma v4 Hawking Zhang
2020-01-08 16:26   ` Alex Deucher
2020-01-08 16:17 ` [PATCH 4/4] drm/amdgpu: read sdma edc counter to clear the counters Hawking Zhang
2020-01-08 16:27   ` Alex Deucher
2020-01-09  1:03 ` [PATCH 0/4] add SDMA ras error reporting support Chen, Guchun
2020-01-09  3:20   ` Zhang, Hawking

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.