All of lore.kernel.org
 help / color / mirror / Atom feed
* Fix CZ EDC workarounds and provide a kernel parameter to control use of EDC.
@ 2017-06-06 20:32 David Panariti
       [not found] ` <1496781172-23297-1-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: David Panariti @ 2017-06-06 20:32 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

I've made the workarounds function a bit verbose because EDC enabled CZs are
often used in an embedded environment and providing as much information as
possible can be very helpful in debugging.



_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of re-redundant prototype.
       [not found] ` <1496781172-23297-1-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-06 20:32   ` David Panariti
       [not found]     ` <1496781172-23297-2-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
  2017-06-06 20:32   ` [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and Correction) workarounds David Panariti
  2017-06-06 20:32   ` [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC David Panariti
  2 siblings, 1 reply; 14+ messages in thread
From: David Panariti @ 2017-06-06 20:32 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: David Panariti

Will be needed for the rest of the EDC workarounds patch.

Signed-off-by: David Panariti <David.Panariti@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46 +++++++++++++++++------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 69d9bbd..07172f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -662,6 +662,29 @@ static void gfx_v8_0_ring_emit_de_meta(struct amdgpu_ring *ring);
 static int gfx_v8_0_compute_mqd_sw_init(struct amdgpu_device *adev);
 static void gfx_v8_0_compute_mqd_sw_fini(struct amdgpu_device *adev);
 
+static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
+				  u32 se_num, u32 sh_num, u32 instance)
+{
+	u32 data;
+
+	if (instance == 0xffffffff)
+		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
+	else
+		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
+
+	if (se_num == 0xffffffff)
+		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
+	else
+		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
+
+	if (sh_num == 0xffffffff)
+		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
+	else
+		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
+
+	WREG32(mmGRBM_GFX_INDEX, data);
+}
+
 static void gfx_v8_0_init_golden_registers(struct amdgpu_device *adev)
 {
 	switch (adev->asic_type) {
@@ -3679,29 +3702,6 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev)
 	}
 }
 
-static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
-				  u32 se_num, u32 sh_num, u32 instance)
-{
-	u32 data;
-
-	if (instance == 0xffffffff)
-		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
-	else
-		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
-
-	if (se_num == 0xffffffff)
-		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
-	else
-		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
-
-	if (sh_num == 0xffffffff)
-		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
-	else
-		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
-
-	WREG32(mmGRBM_GFX_INDEX, data);
-}
-
 static u32 gfx_v8_0_create_bitmask(u32 bit_width)
 {
 	return (u32)((1ULL << bit_width) - 1);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and Correction) workarounds.
       [not found] ` <1496781172-23297-1-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
  2017-06-06 20:32   ` [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of re-redundant prototype David Panariti
@ 2017-06-06 20:32   ` David Panariti
       [not found]     ` <1496781172-23297-3-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
  2017-06-06 20:32   ` [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC David Panariti
  2 siblings, 1 reply; 14+ messages in thread
From: David Panariti @ 2017-06-06 20:32 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: David Panariti

The workarounds are unconditionally performed on CZs with EDC enabled.
EDC detects uncorrected ECC errors and uses data poisoning to prevent
corrupted compute results from being used (read).
EDC enabled CZs are often used in embedded environments.

Signed-off-by: David Panariti <David.Panariti@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  13 ++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 136 +++++++++++++++++++++++-----------
 2 files changed, 107 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0d64bbba..a6f51eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -2060,5 +2060,18 @@ int amdgpu_dm_display_resume(struct amdgpu_device *adev );
 static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return 0; }
 #endif
 
+/* Carrizo EDC support */
+struct reg32_counter_name_map {
+	uint32_t rnmap_addr;	     /* Counter register address */
+	const char *rnmap_name;	     /* Name of the counter */
+	size_t rnmap_num_instances;  /* Number of block instances */
+};
+
+#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {	\
+		.rnmap_addr = mm##reg,				\
+		.rnmap_name = #reg,				\
+		.rnmap_num_instances = num_instances		\
+}
+
 #include "amdgpu_object.h"
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 07172f8..46e766e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1727,35 +1727,80 @@ static const u32 sgpr2_init_regs[] =
 	mmCOMPUTE_USER_DATA_9, 0xedcedc09,
 };
 
-static const u32 sec_ded_counter_registers[] =
-{
-	mmCPC_EDC_ATC_CNT,
-	mmCPC_EDC_SCRATCH_CNT,
-	mmCPC_EDC_UCODE_CNT,
-	mmCPF_EDC_ATC_CNT,
-	mmCPF_EDC_ROQ_CNT,
-	mmCPF_EDC_TAG_CNT,
-	mmCPG_EDC_ATC_CNT,
-	mmCPG_EDC_DMA_CNT,
-	mmCPG_EDC_TAG_CNT,
-	mmDC_EDC_CSINVOC_CNT,
-	mmDC_EDC_RESTORE_CNT,
-	mmDC_EDC_STATE_CNT,
-	mmGDS_EDC_CNT,
-	mmGDS_EDC_GRBM_CNT,
-	mmGDS_EDC_OA_DED,
-	mmSPI_EDC_CNT,
-	mmSQC_ATC_EDC_GATCL1_CNT,
-	mmSQC_EDC_CNT,
-	mmSQ_EDC_DED_CNT,
-	mmSQ_EDC_INFO,
-	mmSQ_EDC_SEC_CNT,
-	mmTCC_EDC_CNT,
-	mmTCP_ATC_EDC_GATCL1_CNT,
-	mmTCP_EDC_CNT,
-	mmTD_EDC_CNT
+/* See GRBM_GFX_INDEX, et. al. registers. */
+static const struct reg32_counter_name_map sec_ded_counter_registers[] = {
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
 };
 
+static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
+{
+	int ci, se, sh, i;
+	uint32_t count;
+	int r = 0;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
+		const struct reg32_counter_name_map *cp =
+			sec_ded_counter_registers + ci;
+		const char *name = cp->rnmap_name;
+
+		for (se = 0; se < adev->gfx.config.max_shader_engines; ++se) {
+			for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
+				for (i = 0; i < cp->rnmap_num_instances; ++i) {
+					gfx_v8_0_select_se_sh(adev, se, sh, i);
+					count = RREG32(cp->rnmap_addr);
+					count = RREG32(cp->rnmap_addr);
+					if (count != 0) {
+						/*
+						 * EDC workaround failed.
+						 * If people are interested
+						 * in EDC at all, they will
+						 * want to know which
+						 * counters had problems.
+						 */
+						DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
+							 name, count);
+						r = -EINVAL;
+						goto ret;
+					}
+				}
+			}
+		}
+	}
+
+ret:
+	gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return r;
+}
+
 static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 {
 	struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
@@ -1763,18 +1808,36 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 	struct dma_fence *f = NULL;
 	int r, i;
 	u32 tmp;
+	u32 dis_bit;
 	unsigned total_size, vgpr_offset, sgpr_offset;
 	u64 gpu_addr;
 
-	/* only supported on CZ */
-	if (adev->asic_type != CHIP_CARRIZO)
+	if (adev->asic_type != CHIP_CARRIZO) {
+		/* EDC is only supported on CZ */
+		return 0;
+	}
+
+	DRM_INFO("Detected Carrizo.\n");
+
+	tmp = RREG32(mmCC_GC_EDC_CONFIG);
+	dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
+	if (dis_bit) {
+		/* On Carrizo, EDC may be permanently disabled by a fuse. */
+		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+			tmp);
 		return 0;
+	}
 
 	/* bail if the compute ring is not ready */
 	if (!ring->ready)
 		return 0;
 
-	tmp = RREG32(mmGB_EDC_MODE);
+	DRM_INFO("Applying EDC workarounds.\n");
+
+	/*
+	 * Interested parties can enable EDC using debugfs register reads and
+	 * writes.
+	 */
 	WREG32(mmGB_EDC_MODE, 0);
 
 	total_size =
@@ -1899,18 +1962,7 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 		goto fail;
 	}
 
-	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
-	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
-	WREG32(mmGB_EDC_MODE, tmp);
-
-	tmp = RREG32(mmCC_GC_EDC_CONFIG);
-	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
-	WREG32(mmCC_GC_EDC_CONFIG, tmp);
-
-
-	/* read back registers to clear the counters */
-	for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
-		RREG32(sec_ded_counter_registers[i]);
+	gfx_v8_0_edc_clear_counters(adev);
 
 fail:
 	amdgpu_ib_free(adev, &ib, NULL);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found] ` <1496781172-23297-1-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
  2017-06-06 20:32   ` [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of re-redundant prototype David Panariti
  2017-06-06 20:32   ` [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and Correction) workarounds David Panariti
@ 2017-06-06 20:32   ` David Panariti
       [not found]     ` <1496781172-23297-4-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
  2 siblings, 1 reply; 14+ messages in thread
From: David Panariti @ 2017-06-06 20:32 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: David Panariti

Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be enabled
or disabled.  By default, all features are disabled.

EDC is Error Detection and Correction.  It can detect ECC errors and do 0 or
more of: count SEC (single error corrected) and DED (double error detected,
i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
Currently, only counting errors is supported.

Fixed a whitespace error.

Signed-off-by: David Panariti <David.Panariti@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28 ++++++++++++++++++++++++++--
 drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a6f51eb..3e930ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;
 extern int amdgpu_param_buf_per_se;
 extern int amdgpu_job_hang_limit;
 extern int amdgpu_lbpw;
+extern unsigned amdgpu_ecc_flags;
 
 #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /* 3GB by default */
 #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c4825ff..972660d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;
 int amdgpu_param_buf_per_se = 0;
 int amdgpu_job_hang_limit = 0;
 int amdgpu_lbpw = -1;
+unsigned amdgpu_ecc_flags = 0;
 
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
 module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -795,6 +796,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
 	.driver.pm = &amdgpu_pm_ops,
 };
 
+MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable ECC/EDC (default))");
+module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
+
 
 
 static int __init amdgpu_init(void)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 46e766e..3b5685c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1824,7 +1824,17 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 	if (dis_bit) {
 		/* On Carrizo, EDC may be permanently disabled by a fuse. */
 		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
-			tmp);
+			 tmp);
+		return 0;
+	}
+
+	/*
+	 * Check if EDC has been requested by a kernel parameter.
+	 * For Carrizo, EDC is the best/safest mode WRT error handling.
+	 */
+	if (!(amdgpu_ecc_flags
+	      & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
+		DRM_INFO("EDC support has not been requested.\n");
 		return 0;
 	}
 
@@ -1962,6 +1972,20 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 		goto fail;
 	}
 
+	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt */
+	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
+	/* Do not propagate the errors to the next block. */
+	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
+	WREG32(mmGB_EDC_MODE, tmp);
+
+	tmp = RREG32(mmCC_GC_EDC_CONFIG);
+
+	/*
+	 * Clear EDC_DISABLE bit so the counters are available.
+	 */
+	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
+	WREG32(mmCC_GC_EDC_CONFIG, tmp);
+
 	gfx_v8_0_edc_clear_counters(adev);
 
 fail:
@@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void *handle)
 		gfx_v8_0_cp_compute_enable(adev, false);
 	}
 
-       return 0;
+	return 0;
 }
 
 static int gfx_v8_0_soft_reset(void *handle)
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 0f58e95..1cf30aa 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
 #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
 #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
 
+/*
+ * ECC flags
+ * Allows the user to choose what kind of error detection/correction is used.
+ * Currently, EDC is supported on Carrizo.
+ *
+ * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the driver
+ * set what it thinks is best/safest mode.  This may not be the same as the
+ * default, depending on the GPU and the application.
+ * Using a single bit makes it easy to request the best support without
+ * needing to know all currently supported modes.
+ */
+#define AMD_ECC_SUPPORT_BEST			(1 << 0)
+#define AMD_ECC_SUPPORT_EDC			(1 << 1)
+
 enum amd_pm_state_type {
 	/* not used for dpm */
 	POWER_STATE_TYPE_DEFAULT,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and Correction) workarounds.
       [not found]     ` <1496781172-23297-3-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-06 21:16       ` Deucher, Alexander
  0 siblings, 0 replies; 14+ messages in thread
From: Deucher, Alexander @ 2017-06-06 21:16 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Panariti, David

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of David Panariti
> Sent: Tuesday, June 06, 2017 4:33 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Panariti, David
> Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection
> and Correction) workarounds.
> 
> The workarounds are unconditionally performed on CZs with EDC enabled.
> EDC detects uncorrected ECC errors and uses data poisoning to prevent
> corrupted compute results from being used (read).
> EDC enabled CZs are often used in embedded environments.
> 
> Signed-off-by: David Panariti <David.Panariti@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  13 ++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 136
> +++++++++++++++++++++++-----------
>  2 files changed, 107 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0d64bbba..a6f51eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -2060,5 +2060,18 @@ int amdgpu_dm_display_resume(struct
> amdgpu_device *adev );
>  static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev)
> { return 0; }
>  #endif
> 
> +/* Carrizo EDC support */
> +struct reg32_counter_name_map {
> +	uint32_t rnmap_addr;	     /* Counter register address */
> +	const char *rnmap_name;	     /* Name of the counter */
> +	size_t rnmap_num_instances;  /* Number of block instances */
> +};
> +
> +#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {
> 	\
> +		.rnmap_addr = mm##reg,				\
> +		.rnmap_name = #reg,				\
> +		.rnmap_num_instances = num_instances		\
> +}
> +
>  #include "amdgpu_object.h"
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 07172f8..46e766e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1727,35 +1727,80 @@ static const u32 sgpr2_init_regs[] =
>  	mmCOMPUTE_USER_DATA_9, 0xedcedc09,
>  };
> 
> -static const u32 sec_ded_counter_registers[] =
> -{
> -	mmCPC_EDC_ATC_CNT,
> -	mmCPC_EDC_SCRATCH_CNT,
> -	mmCPC_EDC_UCODE_CNT,
> -	mmCPF_EDC_ATC_CNT,
> -	mmCPF_EDC_ROQ_CNT,
> -	mmCPF_EDC_TAG_CNT,
> -	mmCPG_EDC_ATC_CNT,
> -	mmCPG_EDC_DMA_CNT,
> -	mmCPG_EDC_TAG_CNT,
> -	mmDC_EDC_CSINVOC_CNT,
> -	mmDC_EDC_RESTORE_CNT,
> -	mmDC_EDC_STATE_CNT,
> -	mmGDS_EDC_CNT,
> -	mmGDS_EDC_GRBM_CNT,
> -	mmGDS_EDC_OA_DED,
> -	mmSPI_EDC_CNT,
> -	mmSQC_ATC_EDC_GATCL1_CNT,
> -	mmSQC_EDC_CNT,
> -	mmSQ_EDC_DED_CNT,
> -	mmSQ_EDC_INFO,
> -	mmSQ_EDC_SEC_CNT,
> -	mmTCC_EDC_CNT,
> -	mmTCP_ATC_EDC_GATCL1_CNT,
> -	mmTCP_EDC_CNT,
> -	mmTD_EDC_CNT
> +/* See GRBM_GFX_INDEX, et. al. registers. */
> +static const struct reg32_counter_name_map sec_ded_counter_registers[]
> = {
> +	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
> +
> 	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CN
> T, 2),
> +
> +	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
> +
> 	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CN
> T, 8),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
> +
> +	DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
> +
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT,
> 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
>  };
> 
> +static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
> +{
> +	int ci, se, sh, i;
> +	uint32_t count;
> +	int r = 0;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
> +		const struct reg32_counter_name_map *cp =
> +			sec_ded_counter_registers + ci;
> +		const char *name = cp->rnmap_name;
> +
> +		for (se = 0; se < adev->gfx.config.max_shader_engines;
> ++se) {
> +			for (sh = 0; sh < adev->gfx.config.max_sh_per_se;
> ++sh) {
> +				for (i = 0; i < cp->rnmap_num_instances; ++i)
> {
> +					gfx_v8_0_select_se_sh(adev, se, sh,
> i);
> +					count = RREG32(cp->rnmap_addr);
> +					count = RREG32(cp->rnmap_addr);
> +					if (count != 0) {
> +						/*
> +						 * EDC workaround failed.
> +						 * If people are interested
> +						 * in EDC at all, they will
> +						 * want to know which
> +						 * counters had problems.
> +						 */
> +						DRM_WARN("EDC counter
> %s is 0x%08x, but should be 0\n.",
> +							 name, count);
> +						r = -EINVAL;
> +						goto ret;
> +					}
> +				}
> +			}
> +		}
> +	}
> +
> +ret:
> +	gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return r;
> +}
> +
>  static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device
> *adev)
>  {
>  	struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
> @@ -1763,18 +1808,36 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  	struct dma_fence *f = NULL;
>  	int r, i;
>  	u32 tmp;
> +	u32 dis_bit;
>  	unsigned total_size, vgpr_offset, sgpr_offset;
>  	u64 gpu_addr;
> 
> -	/* only supported on CZ */
> -	if (adev->asic_type != CHIP_CARRIZO)
> +	if (adev->asic_type != CHIP_CARRIZO) {
> +		/* EDC is only supported on CZ */
> +		return 0;
> +	}

No need to add the additional parens.

> +
> +	DRM_INFO("Detected Carrizo.\n");
> +

Drop this debugging print.

> +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> +	dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
> +	if (dis_bit) {

You can just check the bit directly, e.g.,
if (REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC)) {

> +		/* On Carrizo, EDC may be permanently disabled by a fuse. */
> +		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> 0x%08x.\n",
> +			tmp);

I would reverse the logic here and only print a message if EDC is enabled.

>  		return 0;
> +	}
> 
>  	/* bail if the compute ring is not ready */
>  	if (!ring->ready)
>  		return 0;
> 
> -	tmp = RREG32(mmGB_EDC_MODE);
> +	DRM_INFO("Applying EDC workarounds.\n");
> +

Drop the debugging print.

> +	/*
> +	 * Interested parties can enable EDC using debugfs register reads and
> +	 * writes.
> +	 */
>  	WREG32(mmGB_EDC_MODE, 0);
> 
>  	total_size =
> @@ -1899,18 +1962,7 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  		goto fail;
>  	}
> 
> -	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
> -	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
> -	WREG32(mmGB_EDC_MODE, tmp);
> -
> -	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> -	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
> -	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> -
> -
> -	/* read back registers to clear the counters */
> -	for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
> -		RREG32(sec_ded_counter_registers[i]);
> +	gfx_v8_0_edc_clear_counters(adev);
> 
>  fail:
>  	amdgpu_ib_free(adev, &ib, NULL);
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of re-redundant prototype.
       [not found]     ` <1496781172-23297-2-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-06 21:16       ` Deucher, Alexander
  0 siblings, 0 replies; 14+ messages in thread
From: Deucher, Alexander @ 2017-06-06 21:16 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Panariti, David

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of David Panariti
> Sent: Tuesday, June 06, 2017 4:33 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Panariti, David
> Subject: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu
> of re-redundant prototype.
> 
> Will be needed for the rest of the EDC workarounds patch.
> 
> Signed-off-by: David Panariti <David.Panariti@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46 +++++++++++++++++-------
> -----------
>  1 file changed, 23 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 69d9bbd..07172f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -662,6 +662,29 @@ static void gfx_v8_0_ring_emit_de_meta(struct
> amdgpu_ring *ring);
>  static int gfx_v8_0_compute_mqd_sw_init(struct amdgpu_device *adev);
>  static void gfx_v8_0_compute_mqd_sw_fini(struct amdgpu_device *adev);
> 
> +static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
> +				  u32 se_num, u32 sh_num, u32 instance)
> +{
> +	u32 data;
> +
> +	if (instance == 0xffffffff)
> +		data = REG_SET_FIELD(0, GRBM_GFX_INDEX,
> INSTANCE_BROADCAST_WRITES, 1);
> +	else
> +		data = REG_SET_FIELD(0, GRBM_GFX_INDEX,
> INSTANCE_INDEX, instance);
> +
> +	if (se_num == 0xffffffff)
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> SE_BROADCAST_WRITES, 1);
> +	else
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX,
> se_num);
> +
> +	if (sh_num == 0xffffffff)
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> SH_BROADCAST_WRITES, 1);
> +	else
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX,
> sh_num);
> +
> +	WREG32(mmGRBM_GFX_INDEX, data);
> +}
> +
>  static void gfx_v8_0_init_golden_registers(struct amdgpu_device *adev)
>  {
>  	switch (adev->asic_type) {
> @@ -3679,29 +3702,6 @@ static void gfx_v8_0_tiling_mode_table_init(struct
> amdgpu_device *adev)
>  	}
>  }
> 
> -static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
> -				  u32 se_num, u32 sh_num, u32 instance)
> -{
> -	u32 data;
> -
> -	if (instance == 0xffffffff)
> -		data = REG_SET_FIELD(0, GRBM_GFX_INDEX,
> INSTANCE_BROADCAST_WRITES, 1);
> -	else
> -		data = REG_SET_FIELD(0, GRBM_GFX_INDEX,
> INSTANCE_INDEX, instance);
> -
> -	if (se_num == 0xffffffff)
> -		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> SE_BROADCAST_WRITES, 1);
> -	else
> -		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX,
> se_num);
> -
> -	if (sh_num == 0xffffffff)
> -		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> SH_BROADCAST_WRITES, 1);
> -	else
> -		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX,
> sh_num);
> -
> -	WREG32(mmGRBM_GFX_INDEX, data);
> -}
> -
>  static u32 gfx_v8_0_create_bitmask(u32 bit_width)
>  {
>  	return (u32)((1ULL << bit_width) - 1);
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found]     ` <1496781172-23297-4-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-06 21:22       ` Deucher, Alexander
       [not found]         ` <CY4PR12MB165315F98261CDFE2FBF9E3EF7CB0-rpdhrqHFk06apTa93KjAaQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Deucher, Alexander @ 2017-06-06 21:22 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Panariti, David

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of David Panariti
> Sent: Tuesday, June 06, 2017 4:33 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Panariti, David
> Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of
> ECC/EDC.
> 
> Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be
> enabled
> or disabled.  By default, all features are disabled.
> 
> EDC is Error Detection and Correction.  It can detect ECC errors and do 0 or
> more of: count SEC (single error corrected) and DED (double error detected,
> i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
> Currently, only counting errors is supported.
> 
> Fixed a whitespace error.
> 
> Signed-off-by: David Panariti <David.Panariti@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
> ++++++++++++++++++++++++++--
>  drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
>  4 files changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index a6f51eb..3e930ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;
>  extern int amdgpu_param_buf_per_se;
>  extern int amdgpu_job_hang_limit;
>  extern int amdgpu_lbpw;
> +extern unsigned amdgpu_ecc_flags;
> 
>  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /* 3GB by
> default */
>  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index c4825ff..972660d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;
>  int amdgpu_param_buf_per_se = 0;
>  int amdgpu_job_hang_limit = 0;
>  int amdgpu_lbpw = -1;
> +unsigned amdgpu_ecc_flags = 0;

I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by default.  That can be our asic specific default setting.  In the case of CZ, that will be disabled until we decide to enable EDC by default.

> 
>  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
> megabytes");
>  module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
> @@ -795,6 +796,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>  	.driver.pm = &amdgpu_pm_ops,
>  };
> 
> +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
> ECC/EDC (default))");
> +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
> +
> 
> 
>  static int __init amdgpu_init(void)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 46e766e..3b5685c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1824,7 +1824,17 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  	if (dis_bit) {
>  		/* On Carrizo, EDC may be permanently disabled by a fuse. */
>  		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> 0x%08x.\n",
> -			tmp);
> +			 tmp);
> +		return 0;
> +	}
> +
> +	/*
> +	 * Check if EDC has been requested by a kernel parameter.
> +	 * For Carrizo, EDC is the best/safest mode WRT error handling.
> +	 */
> +	if (!(amdgpu_ecc_flags
> +	      & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
> +		DRM_INFO("EDC support has not been requested.\n");

Can drop this debugging statement or make it DRM_DEBUG.

>  		return 0;
>  	}
> 
> @@ -1962,6 +1972,20 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  		goto fail;
>  	}
> 
> +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt
> */
> +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
> +	/* Do not propagate the errors to the next block. */
> +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
> +	WREG32(mmGB_EDC_MODE, tmp);
> +
> +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> +
> +	/*
> +	 * Clear EDC_DISABLE bit so the counters are available.
> +	 */
> +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
> +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> +
>  	gfx_v8_0_edc_clear_counters(adev);
> 
>  fail:
> @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void *handle)
>  		gfx_v8_0_cp_compute_enable(adev, false);
>  	}
> 
> -       return 0;
> +	return 0;
>  }
> 
>  static int gfx_v8_0_soft_reset(void *handle)
> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h
> b/drivers/gpu/drm/amd/include/amd_shared.h
> index 0f58e95..1cf30aa 100644
> --- a/drivers/gpu/drm/amd/include/amd_shared.h
> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
>  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
>  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
> 
> +/*
> + * ECC flags
> + * Allows the user to choose what kind of error detection/correction is
> used.
> + * Currently, EDC is supported on Carrizo.
> + *
> + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the
> driver
> + * set what it thinks is best/safest mode.  This may not be the same as the
> + * default, depending on the GPU and the application.
> + * Using a single bit makes it easy to request the best support without
> + * needing to know all currently supported modes.
> + */
> +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
> +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
> +
>  enum amd_pm_state_type {
>  	/* not used for dpm */
>  	POWER_STATE_TYPE_DEFAULT,
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found]         ` <CY4PR12MB165315F98261CDFE2FBF9E3EF7CB0-rpdhrqHFk06apTa93KjAaQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-06-06 21:49           ` Panariti, David
       [not found]             ` <BN6PR12MB1889419C24B4D3E20C688E2195CB0-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Panariti, David @ 2017-06-06 21:49 UTC (permalink / raw)
  To: Deucher, Alexander, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Rather than inlining this in a number of places, Re verbosity:
I've worked in embedded environments and when dealing with intermittent problems it's nice to have all of the information ASAP rather than waiting for a problem to reoccur, especially if it's very intermittent.
I would've preferred more.
Since it only shows up happens on CZ, it adds little to the output.
I like to show the reasons why EDC didn't happen, hence the backwards looking messages.
In this particular case,  without the "... not requested..." we can't tell if it was the flags or the ring being unready that made us bail earlier.

> -----Original Message-----
> From: Deucher, Alexander
> Sent: Tuesday, June 06, 2017 5:22 PM
> To: Panariti, David <David.Panariti@amd.com>; amd-
> gfx@lists.freedesktop.org
> Cc: Panariti, David <David.Panariti@amd.com>
> Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
> of ECC/EDC.
> 
> > -----Original Message-----
> > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> > Of David Panariti
> > Sent: Tuesday, June 06, 2017 4:33 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Panariti, David
> > Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
> > of ECC/EDC.
> >
> > Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be
> > enabled or disabled.  By default, all features are disabled.
> >
> > EDC is Error Detection and Correction.  It can detect ECC errors and
> > do 0 or more of: count SEC (single error corrected) and DED (double
> > error detected, i.e. uncorrected ECC error), halt the affected block,
> interrupt the CPU.
> > Currently, only counting errors is supported.
> >
> > Fixed a whitespace error.
> >
> > Signed-off-by: David Panariti <David.Panariti@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
> >  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
> > ++++++++++++++++++++++++++--
> >  drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
> >  4 files changed, 45 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index a6f51eb..3e930ee 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;  extern int
> > amdgpu_param_buf_per_se;  extern int amdgpu_job_hang_limit;  extern
> > int amdgpu_lbpw;
> > +extern unsigned amdgpu_ecc_flags;
> >
> >  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /* 3GB by
> > default */
> >  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > index c4825ff..972660d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;  int
> > amdgpu_param_buf_per_se = 0;  int amdgpu_job_hang_limit = 0;  int
> > amdgpu_lbpw = -1;
> > +unsigned amdgpu_ecc_flags = 0;
> 
> I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
> default.  That can be our asic specific default setting.  In the case of CZ, that
> will be disabled until we decide to enable EDC by default.
[davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
I used ECC as the generic term for ECC and EDC, since ECC seems more basic (EDC is built on top of ECC).
If I understand you, we can't do what you want with the current setup.

> 
> >
> >  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
> > megabytes");  module_param_named(vramlimit, amdgpu_vram_limit, int,
> > 0600); @@ -795,6 +796,9 @@ static struct pci_driver
> > amdgpu_kms_pci_driver = {
> >  	.driver.pm = &amdgpu_pm_ops,
> >  };
> >
> > +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
> > ECC/EDC (default))");
> > +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
> > +
> >
> >
> >  static int __init amdgpu_init(void)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > index 46e766e..3b5685c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > @@ -1824,7 +1824,17 @@ static int
> > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> >  	if (dis_bit) {
> >  		/* On Carrizo, EDC may be permanently disabled by a fuse. */
> >  		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> > 0x%08x.\n",
> > -			tmp);
> > +			 tmp);
> > +		return 0;
> > +	}
> > +
> > +	/*
> > +	 * Check if EDC has been requested by a kernel parameter.
> > +	 * For Carrizo, EDC is the best/safest mode WRT error handling.
> > +	 */
> > +	if (!(amdgpu_ecc_flags
> > +	      & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
> > +		DRM_INFO("EDC support has not been requested.\n");
> 
> Can drop this debugging statement or make it DRM_DEBUG.
[davep] See top post.
> 
> >  		return 0;
> >  	}
> >
> > @@ -1962,6 +1972,20 @@ static int
> > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> >  		goto fail;
> >  	}
> >
> > +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt
> > */
> > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
> > +	/* Do not propagate the errors to the next block. */
> > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
> > +	WREG32(mmGB_EDC_MODE, tmp);
> > +
> > +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> > +
> > +	/*
> > +	 * Clear EDC_DISABLE bit so the counters are available.
> > +	 */
> > +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
> > +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> > +
> >  	gfx_v8_0_edc_clear_counters(adev);
> >
> >  fail:
> > @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void *handle)
> >  		gfx_v8_0_cp_compute_enable(adev, false);
> >  	}
> >
> > -       return 0;
> > +	return 0;
> >  }
> >
> >  static int gfx_v8_0_soft_reset(void *handle) diff --git
> > a/drivers/gpu/drm/amd/include/amd_shared.h
> > b/drivers/gpu/drm/amd/include/amd_shared.h
> > index 0f58e95..1cf30aa 100644
> > --- a/drivers/gpu/drm/amd/include/amd_shared.h
> > +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> > @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
> >  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
> >  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
> >
> > +/*
> > + * ECC flags
> > + * Allows the user to choose what kind of error detection/correction
> > +is
> > used.
> > + * Currently, EDC is supported on Carrizo.
> > + *
> > + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the
> > driver
> > + * set what it thinks is best/safest mode.  This may not be the same
> > +as the
> > + * default, depending on the GPU and the application.
> > + * Using a single bit makes it easy to request the best support
> > +without
> > + * needing to know all currently supported modes.
> > + */
> > +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
> > +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
> > +
> >  enum amd_pm_state_type {
> >  	/* not used for dpm */
> >  	POWER_STATE_TYPE_DEFAULT,
> > --
> > 2.7.4
> >
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found]             ` <BN6PR12MB1889419C24B4D3E20C688E2195CB0-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-06-06 22:16               ` Deucher, Alexander
       [not found]                 ` <CY4PR12MB16532F2483501CF138CCACCDF7CB0-rpdhrqHFk06apTa93KjAaQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Deucher, Alexander @ 2017-06-06 22:16 UTC (permalink / raw)
  To: Panariti, David, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> -----Original Message-----
> From: Panariti, David
> Sent: Tuesday, June 06, 2017 5:50 PM
> To: Deucher, Alexander; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
> of ECC/EDC.
> 
> Rather than inlining this in a number of places, Re verbosity:
> I've worked in embedded environments and when dealing with intermittent
> problems it's nice to have all of the information ASAP rather than waiting for
> a problem to reoccur, especially if it's very intermittent.
> I would've preferred more.
> Since it only shows up happens on CZ, it adds little to the output.
> I like to show the reasons why EDC didn't happen, hence the backwards
> looking messages.
> In this particular case,  without the "... not requested..." we can't tell if it was
> the flags or the ring being unready that made us bail earlier.

I'm fine with a message about EDC either being enabled or disabled, but a bunch of random debug statements along the way are too much.  They tend to just cause confusion and clutter up the logs.

> 
> > -----Original Message-----
> > From: Deucher, Alexander
> > Sent: Tuesday, June 06, 2017 5:22 PM
> > To: Panariti, David <David.Panariti@amd.com>; amd-
> > gfx@lists.freedesktop.org
> > Cc: Panariti, David <David.Panariti@amd.com>
> > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> use
> > of ECC/EDC.
> >
> > > -----Original Message-----
> > > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On
> Behalf
> > > Of David Panariti
> > > Sent: Tuesday, June 06, 2017 4:33 PM
> > > To: amd-gfx@lists.freedesktop.org
> > > Cc: Panariti, David
> > > Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
> > > of ECC/EDC.
> > >
> > > Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be
> > > enabled or disabled.  By default, all features are disabled.
> > >
> > > EDC is Error Detection and Correction.  It can detect ECC errors and
> > > do 0 or more of: count SEC (single error corrected) and DED (double
> > > error detected, i.e. uncorrected ECC error), halt the affected block,
> > interrupt the CPU.
> > > Currently, only counting errors is supported.
> > >
> > > Fixed a whitespace error.
> > >
> > > Signed-off-by: David Panariti <David.Panariti@amd.com>
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
> > >  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
> > > ++++++++++++++++++++++++++--
> > >  drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
> > >  4 files changed, 45 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > index a6f51eb..3e930ee 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;  extern
> int
> > > amdgpu_param_buf_per_se;  extern int amdgpu_job_hang_limit;
> extern
> > > int amdgpu_lbpw;
> > > +extern unsigned amdgpu_ecc_flags;
> > >
> > >  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /*
> 3GB by
> > > default */
> > >  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > index c4825ff..972660d 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;  int
> > > amdgpu_param_buf_per_se = 0;  int amdgpu_job_hang_limit = 0;  int
> > > amdgpu_lbpw = -1;
> > > +unsigned amdgpu_ecc_flags = 0;
> >
> > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
> > default.  That can be our asic specific default setting.  In the case of CZ, that
> > will be disabled until we decide to enable EDC by default.
> [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
> I used ECC as the generic term for ECC and EDC, since ECC seems more basic
> (EDC is built on top of ECC).
> If I understand you, we can't do what you want with the current setup.

I'm saying we make ECC_BEST the default config (feel free to re-name if ECC_DEFAULT).  Each asic can have a different default depending on what features are ready.  So for CZ, we'd make ECC_BEST equivalent to disabling ECC for now.  If a user wants to force it on, they can set ECC_EDC.  Once EDC is stable on CZ, we can make ECC_BEST be equivalent to ECC_EDC.  The way the default (ECC_BEST) always maps to the best available combination in that version of the driver.

E.g., in the current gfx8 code:

if ((amdgpu_ecc_flags & ECC_EDC) && edc_fuse_enabled) {
    // enable EDC
    goto enable_edc;
} else {
    // disable EDC
    return 0;
}

Then if we want to enable EDC by default, we'd change the code like so:

if (((amdgpu_ecc_flags == ECC_EDC) || (amdgpu_ecc_flags & ECC_EDC)) && edc_fuse_enabled) {
    // enable EDC
    goto enable_edc;
} else {
    // disable EDC
    return 0;
}

That way we can have different ECC features enabled by default for each asic family without needing to specify command line options.

Alex

> 
> >
> > >
> > >  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
> > > megabytes");  module_param_named(vramlimit, amdgpu_vram_limit,
> int,
> > > 0600); @@ -795,6 +796,9 @@ static struct pci_driver
> > > amdgpu_kms_pci_driver = {
> > >  	.driver.pm = &amdgpu_pm_ops,
> > >  };
> > >
> > > +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
> > > ECC/EDC (default))");
> > > +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
> > > +
> > >
> > >
> > >  static int __init amdgpu_init(void)
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > index 46e766e..3b5685c 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > @@ -1824,7 +1824,17 @@ static int
> > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> > >  	if (dis_bit) {
> > >  		/* On Carrizo, EDC may be permanently disabled by a fuse. */
> > >  		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> > > 0x%08x.\n",
> > > -			tmp);
> > > +			 tmp);
> > > +		return 0;
> > > +	}
> > > +
> > > +	/*
> > > +	 * Check if EDC has been requested by a kernel parameter.
> > > +	 * For Carrizo, EDC is the best/safest mode WRT error handling.
> > > +	 */
> > > +	if (!(amdgpu_ecc_flags
> > > +	      & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
> > > +		DRM_INFO("EDC support has not been requested.\n");
> >
> > Can drop this debugging statement or make it DRM_DEBUG.
> [davep] See top post.
> >
> > >  		return 0;
> > >  	}
> > >
> > > @@ -1962,6 +1972,20 @@ static int
> > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> > >  		goto fail;
> > >  	}
> > >
> > > +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt
> > > */
> > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
> > > +	/* Do not propagate the errors to the next block. */
> > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
> > > +	WREG32(mmGB_EDC_MODE, tmp);
> > > +
> > > +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> > > +
> > > +	/*
> > > +	 * Clear EDC_DISABLE bit so the counters are available.
> > > +	 */
> > > +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
> > > +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> > > +
> > >  	gfx_v8_0_edc_clear_counters(adev);
> > >
> > >  fail:
> > > @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void
> *handle)
> > >  		gfx_v8_0_cp_compute_enable(adev, false);
> > >  	}
> > >
> > > -       return 0;
> > > +	return 0;
> > >  }
> > >
> > >  static int gfx_v8_0_soft_reset(void *handle) diff --git
> > > a/drivers/gpu/drm/amd/include/amd_shared.h
> > > b/drivers/gpu/drm/amd/include/amd_shared.h
> > > index 0f58e95..1cf30aa 100644
> > > --- a/drivers/gpu/drm/amd/include/amd_shared.h
> > > +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> > > @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
> > >  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
> > >  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
> > >
> > > +/*
> > > + * ECC flags
> > > + * Allows the user to choose what kind of error detection/correction
> > > +is
> > > used.
> > > + * Currently, EDC is supported on Carrizo.
> > > + *
> > > + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the
> > > driver
> > > + * set what it thinks is best/safest mode.  This may not be the same
> > > +as the
> > > + * default, depending on the GPU and the application.
> > > + * Using a single bit makes it easy to request the best support
> > > +without
> > > + * needing to know all currently supported modes.
> > > + */
> > > +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
> > > +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
> > > +
> > >  enum amd_pm_state_type {
> > >  	/* not used for dpm */
> > >  	POWER_STATE_TYPE_DEFAULT,
> > > --
> > > 2.7.4
> > >
> > > _______________________________________________
> > > amd-gfx mailing list
> > > amd-gfx@lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found]                 ` <CY4PR12MB16532F2483501CF138CCACCDF7CB0-rpdhrqHFk06apTa93KjAaQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-06-26 16:06                   ` Panariti, David
       [not found]                     ` <BN6PR12MB117285B9134A4BAD2E7731ED95DF0-/b2+HYfkarTft/eMqzLDqAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Panariti, David @ 2017-06-26 16:06 UTC (permalink / raw)
  To: Deucher, Alexander, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

>> > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by 
>> > default.  That can be our asic specific default setting.  In the 
>> > case of CZ, that will be disabled until we decide to enable EDC by default.
>> [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
>> I used ECC as the generic term for ECC and EDC, since ECC seems more 
>> basic (EDC is built on top of ECC).
>> If I understand you, we can't do what you want with the current setup.

>I'm saying we make ECC_BEST the default config (feel free to re-name if ECC_DEFAULT).  Each asic can have a different default depending 
>on what features are ready.  So for CZ, we'd make ECC_BEST equivalent to disabling ECC for now.  If a user wants to force it on, they can 
>set ECC_EDC.  Once EDC is stable on CZ, we can make ECC_BEST be equivalent to ECC_EDC.  The way the default (ECC_BEST) always maps 
>to the best available combination in that version of the driver.

That's not how I meant it to work WRT BEST.
Each asic will have a DEFAULT, but that isn't what BEST means.
CZ is a good example (when fully implemented).  DEFAULT for CZ is everything except HALT, since, IMO opinion, most people do not want to hang or reboot.
BEST for CZ would be everything a person most interested in reliability would want, which IMO, includes HALT/reboot.
Similar is if something like performance degradation is really bad, DEFAULT would be OFF. BEST would be ON, e.g., if the user's app doesn't trigger the performance problem.
The BEST bit is in a fixed position, so that customers don't need to worry what bits are needed for the most reliable performance (in our opinion) on a given asic.
And if a customer (or developer) wants some arbitrary set of features, they can set bits as they want.

I think DEFAULT will make most people happy.
BEST allows people who are interested in everything they can get, regardless of any issues that brings with it. It is requested simply by using a fixed param value (0x01) for any asic.
This probably should not include features that have any kind of fatal flaw such as the Vega10 HBM ECC issue.  When fixed, it can be added to DEFAULT.
And allowing per-feature control allows anyone to do precisely what they want.
"Effort" increases as the number of interested users decreases.

Using defines in the init code will be a problem if there is more than one kind of asic involved or a single asic that the user wants to use with different parameters.  However, this doesn't seem to be a high priority.
If we do want to worry about it, then we'll need to put the values into the amdgpu_gfx struct.

regards,
davep

> -----Original Message-----
> From: Deucher, Alexander
> Sent: Tuesday, June 06, 2017 6:16 PM
> To: Panariti, David <David.Panariti@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
> of ECC/EDC.
> 
> > -----Original Message-----
> > From: Panariti, David
> > Sent: Tuesday, June 06, 2017 5:50 PM
> > To: Deucher, Alexander; amd-gfx@lists.freedesktop.org
> > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > use of ECC/EDC.
> >
> > Rather than inlining this in a number of places, Re verbosity:
> > I've worked in embedded environments and when dealing with
> > intermittent problems it's nice to have all of the information ASAP
> > rather than waiting for a problem to reoccur, especially if it's very
> intermittent.
> > I would've preferred more.
> > Since it only shows up happens on CZ, it adds little to the output.
> > I like to show the reasons why EDC didn't happen, hence the backwards
> > looking messages.
> > In this particular case,  without the "... not requested..." we can't
> > tell if it was the flags or the ring being unready that made us bail earlier.
> 
> I'm fine with a message about EDC either being enabled or disabled, but a
> bunch of random debug statements along the way are too much.  They tend
> to just cause confusion and clutter up the logs.
> 
> >
> > > -----Original Message-----
> > > From: Deucher, Alexander
> > > Sent: Tuesday, June 06, 2017 5:22 PM
> > > To: Panariti, David <David.Panariti@amd.com>; amd-
> > > gfx@lists.freedesktop.org
> > > Cc: Panariti, David <David.Panariti@amd.com>
> > > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > use
> > > of ECC/EDC.
> > >
> > > > -----Original Message-----
> > > > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On
> > Behalf
> > > > Of David Panariti
> > > > Sent: Tuesday, June 06, 2017 4:33 PM
> > > > To: amd-gfx@lists.freedesktop.org
> > > > Cc: Panariti, David
> > > > Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > > > use of ECC/EDC.
> > > >
> > > > Allow various kinds of memory integrity methods (e.g. ECC/EDC) to
> > > > be enabled or disabled.  By default, all features are disabled.
> > > >
> > > > EDC is Error Detection and Correction.  It can detect ECC errors
> > > > and do 0 or more of: count SEC (single error corrected) and DED
> > > > (double error detected, i.e. uncorrected ECC error), halt the
> > > > affected block,
> > > interrupt the CPU.
> > > > Currently, only counting errors is supported.
> > > >
> > > > Fixed a whitespace error.
> > > >
> > > > Signed-off-by: David Panariti <David.Panariti@amd.com>
> > > > ---
> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
> > > >  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
> > > > ++++++++++++++++++++++++++--
> > > >  drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
> > > >  4 files changed, 45 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > index a6f51eb..3e930ee 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;  extern
> > int
> > > > amdgpu_param_buf_per_se;  extern int amdgpu_job_hang_limit;
> > extern
> > > > int amdgpu_lbpw;
> > > > +extern unsigned amdgpu_ecc_flags;
> > > >
> > > >  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /*
> > 3GB by
> > > > default */
> > > >  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > index c4825ff..972660d 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;  int
> > > > amdgpu_param_buf_per_se = 0;  int amdgpu_job_hang_limit = 0;  int
> > > > amdgpu_lbpw = -1;
> > > > +unsigned amdgpu_ecc_flags = 0;
> > >
> > > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
> > > default.  That can be our asic specific default setting.  In the
> > > case of CZ, that will be disabled until we decide to enable EDC by default.
> > [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
> > I used ECC as the generic term for ECC and EDC, since ECC seems more
> > basic (EDC is built on top of ECC).
> > If I understand you, we can't do what you want with the current setup.
> 
> I'm saying we make ECC_BEST the default config (feel free to re-name if
> ECC_DEFAULT).  Each asic can have a different default depending on what
> features are ready.  So for CZ, we'd make ECC_BEST equivalent to disabling
> ECC for now.  If a user wants to force it on, they can set ECC_EDC.  Once EDC
> is stable on CZ, we can make ECC_BEST be equivalent to ECC_EDC.  The way
> the default (ECC_BEST) always maps to the best available combination in that
> version of the driver.
> 
> E.g., in the current gfx8 code:
> 
> if ((amdgpu_ecc_flags & ECC_EDC) && edc_fuse_enabled) {
>     // enable EDC
>     goto enable_edc;
> } else {
>     // disable EDC
>     return 0;
> }
> 
> Then if we want to enable EDC by default, we'd change the code like so:
> 
> if (((amdgpu_ecc_flags == ECC_EDC) || (amdgpu_ecc_flags & ECC_EDC)) &&
> edc_fuse_enabled) {
>     // enable EDC
>     goto enable_edc;
> } else {
>     // disable EDC
>     return 0;
> }
> 
> That way we can have different ECC features enabled by default for each asic
> family without needing to specify command line options.
> 
> Alex
> 
> >
> > >
> > > >
> > > >  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
> > > > megabytes");  module_param_named(vramlimit, amdgpu_vram_limit,
> > int,
> > > > 0600); @@ -795,6 +796,9 @@ static struct pci_driver
> > > > amdgpu_kms_pci_driver = {
> > > >  	.driver.pm = &amdgpu_pm_ops,
> > > >  };
> > > >
> > > > +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
> > > > ECC/EDC (default))");
> > > > +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
> > > > +
> > > >
> > > >
> > > >  static int __init amdgpu_init(void) diff --git
> > > > a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > index 46e766e..3b5685c 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > @@ -1824,7 +1824,17 @@ static int
> > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> > > >  	if (dis_bit) {
> > > >  		/* On Carrizo, EDC may be permanently disabled by a fuse. */
> > > >  		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> > > > 0x%08x.\n",
> > > > -			tmp);
> > > > +			 tmp);
> > > > +		return 0;
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	 * Check if EDC has been requested by a kernel parameter.
> > > > +	 * For Carrizo, EDC is the best/safest mode WRT error handling.
> > > > +	 */
> > > > +	if (!(amdgpu_ecc_flags
> > > > +	      & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
> > > > +		DRM_INFO("EDC support has not been requested.\n");
> > >
> > > Can drop this debugging statement or make it DRM_DEBUG.
> > [davep] See top post.
> > >
> > > >  		return 0;
> > > >  	}
> > > >
> > > > @@ -1962,6 +1972,20 @@ static int
> > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> > > >  		goto fail;
> > > >  	}
> > > >
> > > > +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt
> > > > */
> > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
> > > > +	/* Do not propagate the errors to the next block. */
> > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
> > > > +	WREG32(mmGB_EDC_MODE, tmp);
> > > > +
> > > > +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> > > > +
> > > > +	/*
> > > > +	 * Clear EDC_DISABLE bit so the counters are available.
> > > > +	 */
> > > > +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
> > > > +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> > > > +
> > > >  	gfx_v8_0_edc_clear_counters(adev);
> > > >
> > > >  fail:
> > > > @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void
> > *handle)
> > > >  		gfx_v8_0_cp_compute_enable(adev, false);
> > > >  	}
> > > >
> > > > -       return 0;
> > > > +	return 0;
> > > >  }
> > > >
> > > >  static int gfx_v8_0_soft_reset(void *handle) diff --git
> > > > a/drivers/gpu/drm/amd/include/amd_shared.h
> > > > b/drivers/gpu/drm/amd/include/amd_shared.h
> > > > index 0f58e95..1cf30aa 100644
> > > > --- a/drivers/gpu/drm/amd/include/amd_shared.h
> > > > +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> > > > @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
> > > >  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
> > > >  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
> > > >
> > > > +/*
> > > > + * ECC flags
> > > > + * Allows the user to choose what kind of error
> > > > +detection/correction is
> > > > used.
> > > > + * Currently, EDC is supported on Carrizo.
> > > > + *
> > > > + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have
> > > > + the
> > > > driver
> > > > + * set what it thinks is best/safest mode.  This may not be the
> > > > +same as the
> > > > + * default, depending on the GPU and the application.
> > > > + * Using a single bit makes it easy to request the best support
> > > > +without
> > > > + * needing to know all currently supported modes.
> > > > + */
> > > > +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
> > > > +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
> > > > +
> > > >  enum amd_pm_state_type {
> > > >  	/* not used for dpm */
> > > >  	POWER_STATE_TYPE_DEFAULT,
> > > > --
> > > > 2.7.4
> > > >
> > > > _______________________________________________
> > > > amd-gfx mailing list
> > > > amd-gfx@lists.freedesktop.org
> > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found]                     ` <BN6PR12MB117285B9134A4BAD2E7731ED95DF0-/b2+HYfkarTft/eMqzLDqAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-06-26 16:21                       ` Deucher, Alexander
  2017-06-26 18:11                       ` Xie, AlexBin
  1 sibling, 0 replies; 14+ messages in thread
From: Deucher, Alexander @ 2017-06-26 16:21 UTC (permalink / raw)
  To: Panariti, David, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> -----Original Message-----
> From: Panariti, David
> Sent: Monday, June 26, 2017 12:06 PM
> To: Deucher, Alexander; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
> of ECC/EDC.
> 
> >> > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
> >> > default.  That can be our asic specific default setting.  In the
> >> > case of CZ, that will be disabled until we decide to enable EDC by
> default.
> >> [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
> >> I used ECC as the generic term for ECC and EDC, since ECC seems more
> >> basic (EDC is built on top of ECC).
> >> If I understand you, we can't do what you want with the current setup.
> 
> >I'm saying we make ECC_BEST the default config (feel free to re-name if
> ECC_DEFAULT).  Each asic can have a different default depending
> >on what features are ready.  So for CZ, we'd make ECC_BEST equivalent to
> disabling ECC for now.  If a user wants to force it on, they can
> >set ECC_EDC.  Once EDC is stable on CZ, we can make ECC_BEST be
> equivalent to ECC_EDC.  The way the default (ECC_BEST) always maps
> >to the best available combination in that version of the driver.
> 
> That's not how I meant it to work WRT BEST.
> Each asic will have a DEFAULT, but that isn't what BEST means.
> CZ is a good example (when fully implemented).  DEFAULT for CZ is
> everything except HALT, since, IMO opinion, most people do not want to
> hang or reboot.
> BEST for CZ would be everything a person most interested in reliability would
> want, which IMO, includes HALT/reboot.
> Similar is if something like performance degradation is really bad, DEFAULT
> would be OFF. BEST would be ON, e.g., if the user's app doesn't trigger the
> performance problem.
> The BEST bit is in a fixed position, so that customers don't need to worry
> what bits are needed for the most reliable performance (in our opinion) on a
> given asic.
> And if a customer (or developer) wants some arbitrary set of features, they
> can set bits as they want.
> 
> I think DEFAULT will make most people happy.
> BEST allows people who are interested in everything they can get, regardless
> of any issues that brings with it. It is requested simply by using a fixed param
> value (0x01) for any asic.
> This probably should not include features that have any kind of fatal flaw
> such as the Vega10 HBM ECC issue.  When fixed, it can be added to DEFAULT.
> And allowing per-feature control allows anyone to do precisely what they
> want.
> "Effort" increases as the number of interested users decreases.
> 

I would like to be able to have different default settings on all asics without requiring explicitly setting the option for each chip.  If there is a best setting it should be the default.  BEST implies it's the best option so users will try it whether it makes sense or not.  If you want to enable something that is not the default, then you need to explicitly ask for it.  BEST could be defined as 0xffffffff since each asic would only look at the masked bits for the features it supports.  I would prefer not to call it BEST though.  Maybe ALL.

RAS_NONE 0
RAS_DEFAULT (1 << 0)
RAS_VRAM_ECC (1 << 1)
RAS_SRAM_EDC (1 << 2)
...
RAS_ALL 0xffffffff

Alex

> Using defines in the init code will be a problem if there is more than one kind
> of asic involved or a single asic that the user wants to use with different
> parameters.  However, this doesn't seem to be a high priority.
> If we do want to worry about it, then we'll need to put the values into the
> amdgpu_gfx struct.
> 
> regards,
> davep
> 
> > -----Original Message-----
> > From: Deucher, Alexander
> > Sent: Tuesday, June 06, 2017 6:16 PM
> > To: Panariti, David <David.Panariti@amd.com>; amd-
> > gfx@lists.freedesktop.org
> > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> use
> > of ECC/EDC.
> >
> > > -----Original Message-----
> > > From: Panariti, David
> > > Sent: Tuesday, June 06, 2017 5:50 PM
> > > To: Deucher, Alexander; amd-gfx@lists.freedesktop.org
> > > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > > use of ECC/EDC.
> > >
> > > Rather than inlining this in a number of places, Re verbosity:
> > > I've worked in embedded environments and when dealing with
> > > intermittent problems it's nice to have all of the information ASAP
> > > rather than waiting for a problem to reoccur, especially if it's very
> > intermittent.
> > > I would've preferred more.
> > > Since it only shows up happens on CZ, it adds little to the output.
> > > I like to show the reasons why EDC didn't happen, hence the backwards
> > > looking messages.
> > > In this particular case,  without the "... not requested..." we can't
> > > tell if it was the flags or the ring being unready that made us bail earlier.
> >
> > I'm fine with a message about EDC either being enabled or disabled, but a
> > bunch of random debug statements along the way are too much.  They
> tend
> > to just cause confusion and clutter up the logs.
> >
> > >
> > > > -----Original Message-----
> > > > From: Deucher, Alexander
> > > > Sent: Tuesday, June 06, 2017 5:22 PM
> > > > To: Panariti, David <David.Panariti@amd.com>; amd-
> > > > gfx@lists.freedesktop.org
> > > > Cc: Panariti, David <David.Panariti@amd.com>
> > > > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > > use
> > > > of ECC/EDC.
> > > >
> > > > > -----Original Message-----
> > > > > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On
> > > Behalf
> > > > > Of David Panariti
> > > > > Sent: Tuesday, June 06, 2017 4:33 PM
> > > > > To: amd-gfx@lists.freedesktop.org
> > > > > Cc: Panariti, David
> > > > > Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > > > > use of ECC/EDC.
> > > > >
> > > > > Allow various kinds of memory integrity methods (e.g. ECC/EDC) to
> > > > > be enabled or disabled.  By default, all features are disabled.
> > > > >
> > > > > EDC is Error Detection and Correction.  It can detect ECC errors
> > > > > and do 0 or more of: count SEC (single error corrected) and DED
> > > > > (double error detected, i.e. uncorrected ECC error), halt the
> > > > > affected block,
> > > > interrupt the CPU.
> > > > > Currently, only counting errors is supported.
> > > > >
> > > > > Fixed a whitespace error.
> > > > >
> > > > > Signed-off-by: David Panariti <David.Panariti@amd.com>
> > > > > ---
> > > > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
> > > > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
> > > > >  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
> > > > > ++++++++++++++++++++++++++--
> > > > >  drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
> > > > >  4 files changed, 45 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > > index a6f51eb..3e930ee 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > > @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;
> extern
> > > int
> > > > > amdgpu_param_buf_per_se;  extern int amdgpu_job_hang_limit;
> > > extern
> > > > > int amdgpu_lbpw;
> > > > > +extern unsigned amdgpu_ecc_flags;
> > > > >
> > > > >  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /*
> > > 3GB by
> > > > > default */
> > > > >  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > > index c4825ff..972660d 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > > @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;  int
> > > > > amdgpu_param_buf_per_se = 0;  int amdgpu_job_hang_limit = 0;  int
> > > > > amdgpu_lbpw = -1;
> > > > > +unsigned amdgpu_ecc_flags = 0;
> > > >
> > > > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
> > > > default.  That can be our asic specific default setting.  In the
> > > > case of CZ, that will be disabled until we decide to enable EDC by
> default.
> > > [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
> > > I used ECC as the generic term for ECC and EDC, since ECC seems more
> > > basic (EDC is built on top of ECC).
> > > If I understand you, we can't do what you want with the current setup.
> >
> > I'm saying we make ECC_BEST the default config (feel free to re-name if
> > ECC_DEFAULT).  Each asic can have a different default depending on what
> > features are ready.  So for CZ, we'd make ECC_BEST equivalent to disabling
> > ECC for now.  If a user wants to force it on, they can set ECC_EDC.  Once
> EDC
> > is stable on CZ, we can make ECC_BEST be equivalent to ECC_EDC.  The way
> > the default (ECC_BEST) always maps to the best available combination in
> that
> > version of the driver.
> >
> > E.g., in the current gfx8 code:
> >
> > if ((amdgpu_ecc_flags & ECC_EDC) && edc_fuse_enabled) {
> >     // enable EDC
> >     goto enable_edc;
> > } else {
> >     // disable EDC
> >     return 0;
> > }
> >
> > Then if we want to enable EDC by default, we'd change the code like so:
> >
> > if (((amdgpu_ecc_flags == ECC_EDC) || (amdgpu_ecc_flags & ECC_EDC))
> &&
> > edc_fuse_enabled) {
> >     // enable EDC
> >     goto enable_edc;
> > } else {
> >     // disable EDC
> >     return 0;
> > }
> >
> > That way we can have different ECC features enabled by default for each
> asic
> > family without needing to specify command line options.
> >
> > Alex
> >
> > >
> > > >
> > > > >
> > > > >  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
> > > > > megabytes");  module_param_named(vramlimit,
> amdgpu_vram_limit,
> > > int,
> > > > > 0600); @@ -795,6 +796,9 @@ static struct pci_driver
> > > > > amdgpu_kms_pci_driver = {
> > > > >  	.driver.pm = &amdgpu_pm_ops,
> > > > >  };
> > > > >
> > > > > +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
> > > > > ECC/EDC (default))");
> > > > > +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
> > > > > +
> > > > >
> > > > >
> > > > >  static int __init amdgpu_init(void) diff --git
> > > > > a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > index 46e766e..3b5685c 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > @@ -1824,7 +1824,17 @@ static int
> > > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> > > > >  	if (dis_bit) {
> > > > >  		/* On Carrizo, EDC may be permanently disabled by a
> fuse. */
> > > > >  		DRM_INFO("CZ EDC hardware is disabled,
> GC_EDC_CONFIG:
> > > > > 0x%08x.\n",
> > > > > -			tmp);
> > > > > +			 tmp);
> > > > > +		return 0;
> > > > > +	}
> > > > > +
> > > > > +	/*
> > > > > +	 * Check if EDC has been requested by a kernel parameter.
> > > > > +	 * For Carrizo, EDC is the best/safest mode WRT error
> handling.
> > > > > +	 */
> > > > > +	if (!(amdgpu_ecc_flags
> > > > > +	      & (AMD_ECC_SUPPORT_BEST |
> AMD_ECC_SUPPORT_EDC))) {
> > > > > +		DRM_INFO("EDC support has not been
> requested.\n");
> > > >
> > > > Can drop this debugging statement or make it DRM_DEBUG.
> > > [davep] See top post.
> > > >
> > > > >  		return 0;
> > > > >  	}
> > > > >
> > > > > @@ -1962,6 +1972,20 @@ static int
> > > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> > > > >  		goto fail;
> > > > >  	}
> > > > >
> > > > > +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do
> not halt
> > > > > */
> > > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
> > > > > +	/* Do not propagate the errors to the next block. */
> > > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
> > > > > +	WREG32(mmGB_EDC_MODE, tmp);
> > > > > +
> > > > > +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> > > > > +
> > > > > +	/*
> > > > > +	 * Clear EDC_DISABLE bit so the counters are available.
> > > > > +	 */
> > > > > +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC,
> 0);
> > > > > +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> > > > > +
> > > > >  	gfx_v8_0_edc_clear_counters(adev);
> > > > >
> > > > >  fail:
> > > > > @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void
> > > *handle)
> > > > >  		gfx_v8_0_cp_compute_enable(adev, false);
> > > > >  	}
> > > > >
> > > > > -       return 0;
> > > > > +	return 0;
> > > > >  }
> > > > >
> > > > >  static int gfx_v8_0_soft_reset(void *handle) diff --git
> > > > > a/drivers/gpu/drm/amd/include/amd_shared.h
> > > > > b/drivers/gpu/drm/amd/include/amd_shared.h
> > > > > index 0f58e95..1cf30aa 100644
> > > > > --- a/drivers/gpu/drm/amd/include/amd_shared.h
> > > > > +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> > > > > @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
> > > > >  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
> > > > >  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
> > > > >
> > > > > +/*
> > > > > + * ECC flags
> > > > > + * Allows the user to choose what kind of error
> > > > > +detection/correction is
> > > > > used.
> > > > > + * Currently, EDC is supported on Carrizo.
> > > > > + *
> > > > > + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have
> > > > > + the
> > > > > driver
> > > > > + * set what it thinks is best/safest mode.  This may not be the
> > > > > +same as the
> > > > > + * default, depending on the GPU and the application.
> > > > > + * Using a single bit makes it easy to request the best support
> > > > > +without
> > > > > + * needing to know all currently supported modes.
> > > > > + */
> > > > > +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
> > > > > +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
> > > > > +
> > > > >  enum amd_pm_state_type {
> > > > >  	/* not used for dpm */
> > > > >  	POWER_STATE_TYPE_DEFAULT,
> > > > > --
> > > > > 2.7.4
> > > > >
> > > > > _______________________________________________
> > > > > amd-gfx mailing list
> > > > > amd-gfx@lists.freedesktop.org
> > > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found]                     ` <BN6PR12MB117285B9134A4BAD2E7731ED95DF0-/b2+HYfkarTft/eMqzLDqAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2017-06-26 16:21                       ` Deucher, Alexander
@ 2017-06-26 18:11                       ` Xie, AlexBin
       [not found]                         ` <DM5PR12MB12577898D0B6BD85BFB8670EF2DF0-2J9CzHegvk/NHlLGalgXawdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  1 sibling, 1 reply; 14+ messages in thread
From: Xie, AlexBin @ 2017-06-26 18:11 UTC (permalink / raw)
  To: Panariti, David, Deucher, Alexander,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi,

I have not checked the background of this discussion very closely yet. And you might have known the following.

Customers may not want the default setting to change meaning. This is like an API.
Example: The application and its environment is already set up and tested. Then if customer updates driver, suddenly driver has some new behavior?
Certain serious application definitely does not accept this.

IMHO, it is better to avoid vague concepts like "best". It will become rather difficult to define what is best when there are multiple customers with different requirements. Driver is to provide a feature or mechanism. "Best" sounds like a policy or a preference from driver side.

In my pass work, I generally use default for two cases:
1. The default is the most conservative option, which must work. Then the application can choose advanced features by choosing other parameter value/option.
2. The default parameter is the compatible behavior before introducing this parameter/option.

Regards,
Alex Bin


-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Panariti, David
Sent: Monday, June 26, 2017 12:06 PM
To: Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.

>> > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by 
>> > default.  That can be our asic specific default setting.  In the 
>> > case of CZ, that will be disabled until we decide to enable EDC by default.
>> [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
>> I used ECC as the generic term for ECC and EDC, since ECC seems more 
>> basic (EDC is built on top of ECC).
>> If I understand you, we can't do what you want with the current setup.

>I'm saying we make ECC_BEST the default config (feel free to re-name if ECC_DEFAULT).  Each asic can have a different default depending 
>on what features are ready.  So for CZ, we'd make ECC_BEST equivalent to disabling ECC for now.  If a user wants to force it on, they can 
>set ECC_EDC.  Once EDC is stable on CZ, we can make ECC_BEST be equivalent to ECC_EDC.  The way the default (ECC_BEST) always maps 
>to the best available combination in that version of the driver.

That's not how I meant it to work WRT BEST.
Each asic will have a DEFAULT, but that isn't what BEST means.
CZ is a good example (when fully implemented).  DEFAULT for CZ is everything except HALT, since, IMO opinion, most people do not want to hang or reboot.
BEST for CZ would be everything a person most interested in reliability would want, which IMO, includes HALT/reboot.
Similar is if something like performance degradation is really bad, DEFAULT would be OFF. BEST would be ON, e.g., if the user's app doesn't trigger the performance problem.
The BEST bit is in a fixed position, so that customers don't need to worry what bits are needed for the most reliable performance (in our opinion) on a given asic.
And if a customer (or developer) wants some arbitrary set of features, they can set bits as they want.

I think DEFAULT will make most people happy.
BEST allows people who are interested in everything they can get, regardless of any issues that brings with it. It is requested simply by using a fixed param value (0x01) for any asic.
This probably should not include features that have any kind of fatal flaw such as the Vega10 HBM ECC issue.  When fixed, it can be added to DEFAULT.
And allowing per-feature control allows anyone to do precisely what they want.
"Effort" increases as the number of interested users decreases.

Using defines in the init code will be a problem if there is more than one kind of asic involved or a single asic that the user wants to use with different parameters.  However, this doesn't seem to be a high priority.
If we do want to worry about it, then we'll need to put the values into the amdgpu_gfx struct.

regards,
davep

> -----Original Message-----
> From: Deucher, Alexander
> Sent: Tuesday, June 06, 2017 6:16 PM
> To: Panariti, David <David.Panariti@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
> of ECC/EDC.
> 
> > -----Original Message-----
> > From: Panariti, David
> > Sent: Tuesday, June 06, 2017 5:50 PM
> > To: Deucher, Alexander; amd-gfx@lists.freedesktop.org
> > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > use of ECC/EDC.
> >
> > Rather than inlining this in a number of places, Re verbosity:
> > I've worked in embedded environments and when dealing with
> > intermittent problems it's nice to have all of the information ASAP
> > rather than waiting for a problem to reoccur, especially if it's very
> intermittent.
> > I would've preferred more.
> > Since it only shows up happens on CZ, it adds little to the output.
> > I like to show the reasons why EDC didn't happen, hence the backwards
> > looking messages.
> > In this particular case,  without the "... not requested..." we can't
> > tell if it was the flags or the ring being unready that made us bail earlier.
> 
> I'm fine with a message about EDC either being enabled or disabled, but a
> bunch of random debug statements along the way are too much.  They tend
> to just cause confusion and clutter up the logs.
> 
> >
> > > -----Original Message-----
> > > From: Deucher, Alexander
> > > Sent: Tuesday, June 06, 2017 5:22 PM
> > > To: Panariti, David <David.Panariti@amd.com>; amd-
> > > gfx@lists.freedesktop.org
> > > Cc: Panariti, David <David.Panariti@amd.com>
> > > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > use
> > > of ECC/EDC.
> > >
> > > > -----Original Message-----
> > > > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On
> > Behalf
> > > > Of David Panariti
> > > > Sent: Tuesday, June 06, 2017 4:33 PM
> > > > To: amd-gfx@lists.freedesktop.org
> > > > Cc: Panariti, David
> > > > Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> > > > use of ECC/EDC.
> > > >
> > > > Allow various kinds of memory integrity methods (e.g. ECC/EDC) to
> > > > be enabled or disabled.  By default, all features are disabled.
> > > >
> > > > EDC is Error Detection and Correction.  It can detect ECC errors
> > > > and do 0 or more of: count SEC (single error corrected) and DED
> > > > (double error detected, i.e. uncorrected ECC error), halt the
> > > > affected block,
> > > interrupt the CPU.
> > > > Currently, only counting errors is supported.
> > > >
> > > > Fixed a whitespace error.
> > > >
> > > > Signed-off-by: David Panariti <David.Panariti@amd.com>
> > > > ---
> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
> > > >  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
> > > > ++++++++++++++++++++++++++--
> > > >  drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
> > > >  4 files changed, 45 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > index a6f51eb..3e930ee 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;  extern
> > int
> > > > amdgpu_param_buf_per_se;  extern int amdgpu_job_hang_limit;
> > extern
> > > > int amdgpu_lbpw;
> > > > +extern unsigned amdgpu_ecc_flags;
> > > >
> > > >  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /*
> > 3GB by
> > > > default */
> > > >  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > index c4825ff..972660d 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > > @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;  int
> > > > amdgpu_param_buf_per_se = 0;  int amdgpu_job_hang_limit = 0;  int
> > > > amdgpu_lbpw = -1;
> > > > +unsigned amdgpu_ecc_flags = 0;
> > >
> > > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
> > > default.  That can be our asic specific default setting.  In the
> > > case of CZ, that will be disabled until we decide to enable EDC by default.
> > [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
> > I used ECC as the generic term for ECC and EDC, since ECC seems more
> > basic (EDC is built on top of ECC).
> > If I understand you, we can't do what you want with the current setup.
> 
> I'm saying we make ECC_BEST the default config (feel free to re-name if
> ECC_DEFAULT).  Each asic can have a different default depending on what
> features are ready.  So for CZ, we'd make ECC_BEST equivalent to disabling
> ECC for now.  If a user wants to force it on, they can set ECC_EDC.  Once EDC
> is stable on CZ, we can make ECC_BEST be equivalent to ECC_EDC.  The way
> the default (ECC_BEST) always maps to the best available combination in that
> version of the driver.
> 
> E.g., in the current gfx8 code:
> 
> if ((amdgpu_ecc_flags & ECC_EDC) && edc_fuse_enabled) {
>     // enable EDC
>     goto enable_edc;
> } else {
>     // disable EDC
>     return 0;
> }
> 
> Then if we want to enable EDC by default, we'd change the code like so:
> 
> if (((amdgpu_ecc_flags == ECC_EDC) || (amdgpu_ecc_flags & ECC_EDC)) &&
> edc_fuse_enabled) {
>     // enable EDC
>     goto enable_edc;
> } else {
>     // disable EDC
>     return 0;
> }
> 
> That way we can have different ECC features enabled by default for each asic
> family without needing to specify command line options.
> 
> Alex
> 
> >
> > >
> > > >
> > > >  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
> > > > megabytes");  module_param_named(vramlimit, amdgpu_vram_limit,
> > int,
> > > > 0600); @@ -795,6 +796,9 @@ static struct pci_driver
> > > > amdgpu_kms_pci_driver = {
> > > >  	.driver.pm = &amdgpu_pm_ops,
> > > >  };
> > > >
> > > > +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
> > > > ECC/EDC (default))");
> > > > +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
> > > > +
> > > >
> > > >
> > > >  static int __init amdgpu_init(void) diff --git
> > > > a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > index 46e766e..3b5685c 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > @@ -1824,7 +1824,17 @@ static int
> > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> > > >  	if (dis_bit) {
> > > >  		/* On Carrizo, EDC may be permanently disabled by a fuse. */
> > > >  		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> > > > 0x%08x.\n",
> > > > -			tmp);
> > > > +			 tmp);
> > > > +		return 0;
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	 * Check if EDC has been requested by a kernel parameter.
> > > > +	 * For Carrizo, EDC is the best/safest mode WRT error handling.
> > > > +	 */
> > > > +	if (!(amdgpu_ecc_flags
> > > > +	      & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
> > > > +		DRM_INFO("EDC support has not been requested.\n");
> > >
> > > Can drop this debugging statement or make it DRM_DEBUG.
> > [davep] See top post.
> > >
> > > >  		return 0;
> > > >  	}
> > > >
> > > > @@ -1962,6 +1972,20 @@ static int
> > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> > > >  		goto fail;
> > > >  	}
> > > >
> > > > +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt
> > > > */
> > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
> > > > +	/* Do not propagate the errors to the next block. */
> > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
> > > > +	WREG32(mmGB_EDC_MODE, tmp);
> > > > +
> > > > +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> > > > +
> > > > +	/*
> > > > +	 * Clear EDC_DISABLE bit so the counters are available.
> > > > +	 */
> > > > +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
> > > > +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> > > > +
> > > >  	gfx_v8_0_edc_clear_counters(adev);
> > > >
> > > >  fail:
> > > > @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void
> > *handle)
> > > >  		gfx_v8_0_cp_compute_enable(adev, false);
> > > >  	}
> > > >
> > > > -       return 0;
> > > > +	return 0;
> > > >  }
> > > >
> > > >  static int gfx_v8_0_soft_reset(void *handle) diff --git
> > > > a/drivers/gpu/drm/amd/include/amd_shared.h
> > > > b/drivers/gpu/drm/amd/include/amd_shared.h
> > > > index 0f58e95..1cf30aa 100644
> > > > --- a/drivers/gpu/drm/amd/include/amd_shared.h
> > > > +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> > > > @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
> > > >  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
> > > >  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
> > > >
> > > > +/*
> > > > + * ECC flags
> > > > + * Allows the user to choose what kind of error
> > > > +detection/correction is
> > > > used.
> > > > + * Currently, EDC is supported on Carrizo.
> > > > + *
> > > > + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have
> > > > + the
> > > > driver
> > > > + * set what it thinks is best/safest mode.  This may not be the
> > > > +same as the
> > > > + * default, depending on the GPU and the application.
> > > > + * Using a single bit makes it easy to request the best support
> > > > +without
> > > > + * needing to know all currently supported modes.
> > > > + */
> > > > +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
> > > > +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
> > > > +
> > > >  enum amd_pm_state_type {
> > > >  	/* not used for dpm */
> > > >  	POWER_STATE_TYPE_DEFAULT,
> > > > --
> > > > 2.7.4
> > > >
> > > > _______________________________________________
> > > > amd-gfx mailing list
> > > > amd-gfx@lists.freedesktop.org
> > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found]                         ` <DM5PR12MB12577898D0B6BD85BFB8670EF2DF0-2J9CzHegvk/NHlLGalgXawdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-06-26 19:12                           ` Bridgman, John
       [not found]                             ` <BN6PR12MB1348857756669A145968B1BBE8DF0-/b2+HYfkarQX0pEhCR5T8QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Bridgman, John @ 2017-06-26 19:12 UTC (permalink / raw)
  To: Xie, AlexBin, Panariti, David, Deucher, Alexander,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Agreed... one person's "best" is another person's "OMG I didn't want that". IMO we should have bits correspond to specific options as much as possible, modulo HW capabilities.

>-----Original Message-----
>From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of
>Xie, AlexBin
>Sent: Monday, June 26, 2017 2:12 PM
>To: Panariti, David; Deucher, Alexander; amd-gfx@lists.freedesktop.org
>Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
>of ECC/EDC.
>
>Hi,
>
>I have not checked the background of this discussion very closely yet. And you
>might have known the following.
>
>Customers may not want the default setting to change meaning. This is like an
>API.
>Example: The application and its environment is already set up and tested.
>Then if customer updates driver, suddenly driver has some new behavior?
>Certain serious application definitely does not accept this.
>
>IMHO, it is better to avoid vague concepts like "best". It will become rather
>difficult to define what is best when there are multiple customers with
>different requirements. Driver is to provide a feature or mechanism. "Best"
>sounds like a policy or a preference from driver side.
>
>In my pass work, I generally use default for two cases:
>1. The default is the most conservative option, which must work. Then the
>application can choose advanced features by choosing other parameter
>value/option.
>2. The default parameter is the compatible behavior before introducing this
>parameter/option.
>
>Regards,
>Alex Bin
>
>
>-----Original Message-----
>From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of
>Panariti, David
>Sent: Monday, June 26, 2017 12:06 PM
>To: Deucher, Alexander <Alexander.Deucher@amd.com>; amd-
>gfx@lists.freedesktop.org
>Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
>of ECC/EDC.
>
>>> > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
>>> > default.  That can be our asic specific default setting.  In the
>>> > case of CZ, that will be disabled until we decide to enable EDC by default.
>>> [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
>>> I used ECC as the generic term for ECC and EDC, since ECC seems more
>>> basic (EDC is built on top of ECC).
>>> If I understand you, we can't do what you want with the current setup.
>
>>I'm saying we make ECC_BEST the default config (feel free to re-name if
>>ECC_DEFAULT).  Each asic can have a different default depending on what
>>features are ready.  So for CZ, we'd make ECC_BEST equivalent to
>>disabling ECC for now.  If a user wants to force it on, they can set ECC_EDC.
>Once EDC is stable on CZ, we can make ECC_BEST be equivalent to ECC_EDC.
>The way the default (ECC_BEST) always maps to the best available
>combination in that version of the driver.
>
>That's not how I meant it to work WRT BEST.
>Each asic will have a DEFAULT, but that isn't what BEST means.
>CZ is a good example (when fully implemented).  DEFAULT for CZ is everything
>except HALT, since, IMO opinion, most people do not want to hang or reboot.
>BEST for CZ would be everything a person most interested in reliability would
>want, which IMO, includes HALT/reboot.
>Similar is if something like performance degradation is really bad, DEFAULT
>would be OFF. BEST would be ON, e.g., if the user's app doesn't trigger the
>performance problem.
>The BEST bit is in a fixed position, so that customers don't need to worry what
>bits are needed for the most reliable performance (in our opinion) on a given
>asic.
>And if a customer (or developer) wants some arbitrary set of features, they
>can set bits as they want.
>
>I think DEFAULT will make most people happy.
>BEST allows people who are interested in everything they can get, regardless
>of any issues that brings with it. It is requested simply by using a fixed param
>value (0x01) for any asic.
>This probably should not include features that have any kind of fatal flaw such
>as the Vega10 HBM ECC issue.  When fixed, it can be added to DEFAULT.
>And allowing per-feature control allows anyone to do precisely what they
>want.
>"Effort" increases as the number of interested users decreases.
>
>Using defines in the init code will be a problem if there is more than one kind
>of asic involved or a single asic that the user wants to use with different
>parameters.  However, this doesn't seem to be a high priority.
>If we do want to worry about it, then we'll need to put the values into the
>amdgpu_gfx struct.
>
>regards,
>davep
>
>> -----Original Message-----
>> From: Deucher, Alexander
>> Sent: Tuesday, June 06, 2017 6:16 PM
>> To: Panariti, David <David.Panariti@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
>> use of ECC/EDC.
>>
>> > -----Original Message-----
>> > From: Panariti, David
>> > Sent: Tuesday, June 06, 2017 5:50 PM
>> > To: Deucher, Alexander; amd-gfx@lists.freedesktop.org
>> > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
>> > use of ECC/EDC.
>> >
>> > Rather than inlining this in a number of places, Re verbosity:
>> > I've worked in embedded environments and when dealing with
>> > intermittent problems it's nice to have all of the information ASAP
>> > rather than waiting for a problem to reoccur, especially if it's
>> > very
>> intermittent.
>> > I would've preferred more.
>> > Since it only shows up happens on CZ, it adds little to the output.
>> > I like to show the reasons why EDC didn't happen, hence the
>> > backwards looking messages.
>> > In this particular case,  without the "... not requested..." we
>> > can't tell if it was the flags or the ring being unready that made us bail
>earlier.
>>
>> I'm fine with a message about EDC either being enabled or disabled,
>> but a bunch of random debug statements along the way are too much.
>> They tend to just cause confusion and clutter up the logs.
>>
>> >
>> > > -----Original Message-----
>> > > From: Deucher, Alexander
>> > > Sent: Tuesday, June 06, 2017 5:22 PM
>> > > To: Panariti, David <David.Panariti@amd.com>; amd-
>> > > gfx@lists.freedesktop.org
>> > > Cc: Panariti, David <David.Panariti@amd.com>
>> > > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to
>> > > control
>> > use
>> > > of ECC/EDC.
>> > >
>> > > > -----Original Message-----
>> > > > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On
>> > Behalf
>> > > > Of David Panariti
>> > > > Sent: Tuesday, June 06, 2017 4:33 PM
>> > > > To: amd-gfx@lists.freedesktop.org
>> > > > Cc: Panariti, David
>> > > > Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
>> > > > use of ECC/EDC.
>> > > >
>> > > > Allow various kinds of memory integrity methods (e.g. ECC/EDC)
>> > > > to be enabled or disabled.  By default, all features are disabled.
>> > > >
>> > > > EDC is Error Detection and Correction.  It can detect ECC errors
>> > > > and do 0 or more of: count SEC (single error corrected) and DED
>> > > > (double error detected, i.e. uncorrected ECC error), halt the
>> > > > affected block,
>> > > interrupt the CPU.
>> > > > Currently, only counting errors is supported.
>> > > >
>> > > > Fixed a whitespace error.
>> > > >
>> > > > Signed-off-by: David Panariti <David.Panariti@amd.com>
>> > > > ---
>> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
>> > > >  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
>> > > > ++++++++++++++++++++++++++--
>> > > >  drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
>> > > >  4 files changed, 45 insertions(+), 2 deletions(-)
>> > > >
>> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> > > > index a6f51eb..3e930ee 100644
>> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> > > > @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;
>> > > > extern
>> > int
>> > > > amdgpu_param_buf_per_se;  extern int amdgpu_job_hang_limit;
>> > extern
>> > > > int amdgpu_lbpw;
>> > > > +extern unsigned amdgpu_ecc_flags;
>> > > >
>> > > >  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /*
>> > 3GB by
>> > > > default */
>> > > >  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
>> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> > > > index c4825ff..972660d 100644
>> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> > > > @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;  int
>> > > > amdgpu_param_buf_per_se = 0;  int amdgpu_job_hang_limit = 0;
>> > > > int amdgpu_lbpw = -1;
>> > > > +unsigned amdgpu_ecc_flags = 0;
>> > >
>> > > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
>> > > default.  That can be our asic specific default setting.  In the
>> > > case of CZ, that will be disabled until we decide to enable EDC by default.
>> > [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
>> > I used ECC as the generic term for ECC and EDC, since ECC seems more
>> > basic (EDC is built on top of ECC).
>> > If I understand you, we can't do what you want with the current setup.
>>
>> I'm saying we make ECC_BEST the default config (feel free to re-name
>> if ECC_DEFAULT).  Each asic can have a different default depending on
>> what features are ready.  So for CZ, we'd make ECC_BEST equivalent to
>> disabling ECC for now.  If a user wants to force it on, they can set
>> ECC_EDC.  Once EDC is stable on CZ, we can make ECC_BEST be equivalent
>> to ECC_EDC.  The way the default (ECC_BEST) always maps to the best
>> available combination in that version of the driver.
>>
>> E.g., in the current gfx8 code:
>>
>> if ((amdgpu_ecc_flags & ECC_EDC) && edc_fuse_enabled) {
>>     // enable EDC
>>     goto enable_edc;
>> } else {
>>     // disable EDC
>>     return 0;
>> }
>>
>> Then if we want to enable EDC by default, we'd change the code like so:
>>
>> if (((amdgpu_ecc_flags == ECC_EDC) || (amdgpu_ecc_flags & ECC_EDC)) &&
>> edc_fuse_enabled) {
>>     // enable EDC
>>     goto enable_edc;
>> } else {
>>     // disable EDC
>>     return 0;
>> }
>>
>> That way we can have different ECC features enabled by default for
>> each asic family without needing to specify command line options.
>>
>> Alex
>>
>> >
>> > >
>> > > >
>> > > >  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
>> > > > megabytes");  module_param_named(vramlimit, amdgpu_vram_limit,
>> > int,
>> > > > 0600); @@ -795,6 +796,9 @@ static struct pci_driver
>> > > > amdgpu_kms_pci_driver = {
>> > > >  	.driver.pm = &amdgpu_pm_ops,
>> > > >  };
>> > > >
>> > > > +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
>> > > > ECC/EDC (default))");
>> > > > +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
>> > > > +
>> > > >
>> > > >
>> > > >  static int __init amdgpu_init(void) diff --git
>> > > > a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> > > > index 46e766e..3b5685c 100644
>> > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> > > > @@ -1824,7 +1824,17 @@ static int
>> > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>> > > >  	if (dis_bit) {
>> > > >  		/* On Carrizo, EDC may be permanently disabled by a
>fuse. */
>> > > >  		DRM_INFO("CZ EDC hardware is disabled,
>GC_EDC_CONFIG:
>> > > > 0x%08x.\n",
>> > > > -			tmp);
>> > > > +			 tmp);
>> > > > +		return 0;
>> > > > +	}
>> > > > +
>> > > > +	/*
>> > > > +	 * Check if EDC has been requested by a kernel parameter.
>> > > > +	 * For Carrizo, EDC is the best/safest mode WRT error
>handling.
>> > > > +	 */
>> > > > +	if (!(amdgpu_ecc_flags
>> > > > +	      & (AMD_ECC_SUPPORT_BEST |
>AMD_ECC_SUPPORT_EDC))) {
>> > > > +		DRM_INFO("EDC support has not been
>requested.\n");
>> > >
>> > > Can drop this debugging statement or make it DRM_DEBUG.
>> > [davep] See top post.
>> > >
>> > > >  		return 0;
>> > > >  	}
>> > > >
>> > > > @@ -1962,6 +1972,20 @@ static int
>> > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>> > > >  		goto fail;
>> > > >  	}
>> > > >
>> > > > +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do
>not halt
>> > > > */
>> > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
>> > > > +	/* Do not propagate the errors to the next block. */
>> > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
>> > > > +	WREG32(mmGB_EDC_MODE, tmp);
>> > > > +
>> > > > +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
>> > > > +
>> > > > +	/*
>> > > > +	 * Clear EDC_DISABLE bit so the counters are available.
>> > > > +	 */
>> > > > +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC,
>0);
>> > > > +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
>> > > > +
>> > > >  	gfx_v8_0_edc_clear_counters(adev);
>> > > >
>> > > >  fail:
>> > > > @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void
>> > *handle)
>> > > >  		gfx_v8_0_cp_compute_enable(adev, false);
>> > > >  	}
>> > > >
>> > > > -       return 0;
>> > > > +	return 0;
>> > > >  }
>> > > >
>> > > >  static int gfx_v8_0_soft_reset(void *handle) diff --git
>> > > > a/drivers/gpu/drm/amd/include/amd_shared.h
>> > > > b/drivers/gpu/drm/amd/include/amd_shared.h
>> > > > index 0f58e95..1cf30aa 100644
>> > > > --- a/drivers/gpu/drm/amd/include/amd_shared.h
>> > > > +++ b/drivers/gpu/drm/amd/include/amd_shared.h
>> > > > @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
>> > > >  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
>> > > >  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
>> > > >
>> > > > +/*
>> > > > + * ECC flags
>> > > > + * Allows the user to choose what kind of error
>> > > > +detection/correction is
>> > > > used.
>> > > > + * Currently, EDC is supported on Carrizo.
>> > > > + *
>> > > > + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have
>> > > > + the
>> > > > driver
>> > > > + * set what it thinks is best/safest mode.  This may not be the
>> > > > +same as the
>> > > > + * default, depending on the GPU and the application.
>> > > > + * Using a single bit makes it easy to request the best support
>> > > > +without
>> > > > + * needing to know all currently supported modes.
>> > > > + */
>> > > > +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
>> > > > +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
>> > > > +
>> > > >  enum amd_pm_state_type {
>> > > >  	/* not used for dpm */
>> > > >  	POWER_STATE_TYPE_DEFAULT,
>> > > > --
>> > > > 2.7.4
>> > > >
>> > > > _______________________________________________
>> > > > amd-gfx mailing list
>> > > > amd-gfx@lists.freedesktop.org
>> > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>_______________________________________________
>amd-gfx mailing list
>amd-gfx@lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>_______________________________________________
>amd-gfx mailing list
>amd-gfx@lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.
       [not found]                             ` <BN6PR12MB1348857756669A145968B1BBE8DF0-/b2+HYfkarQX0pEhCR5T8QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-06-26 20:53                               ` Panariti, David
  0 siblings, 0 replies; 14+ messages in thread
From: Panariti, David @ 2017-06-26 20:53 UTC (permalink / raw)
  To: Bridgman, John, Xie, AlexBin, Deucher, Alexander,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



> -----Original Message-----
> From: Bridgman, John
> Sent: Monday, June 26, 2017 3:12 PM
> To: Xie, AlexBin <AlexBin.Xie@amd.com>; Panariti, David
> <David.Panariti@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use
> of ECC/EDC.
> 
> Agreed... one person's "best" is another person's "OMG I didn't want that".
[davep] However, I've questioned the sanity of many default choices.
> IMO we should have bits correspond to specific options as much as possible,
> modulo HW capabilities.
[davep] Yes, we can all agree that the word BEST, wasn't.
You can throw tomatoes at me when we're all in Markham.
But it's specifically the "modulo HW capabilities" that makes me want to use a single value to specify BEST (ADVANCED, RELIABLEST, SAFEST, AAGGRESSIVE, PURPLE, COMPUTE, PROFESSIONAL, etc.) 
Please, suggestions for better than BEST.

To summarize, I propose:

DEFAULT: User need do nothing, no parameter.  As AlexBin suggested, this can be a conservative choice of options. Options defined per asic.
ADVANCED: More features than DEFAULT, but nothing known to break.  Things we're sure not everyone would want. Selected by a unique value e.g. (1 << 3).  Options defined per asic.
ALL: All compatible features, mod mutually exclusive or otherwise incompatible to the HW.  Could be specified as 0xffffffff.
BITMASK: The ultimate catchall.  I'd even say no masking out of any bits, caveat usor.  Don't forget, we're users of this interface and who knows what will be useful for dev or testing.
NONE: ras_param=0

DEFAULT, ADVANCED, ALL and NONE could go into a 2 bit field since they're mutually exclusive.

Examples:
CZ and eCZ:
DEFAULT: everything except PROP_FED (halt ip/reboot)
ADVANCED: DEFAULT + PROP_FED.
ALL: Same as ADVANCED.
BITMASK: PROP_FED, no counters (who knows why, but we can do it)

Vega10:
DEFAULT: No ECC
ADVANCED: No ECC.
ALL: ECC
BITMASK: ECC, don't count on your results.

To me, the benefit of ADVANCED and ALL is that I, as a user, would want them to change with new drivers.  I want that logical behavior and don't want to check ChangeLogs to see what new bits I need.
I think, for example, an HPC customer would want to use ADVANCED because that is what we think is the most reliable. 
Basically they're just macros so in the most common cases, users don't need to worry about the bits.

Providing a bitmask argument allows anything to be overridden, and as an advanced user (such as a developer), system evaluator, etc, absolute flexibility is essential.

From AlexD earlier (I may have redundancies):
> BEST could be defined as
> 0xffffffff since each asic would only look at the masked bits for the features it
> supports. 
[davep] This could cause problems if there are mutually exclusive or otherwise incompatible bits.  Then there would need to be code choosing one over the other which is what we'd need to do to define AGGRESSIVE anyway.
Also I've seen fields where 1 enables a feature and others where 1 disables a feature.
This is why I chose a single bit.  The driver will know what to do with BEST, especially if features are added for fixed. It's no more effort than specifying 0xff...f
> I would prefer not to call it BEST though.  Maybe ALL.
[davep] See 0xfff..f for why ALL may not be BEST. Pun intended.

BTW:  I like the RAS_ prefix over ECC, since it is really the overarching concept.
> 
> RAS_NONE 0
> RAS_DEFAULT (1 << 0)
> RAS_VRAM_ECC (1 << 1)
> RAS_SRAM_EDC (1 << 2)
> ...
> RAS_ALL 0xffffffff

[davep] So I see this with a change to RAS_ALL to basically be equivalent to what I had considered BEST and should be a fixed bit such as (1 << 3), and we choose the options based on what we know at the time of a release.
0xff...f as a number as opposed to a bitmask could be used instead of (1 << 3)
> 
> >-----Original Message-----
> >From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> >Of Xie, AlexBin
> >Sent: Monday, June 26, 2017 2:12 PM
> >To: Panariti, David; Deucher, Alexander; amd-gfx@lists.freedesktop.org
> >Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> >use of ECC/EDC.
> >
> >Hi,
> >
> >I have not checked the background of this discussion very closely yet.
> >And you might have known the following.
> >
> >Customers may not want the default setting to change meaning. This is
> >like an API.
> >Example: The application and its environment is already set up and tested.
> >Then if customer updates driver, suddenly driver has some new behavior?
> >Certain serious application definitely does not accept this.
> >
> >IMHO, it is better to avoid vague concepts like "best". It will become
> >rather difficult to define what is best when there are multiple
> >customers with different requirements. Driver is to provide a feature or
> mechanism. "Best"
> >sounds like a policy or a preference from driver side.
> >
> >In my pass work, I generally use default for two cases:
> >1. The default is the most conservative option, which must work. Then
> >the application can choose advanced features by choosing other
> >parameter value/option.
> >2. The default parameter is the compatible behavior before introducing
> >this parameter/option.
> >
> >Regards,
> >Alex Bin
> >
> >
> >-----Original Message-----
> >From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> >Of Panariti, David
> >Sent: Monday, June 26, 2017 12:06 PM
> >To: Deucher, Alexander <Alexander.Deucher@amd.com>; amd-
> >gfx@lists.freedesktop.org
> >Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> >use of ECC/EDC.
> >
> >>> > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
> >>> > default.  That can be our asic specific default setting.  In the
> >>> > case of CZ, that will be disabled until we decide to enable EDC by
> default.
> >>> [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
> >>> I used ECC as the generic term for ECC and EDC, since ECC seems more
> >>> basic (EDC is built on top of ECC).
> >>> If I understand you, we can't do what you want with the current setup.
> >
> >>I'm saying we make ECC_BEST the default config (feel free to re-name
> >>if ECC_DEFAULT).  Each asic can have a different default depending on
> >>what features are ready.  So for CZ, we'd make ECC_BEST equivalent to
> >>disabling ECC for now.  If a user wants to force it on, they can set ECC_EDC.
> >Once EDC is stable on CZ, we can make ECC_BEST be equivalent to
> ECC_EDC.
> >The way the default (ECC_BEST) always maps to the best available
> >combination in that version of the driver.
> >
> >That's not how I meant it to work WRT BEST.
> >Each asic will have a DEFAULT, but that isn't what BEST means.
> >CZ is a good example (when fully implemented).  DEFAULT for CZ is
> >everything except HALT, since, IMO opinion, most people do not want to
> hang or reboot.
> >BEST for CZ would be everything a person most interested in reliability
> >would want, which IMO, includes HALT/reboot.
> >Similar is if something like performance degradation is really bad,
> >DEFAULT would be OFF. BEST would be ON, e.g., if the user's app doesn't
> >trigger the performance problem.
> >The BEST bit is in a fixed position, so that customers don't need to
> >worry what bits are needed for the most reliable performance (in our
> >opinion) on a given asic.
> >And if a customer (or developer) wants some arbitrary set of features,
> >they can set bits as they want.
> >
> >I think DEFAULT will make most people happy.
> >BEST allows people who are interested in everything they can get,
> >regardless of any issues that brings with it. It is requested simply by
> >using a fixed param value (0x01) for any asic.
> >This probably should not include features that have any kind of fatal
> >flaw such as the Vega10 HBM ECC issue.  When fixed, it can be added to
> DEFAULT.
> >And allowing per-feature control allows anyone to do precisely what
> >they want.
> >"Effort" increases as the number of interested users decreases.
> >
> >Using defines in the init code will be a problem if there is more than
> >one kind of asic involved or a single asic that the user wants to use
> >with different parameters.  However, this doesn't seem to be a high
> priority.
> >If we do want to worry about it, then we'll need to put the values into
> >the amdgpu_gfx struct.
> >
> >regards,
> >davep
> >
> >> -----Original Message-----
> >> From: Deucher, Alexander
> >> Sent: Tuesday, June 06, 2017 6:16 PM
> >> To: Panariti, David <David.Panariti@amd.com>; amd-
> >> gfx@lists.freedesktop.org
> >> Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
> >> use of ECC/EDC.
> >>
> >> > -----Original Message-----
> >> > From: Panariti, David
> >> > Sent: Tuesday, June 06, 2017 5:50 PM
> >> > To: Deucher, Alexander; amd-gfx@lists.freedesktop.org
> >> > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to
> >> > control use of ECC/EDC.
> >> >
> >> > Rather than inlining this in a number of places, Re verbosity:
> >> > I've worked in embedded environments and when dealing with
> >> > intermittent problems it's nice to have all of the information ASAP
> >> > rather than waiting for a problem to reoccur, especially if it's
> >> > very
> >> intermittent.
> >> > I would've preferred more.
> >> > Since it only shows up happens on CZ, it adds little to the output.
> >> > I like to show the reasons why EDC didn't happen, hence the
> >> > backwards looking messages.
> >> > In this particular case,  without the "... not requested..." we
> >> > can't tell if it was the flags or the ring being unready that made
> >> > us bail
> >earlier.
> >>
> >> I'm fine with a message about EDC either being enabled or disabled,
> >> but a bunch of random debug statements along the way are too much.
> >> They tend to just cause confusion and clutter up the logs.
> >>
> >> >
> >> > > -----Original Message-----
> >> > > From: Deucher, Alexander
> >> > > Sent: Tuesday, June 06, 2017 5:22 PM
> >> > > To: Panariti, David <David.Panariti@amd.com>; amd-
> >> > > gfx@lists.freedesktop.org
> >> > > Cc: Panariti, David <David.Panariti@amd.com>
> >> > > Subject: RE: [PATCH 3/3] drm/amdgpu: Add kernel parameter to
> >> > > control
> >> > use
> >> > > of ECC/EDC.
> >> > >
> >> > > > -----Original Message-----
> >> > > > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On
> >> > Behalf
> >> > > > Of David Panariti
> >> > > > Sent: Tuesday, June 06, 2017 4:33 PM
> >> > > > To: amd-gfx@lists.freedesktop.org
> >> > > > Cc: Panariti, David
> >> > > > Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to
> >> > > > control use of ECC/EDC.
> >> > > >
> >> > > > Allow various kinds of memory integrity methods (e.g. ECC/EDC)
> >> > > > to be enabled or disabled.  By default, all features are disabled.
> >> > > >
> >> > > > EDC is Error Detection and Correction.  It can detect ECC
> >> > > > errors and do 0 or more of: count SEC (single error corrected)
> >> > > > and DED (double error detected, i.e. uncorrected ECC error),
> >> > > > halt the affected block,
> >> > > interrupt the CPU.
> >> > > > Currently, only counting errors is supported.
> >> > > >
> >> > > > Fixed a whitespace error.
> >> > > >
> >> > > > Signed-off-by: David Panariti <David.Panariti@amd.com>
> >> > > > ---
> >> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
> >> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
> >> > > >  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
> >> > > > ++++++++++++++++++++++++++--
> >> > > >  drivers/gpu/drm/amd/include/amd_shared.h | 14
> ++++++++++++++
> >> > > >  4 files changed, 45 insertions(+), 2 deletions(-)
> >> > > >
> >> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> > > > index a6f51eb..3e930ee 100644
> >> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> > > > @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;
> >> > > > extern
> >> > int
> >> > > > amdgpu_param_buf_per_se;  extern int amdgpu_job_hang_limit;
> >> > extern
> >> > > > int amdgpu_lbpw;
> >> > > > +extern unsigned amdgpu_ecc_flags;
> >> > > >
> >> > > >  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /*
> >> > 3GB by
> >> > > > default */
> >> > > >  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
> >> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> > > > index c4825ff..972660d 100644
> >> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> > > > @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;  int
> >> > > > amdgpu_param_buf_per_se = 0;  int amdgpu_job_hang_limit = 0;
> >> > > > int amdgpu_lbpw = -1;
> >> > > > +unsigned amdgpu_ecc_flags = 0;
> >> > >
> >> > > I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by
> >> > > default.  That can be our asic specific default setting.  In the
> >> > > case of CZ, that will be disabled until we decide to enable EDC by
> default.
> >> > [davep] I'm confused.  ECC...BEST will cause EDC to be enabled.
> >> > I used ECC as the generic term for ECC and EDC, since ECC seems
> >> > more basic (EDC is built on top of ECC).
> >> > If I understand you, we can't do what you want with the current setup.
> >>
> >> I'm saying we make ECC_BEST the default config (feel free to re-name
> >> if ECC_DEFAULT).  Each asic can have a different default depending on
> >> what features are ready.  So for CZ, we'd make ECC_BEST equivalent to
> >> disabling ECC for now.  If a user wants to force it on, they can set
> >> ECC_EDC.  Once EDC is stable on CZ, we can make ECC_BEST be
> >> equivalent to ECC_EDC.  The way the default (ECC_BEST) always maps to
> >> the best available combination in that version of the driver.
> >>
> >> E.g., in the current gfx8 code:
> >>
> >> if ((amdgpu_ecc_flags & ECC_EDC) && edc_fuse_enabled) {
> >>     // enable EDC
> >>     goto enable_edc;
> >> } else {
> >>     // disable EDC
> >>     return 0;
> >> }
> >>
> >> Then if we want to enable EDC by default, we'd change the code like so:
> >>
> >> if (((amdgpu_ecc_flags == ECC_EDC) || (amdgpu_ecc_flags & ECC_EDC))
> >> &&
> >> edc_fuse_enabled) {
> >>     // enable EDC
> >>     goto enable_edc;
> >> } else {
> >>     // disable EDC
> >>     return 0;
> >> }
> >>
> >> That way we can have different ECC features enabled by default for
> >> each asic family without needing to specify command line options.
> >>
> >> Alex
> >>
> >> >
> >> > >
> >> > > >
> >> > > >  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
> >> > > > megabytes");  module_param_named(vramlimit,
> amdgpu_vram_limit,
> >> > int,
> >> > > > 0600); @@ -795,6 +796,9 @@ static struct pci_driver
> >> > > > amdgpu_kms_pci_driver = {
> >> > > >  	.driver.pm = &amdgpu_pm_ops,
> >> > > >  };
> >> > > >
> >> > > > +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 =
> disable
> >> > > > ECC/EDC (default))");
> >> > > > +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
> >> > > > +
> >> > > >
> >> > > >
> >> > > >  static int __init amdgpu_init(void) diff --git
> >> > > > a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> > > > index 46e766e..3b5685c 100644
> >> > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> > > > @@ -1824,7 +1824,17 @@ static int
> >> > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> >> > > >  	if (dis_bit) {
> >> > > >  		/* On Carrizo, EDC may be permanently disabled by a
> >fuse. */
> >> > > >  		DRM_INFO("CZ EDC hardware is disabled,
> >GC_EDC_CONFIG:
> >> > > > 0x%08x.\n",
> >> > > > -			tmp);
> >> > > > +			 tmp);
> >> > > > +		return 0;
> >> > > > +	}
> >> > > > +
> >> > > > +	/*
> >> > > > +	 * Check if EDC has been requested by a kernel parameter.
> >> > > > +	 * For Carrizo, EDC is the best/safest mode WRT error
> >handling.
> >> > > > +	 */
> >> > > > +	if (!(amdgpu_ecc_flags
> >> > > > +	      & (AMD_ECC_SUPPORT_BEST |
> >AMD_ECC_SUPPORT_EDC))) {
> >> > > > +		DRM_INFO("EDC support has not been
> >requested.\n");
> >> > >
> >> > > Can drop this debugging statement or make it DRM_DEBUG.
> >> > [davep] See top post.
> >> > >
> >> > > >  		return 0;
> >> > > >  	}
> >> > > >
> >> > > > @@ -1962,6 +1972,20 @@ static int
> >> > > > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> >> > > >  		goto fail;
> >> > > >  	}
> >> > > >
> >> > > > +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do
> >not halt
> >> > > > */
> >> > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
> >> > > > +	/* Do not propagate the errors to the next block. */
> >> > > > +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
> >> > > > +	WREG32(mmGB_EDC_MODE, tmp);
> >> > > > +
> >> > > > +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> >> > > > +
> >> > > > +	/*
> >> > > > +	 * Clear EDC_DISABLE bit so the counters are available.
> >> > > > +	 */
> >> > > > +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC,
> >0);
> >> > > > +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> >> > > > +
> >> > > >  	gfx_v8_0_edc_clear_counters(adev);
> >> > > >
> >> > > >  fail:
> >> > > > @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void
> >> > *handle)
> >> > > >  		gfx_v8_0_cp_compute_enable(adev, false);
> >> > > >  	}
> >> > > >
> >> > > > -       return 0;
> >> > > > +	return 0;
> >> > > >  }
> >> > > >
> >> > > >  static int gfx_v8_0_soft_reset(void *handle) diff --git
> >> > > > a/drivers/gpu/drm/amd/include/amd_shared.h
> >> > > > b/drivers/gpu/drm/amd/include/amd_shared.h
> >> > > > index 0f58e95..1cf30aa 100644
> >> > > > --- a/drivers/gpu/drm/amd/include/amd_shared.h
> >> > > > +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> >> > > > @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
> >> > > >  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 <<
> 11)
> >> > > >  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
> >> > > >
> >> > > > +/*
> >> > > > + * ECC flags
> >> > > > + * Allows the user to choose what kind of error
> >> > > > +detection/correction is
> >> > > > used.
> >> > > > + * Currently, EDC is supported on Carrizo.
> >> > > > + *
> >> > > > + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to
> >> > > > + have the
> >> > > > driver
> >> > > > + * set what it thinks is best/safest mode.  This may not be
> >> > > > +the same as the
> >> > > > + * default, depending on the GPU and the application.
> >> > > > + * Using a single bit makes it easy to request the best
> >> > > > +support without
> >> > > > + * needing to know all currently supported modes.
> >> > > > + */
> >> > > > +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
> >> > > > +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
> >> > > > +
> >> > > >  enum amd_pm_state_type {
> >> > > >  	/* not used for dpm */
> >> > > >  	POWER_STATE_TYPE_DEFAULT,
> >> > > > --
> >> > > > 2.7.4
> >> > > >
> >> > > > _______________________________________________
> >> > > > amd-gfx mailing list
> >> > > > amd-gfx@lists.freedesktop.org
> >> > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> >_______________________________________________
> >amd-gfx mailing list
> >amd-gfx@lists.freedesktop.org
> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> >_______________________________________________
> >amd-gfx mailing list
> >amd-gfx@lists.freedesktop.org
> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-06-26 20:53 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-06 20:32 Fix CZ EDC workarounds and provide a kernel parameter to control use of EDC David Panariti
     [not found] ` <1496781172-23297-1-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
2017-06-06 20:32   ` [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of re-redundant prototype David Panariti
     [not found]     ` <1496781172-23297-2-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
2017-06-06 21:16       ` Deucher, Alexander
2017-06-06 20:32   ` [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and Correction) workarounds David Panariti
     [not found]     ` <1496781172-23297-3-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
2017-06-06 21:16       ` Deucher, Alexander
2017-06-06 20:32   ` [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC David Panariti
     [not found]     ` <1496781172-23297-4-git-send-email-David.Panariti-5C7GfCeVMHo@public.gmane.org>
2017-06-06 21:22       ` Deucher, Alexander
     [not found]         ` <CY4PR12MB165315F98261CDFE2FBF9E3EF7CB0-rpdhrqHFk06apTa93KjAaQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-06-06 21:49           ` Panariti, David
     [not found]             ` <BN6PR12MB1889419C24B4D3E20C688E2195CB0-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-06-06 22:16               ` Deucher, Alexander
     [not found]                 ` <CY4PR12MB16532F2483501CF138CCACCDF7CB0-rpdhrqHFk06apTa93KjAaQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-06-26 16:06                   ` Panariti, David
     [not found]                     ` <BN6PR12MB117285B9134A4BAD2E7731ED95DF0-/b2+HYfkarTft/eMqzLDqAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-06-26 16:21                       ` Deucher, Alexander
2017-06-26 18:11                       ` Xie, AlexBin
     [not found]                         ` <DM5PR12MB12577898D0B6BD85BFB8670EF2DF0-2J9CzHegvk/NHlLGalgXawdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-06-26 19:12                           ` Bridgman, John
     [not found]                             ` <BN6PR12MB1348857756669A145968B1BBE8DF0-/b2+HYfkarQX0pEhCR5T8QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-06-26 20:53                               ` Panariti, David

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.