All of lore.kernel.org
 help / color / mirror / Atom feed
* CZ EDC param and support
@ 2017-04-27 22:13 Panariti, David
       [not found] ` <BN6PR12MB1889CED9632BAE0144E8AE1795100-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Panariti, David @ 2017-04-27 22:13 UTC (permalink / raw)
  To: gpudriverdevsupport, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 20697 bytes --]

The changes in the workarounds function use DRM_INFO rather than DRM_DEBUG because CZs with EDC are often used in embedded environments and any info can be useful especially in the case of an intermittent problem.

>From e1ce383592c275b58ad95bd80b5479af8c1f9dae Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
Date: Fri, 14 Apr 2017 13:41:52 -0400
Subject: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of
re-redundant prototype.

Will be needed for the rest of the EDC workarounds patch.

Change-Id: Ie586ab38a69e98a91c6cb5747e285ce8bfdd1c86
Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46 +++++++++++++++++------------------
1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2ff5f19..27b57cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1500,6 +1500,29 @@ static int gfx_v8_0_kiq_init(struct amdgpu_device *adev)
               return 0;
}
+static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
+                                                               u32 se_num, u32 sh_num, u32 instance)
+{
+             u32 data;
+
+             if (instance == 0xffffffff)
+                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
+
+             if (se_num == 0xffffffff)
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
+
+             if (sh_num == 0xffffffff)
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
+
+             WREG32(mmGRBM_GFX_INDEX, data);
+}
+
static const u32 vgpr_init_compute_shader[] =
{
               0x7e000209, 0x7e020208,
@@ -3556,29 +3579,6 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev)
               }
}
-static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
-                                                                u32 se_num, u32 sh_num, u32 instance)
-{
-              u32 data;
-
-              if (instance == 0xffffffff)
-                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
-
-              if (se_num == 0xffffffff)
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
-
-              if (sh_num == 0xffffffff)
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
-
-              WREG32(mmGRBM_GFX_INDEX, data);
-}
-
static u32 gfx_v8_0_create_bitmask(u32 bit_width)
{
               return (u32)((1ULL << bit_width) - 1);
--
2.7.4


>From 38fac8cab73dbc07e0ee7599b52106bc09dd32ea Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
Date: Mon, 24 Apr 2017 11:05:45 -0400
Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and
Correction) workarounds.

The workarounds are unconditionally performed on CZs with EDC enabled.
EDC detects uncorrected ECC errors and uses data poisoning to prevent
corrupted compute results from being used (read).
EDC enabled CZs are often used in embedded environments.

Change-Id: I84c261785329beeb797f11efbe0ec35790f2996c
Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 148 ++++++++++++++++++++++++----------
1 file changed, 106 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 27b57cb..2f5bf5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1645,35 +1645,92 @@ static const u32 sgpr2_init_regs[] =
               mmCOMPUTE_USER_DATA_9, 0xedcedc09,
};
-static const u32 sec_ded_counter_registers[] =
-{
-              mmCPC_EDC_ATC_CNT,
-              mmCPC_EDC_SCRATCH_CNT,
-              mmCPC_EDC_UCODE_CNT,
-              mmCPF_EDC_ATC_CNT,
-              mmCPF_EDC_ROQ_CNT,
-              mmCPF_EDC_TAG_CNT,
-              mmCPG_EDC_ATC_CNT,
-              mmCPG_EDC_DMA_CNT,
-              mmCPG_EDC_TAG_CNT,
-              mmDC_EDC_CSINVOC_CNT,
-              mmDC_EDC_RESTORE_CNT,
-              mmDC_EDC_STATE_CNT,
-              mmGDS_EDC_CNT,
-              mmGDS_EDC_GRBM_CNT,
-              mmGDS_EDC_OA_DED,
-              mmSPI_EDC_CNT,
-              mmSQC_ATC_EDC_GATCL1_CNT,
-              mmSQC_EDC_CNT,
-              mmSQ_EDC_DED_CNT,
-              mmSQ_EDC_INFO,
-              mmSQ_EDC_SEC_CNT,
-              mmTCC_EDC_CNT,
-              mmTCP_ATC_EDC_GATCL1_CNT,
-              mmTCP_EDC_CNT,
-              mmTD_EDC_CNT
+struct reg32_counter_name_map {
+             uint32_t rnmap_addr;   /* Counter register address */
+             const char *rnmap_name;           /* Name of the counter */
+             size_t rnmap_num_instances;  /* Number of block instances */
};
+#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {             \
+             .rnmap_addr = mm##reg,                                                                           \
+             .rnmap_name = #reg,                                                                    \
+             .rnmap_num_instances = num_instances                                            \
+}
+
+/* See GRBM_GFX_INDEX, et. al. registers. */
+static const struct reg32_counter_name_map sec_ded_counter_registers[] = {
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
+};
+
+static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
+{
+             int ci, se, sh, i;
+             uint32_t count;
+             int r = 0;
+
+             mutex_lock(&adev->grbm_idx_mutex);
+
+             for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
+                             const struct reg32_counter_name_map *cp =
+                                             sec_ded_counter_registers + ci;
+                             const char *name = cp->rnmap_name;
+
+                             for (se = 0; se < adev->gfx.config.max_shader_engines; ++se) {
+                                             for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
+                                                             for (i = 0; i < cp->rnmap_num_instances; ++i) {
+                                                                             gfx_v8_0_select_se_sh(adev, se, sh, i);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             if (count != 0) {
+                                                                                             /*
+                                                                                             * Workaround failed.
+                                                                                             * If people are interested
+                                                                                             * in EDC at all, they will
+                                                                                             * want to know which
+                                                                                             * counters had problems.
+                                                                                             */
+                                                                                             DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
+                                                                                                             name, count);
+                                                                                             r = -EINVAL;
+                                                                                             goto ret;
+                                                                             }
+                                                             }
+                                             }
+                             }
+             }
+
+ret:
+             gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+             mutex_unlock(&adev->grbm_idx_mutex);
+
+             return r;
+}
+
static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
{
               struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
@@ -1681,18 +1738,36 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
               struct fence *f = NULL;
               int r, i;
               u32 tmp;
+             u32 dis_bit;
               unsigned total_size, vgpr_offset, sgpr_offset;
               u64 gpu_addr;
-              /* only supported on CZ */
-              if (adev->asic_type != CHIP_CARRIZO)
+             if (adev->asic_type != CHIP_CARRIZO) {
+                             /* EDC is only supported on CZ */
+                             return 0;
+             }
+
+        DRM_INFO("Detected Carrizo.\n");
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+             dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
+             if (dis_bit) {
+                             /* On Carrizo, EDC may be disabled by a fuse. */
+                             DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
                               return 0;
+             }
                /* bail if the compute ring is not ready */
               if (!ring->ready)
                               return 0;
-              tmp = RREG32(mmGB_EDC_MODE);
+             DRM_INFO("Applying EDC workarounds.\n");
+
+             /*
+             * Interested parties can enable EDC using debugfs register reads and
+             * writes.
+             */
               WREG32(mmGB_EDC_MODE, 0);
                total_size =
@@ -1817,18 +1892,7 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
-              WREG32(mmGB_EDC_MODE, tmp);
-
-              tmp = RREG32(mmCC_GC_EDC_CONFIG);
-              tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
-              WREG32(mmCC_GC_EDC_CONFIG, tmp);
-
-
-              /* read back registers to clear the counters */
-              for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
-                              RREG32(sec_ded_counter_registers[i]);
+             gfx_v8_0_edc_clear_counters(adev);
 fail:
               amdgpu_ib_free(adev, &ib, NULL);
--
2.7.4


>From ec4803205582c1011f5ced1ead70ee244268b4b8 Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
Date: Wed, 26 Apr 2017 10:13:06 -0400
Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of
ECC/EDC.

Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be enabled
or disabled.  By default, all features are disabled.

EDC is Error Detection and Correction.  It can detect ECC errors and do 0 or
more of: count SEC (single error corrected) and DED (double error detected,
i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
Currently, only counting errors is supported.

Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 34 +++++++++++++++++++++++++++-----
drivers/gpu/drm/amd/include/amd_shared.h | 14 +++++++++++++
4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4a16e3c..0322392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
extern int amdgpu_pos_buf_per_se;
extern int amdgpu_cntl_sb_buf_per_se;
extern int amdgpu_param_buf_per_se;
+extern unsigned amdgpu_ecc_flags;
 #define AMDGPU_DEFAULT_GTT_SIZE_MB                      3072ULL /* 3GB by default */
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS                   3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ead00d7..00e16ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -110,6 +110,7 @@ int amdgpu_prim_buf_per_se = 0;
int amdgpu_pos_buf_per_se = 0;
int amdgpu_cntl_sb_buf_per_se = 0;
int amdgpu_param_buf_per_se = 0;
+unsigned amdgpu_ecc_flags = 0;
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -235,6 +236,9 @@ module_param_named(cntl_sb_buf_per_se, amdgpu_cntl_sb_buf_per_se, int, 0444);
MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater Cache per Shader Engine (default depending on gfx)");
module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
+MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable ECC/EDC (default))");
+module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
+
 static const struct pci_device_id pciidlist[] = {
#ifdef  CONFIG_DRM_AMDGPU_SI
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2f5bf5f..05cab7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1708,7 +1708,7 @@ static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
                                                                               count = RREG32(cp->rnmap_addr);
                                                                               if (count != 0) {
                                                                                               /*
-                                                                                              * Workaround failed.
+                                                                                             * EDC workaround failed.
                                                                                                * If people are interested
                                                                                                * in EDC at all, they will
                                                                                                * want to know which
@@ -1747,14 +1747,24 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               return 0;
               }
-        DRM_INFO("Detected Carrizo.\n");
+             DRM_INFO("Detected Carrizo.\n");
                tmp = RREG32(mmCC_GC_EDC_CONFIG);
               dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
               if (dis_bit) {
-                              /* On Carrizo, EDC may be disabled by a fuse. */
-                              DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
-                                              tmp);
+                             /* On Carrizo, EDC may be disabled permanently by a fuse. */
+                             DRM_INFO("Carrizo EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
+                             return 0;
+             }
+
+             /*
+             * Check if EDC has been requested by a kernel parameter.
+             * For Carrizo, EDC is the best/safest mode WRT error handling.
+             */
+             if (!(amdgpu_ecc_flags
+                   & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
+                             DRM_INFO("EDC support has not been requested.\n");
                               return 0;
               }
@@ -1892,6 +1902,20 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
+             /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
+             /* Do not propagate the errors to the next block. */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
+             WREG32(mmGB_EDC_MODE, tmp);
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+
+             /*
+             * Clear EDC_DISABLE bit so the counters are available.
+             */
+             tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
+             WREG32(mmCC_GC_EDC_CONFIG, tmp);
+
               gfx_v8_0_edc_clear_counters(adev);
 fail:
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 2ccf44e..c4fd013 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -179,6 +179,20 @@ struct amd_pp_profile {
#define AMD_PG_SUPPORT_GFX_QUICK_MG                                (1 << 11)
#define AMD_PG_SUPPORT_GFX_PIPELINE                     (1 << 12)
+/*
+ * ECC flags
+ * Allows the user to choose what kind of error detection/correction is used.
+ * Currently, EDC is supported on Carrizo.
+ *
+ * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the driver
+ * set what it thinks is best/safest mode.  This may not be the same as the
+ * default, depending on the GPU and the application.
+ * Using a single bit makes it easy to request the best support without
+ * needing to know all currently supported modes.
+ */
+#define AMD_ECC_SUPPORT_BEST                                     (1 << 0)
+#define AMD_ECC_SUPPORT_EDC                                       (1 << 1)
+
enum amd_pm_state_type {
               /* not used for dpm */
               POWER_STATE_TYPE_DEFAULT,
--
2.7.4


[-- Attachment #1.2: Type: text/html, Size: 68903 bytes --]

[-- Attachment #2: all-edc.patch --]
[-- Type: application/octet-stream, Size: 14445 bytes --]

From e1ce383592c275b58ad95bd80b5479af8c1f9dae Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@amd.com>
Date: Fri, 14 Apr 2017 13:41:52 -0400
Subject: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of
 re-redundant prototype.

Will be needed for the rest of the EDC workarounds patch.

Change-Id: Ie586ab38a69e98a91c6cb5747e285ce8bfdd1c86
Signed-off-by: David Panariti <David.Panariti@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46 +++++++++++++++++------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2ff5f19..27b57cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1500,6 +1500,29 @@ static int gfx_v8_0_kiq_init(struct amdgpu_device *adev)
 	return 0;
 }
 
+static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
+				  u32 se_num, u32 sh_num, u32 instance)
+{
+	u32 data;
+
+	if (instance == 0xffffffff)
+		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
+	else
+		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
+
+	if (se_num == 0xffffffff)
+		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
+	else
+		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
+
+	if (sh_num == 0xffffffff)
+		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
+	else
+		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
+
+	WREG32(mmGRBM_GFX_INDEX, data);
+}
+
 static const u32 vgpr_init_compute_shader[] =
 {
 	0x7e000209, 0x7e020208,
@@ -3556,29 +3579,6 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev)
 	}
 }
 
-static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
-				  u32 se_num, u32 sh_num, u32 instance)
-{
-	u32 data;
-
-	if (instance == 0xffffffff)
-		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
-	else
-		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
-
-	if (se_num == 0xffffffff)
-		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
-	else
-		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
-
-	if (sh_num == 0xffffffff)
-		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
-	else
-		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
-
-	WREG32(mmGRBM_GFX_INDEX, data);
-}
-
 static u32 gfx_v8_0_create_bitmask(u32 bit_width)
 {
 	return (u32)((1ULL << bit_width) - 1);
-- 
2.7.4


From 38fac8cab73dbc07e0ee7599b52106bc09dd32ea Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@amd.com>
Date: Mon, 24 Apr 2017 11:05:45 -0400
Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and
 Correction) workarounds.

The workarounds are unconditionally performed on CZs with EDC enabled.
EDC detects uncorrected ECC errors and uses data poisoning to prevent
corrupted compute results from being used (read).
EDC enabled CZs are often used in embedded environments.

Change-Id: I84c261785329beeb797f11efbe0ec35790f2996c
Signed-off-by: David Panariti <David.Panariti@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 148 ++++++++++++++++++++++++----------
 1 file changed, 106 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 27b57cb..2f5bf5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1645,35 +1645,92 @@ static const u32 sgpr2_init_regs[] =
 	mmCOMPUTE_USER_DATA_9, 0xedcedc09,
 };
 
-static const u32 sec_ded_counter_registers[] =
-{
-	mmCPC_EDC_ATC_CNT,
-	mmCPC_EDC_SCRATCH_CNT,
-	mmCPC_EDC_UCODE_CNT,
-	mmCPF_EDC_ATC_CNT,
-	mmCPF_EDC_ROQ_CNT,
-	mmCPF_EDC_TAG_CNT,
-	mmCPG_EDC_ATC_CNT,
-	mmCPG_EDC_DMA_CNT,
-	mmCPG_EDC_TAG_CNT,
-	mmDC_EDC_CSINVOC_CNT,
-	mmDC_EDC_RESTORE_CNT,
-	mmDC_EDC_STATE_CNT,
-	mmGDS_EDC_CNT,
-	mmGDS_EDC_GRBM_CNT,
-	mmGDS_EDC_OA_DED,
-	mmSPI_EDC_CNT,
-	mmSQC_ATC_EDC_GATCL1_CNT,
-	mmSQC_EDC_CNT,
-	mmSQ_EDC_DED_CNT,
-	mmSQ_EDC_INFO,
-	mmSQ_EDC_SEC_CNT,
-	mmTCC_EDC_CNT,
-	mmTCP_ATC_EDC_GATCL1_CNT,
-	mmTCP_EDC_CNT,
-	mmTD_EDC_CNT
+struct reg32_counter_name_map {
+	uint32_t rnmap_addr;	/* Counter register address */
+	const char *rnmap_name;	/* Name of the counter */
+	size_t rnmap_num_instances;  /* Number of block instances */
 };
 
+#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {	\
+	.rnmap_addr = mm##reg,					\
+	.rnmap_name = #reg,					\
+	.rnmap_num_instances = num_instances			\
+}
+
+/* See GRBM_GFX_INDEX, et. al. registers. */
+static const struct reg32_counter_name_map sec_ded_counter_registers[] = {
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
+};
+
+static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
+{
+	int ci, se, sh, i;
+	uint32_t count;
+	int r = 0;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
+		const struct reg32_counter_name_map *cp =
+			sec_ded_counter_registers + ci;
+		const char *name = cp->rnmap_name;
+
+		for (se = 0; se < adev->gfx.config.max_shader_engines; ++se) {
+			for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
+				for (i = 0; i < cp->rnmap_num_instances; ++i) {
+					gfx_v8_0_select_se_sh(adev, se, sh, i);
+					count = RREG32(cp->rnmap_addr);
+					count = RREG32(cp->rnmap_addr);
+					if (count != 0) {
+						/*
+						 * Workaround failed.
+						 * If people are interested
+						 * in EDC at all, they will
+						 * want to know which
+						 * counters had problems.
+						 */
+						DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
+							 name, count);
+						r = -EINVAL;
+						goto ret;
+					}
+				}
+			}
+		}
+	}
+
+ret:
+	gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return r;
+}
+
 static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 {
 	struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
@@ -1681,18 +1738,36 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 	struct fence *f = NULL;
 	int r, i;
 	u32 tmp;
+	u32 dis_bit;
 	unsigned total_size, vgpr_offset, sgpr_offset;
 	u64 gpu_addr;
 
-	/* only supported on CZ */
-	if (adev->asic_type != CHIP_CARRIZO)
+	if (adev->asic_type != CHIP_CARRIZO) {
+		/* EDC is only supported on CZ */
+		return 0;
+	}
+
+        DRM_INFO("Detected Carrizo.\n");
+
+	tmp = RREG32(mmCC_GC_EDC_CONFIG);
+	dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
+	if (dis_bit) {
+		/* On Carrizo, EDC may be disabled by a fuse. */
+		DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+			tmp);
 		return 0;
+	}
 
 	/* bail if the compute ring is not ready */
 	if (!ring->ready)
 		return 0;
 
-	tmp = RREG32(mmGB_EDC_MODE);
+	DRM_INFO("Applying EDC workarounds.\n");
+
+	/*
+	 * Interested parties can enable EDC using debugfs register reads and
+	 * writes.
+	 */
 	WREG32(mmGB_EDC_MODE, 0);
 
 	total_size =
@@ -1817,18 +1892,7 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 		goto fail;
 	}
 
-	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
-	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
-	WREG32(mmGB_EDC_MODE, tmp);
-
-	tmp = RREG32(mmCC_GC_EDC_CONFIG);
-	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
-	WREG32(mmCC_GC_EDC_CONFIG, tmp);
-
-
-	/* read back registers to clear the counters */
-	for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
-		RREG32(sec_ded_counter_registers[i]);
+	gfx_v8_0_edc_clear_counters(adev);
 
 fail:
 	amdgpu_ib_free(adev, &ib, NULL);
-- 
2.7.4


From ec4803205582c1011f5ced1ead70ee244268b4b8 Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@amd.com>
Date: Wed, 26 Apr 2017 10:13:06 -0400
Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of
 ECC/EDC.

Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be enabled
or disabled.  By default, all features are disabled.

EDC is Error Detection and Correction.  It can detect ECC errors and do 0 or
more of: count SEC (single error corrected) and DED (double error detected,
i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
Currently, only counting errors is supported.

Signed-off-by: David Panariti <David.Panariti@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 34 +++++++++++++++++++++++++++-----
 drivers/gpu/drm/amd/include/amd_shared.h | 14 +++++++++++++
 4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4a16e3c..0322392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
 extern int amdgpu_pos_buf_per_se;
 extern int amdgpu_cntl_sb_buf_per_se;
 extern int amdgpu_param_buf_per_se;
+extern unsigned amdgpu_ecc_flags;
 
 #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /* 3GB by default */
 #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ead00d7..00e16ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -110,6 +110,7 @@ int amdgpu_prim_buf_per_se = 0;
 int amdgpu_pos_buf_per_se = 0;
 int amdgpu_cntl_sb_buf_per_se = 0;
 int amdgpu_param_buf_per_se = 0;
+unsigned amdgpu_ecc_flags = 0;
 
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
 module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -235,6 +236,9 @@ module_param_named(cntl_sb_buf_per_se, amdgpu_cntl_sb_buf_per_se, int, 0444);
 MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater Cache per Shader Engine (default depending on gfx)");
 module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
 
+MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable ECC/EDC (default))");
+module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
+
 
 static const struct pci_device_id pciidlist[] = {
 #ifdef  CONFIG_DRM_AMDGPU_SI
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2f5bf5f..05cab7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1708,7 +1708,7 @@ static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
 					count = RREG32(cp->rnmap_addr);
 					if (count != 0) {
 						/*
-						 * Workaround failed.
+						 * EDC workaround failed.
 						 * If people are interested
 						 * in EDC at all, they will
 						 * want to know which
@@ -1747,14 +1747,24 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 		return 0;
 	}
 
-        DRM_INFO("Detected Carrizo.\n");
+	DRM_INFO("Detected Carrizo.\n");
 
 	tmp = RREG32(mmCC_GC_EDC_CONFIG);
 	dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
 	if (dis_bit) {
-		/* On Carrizo, EDC may be disabled by a fuse. */
-		DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
-			tmp);
+		/* On Carrizo, EDC may be disabled permanently by a fuse. */
+		DRM_INFO("Carrizo EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+			 tmp);
+		return 0;
+	}
+
+	/*
+	 * Check if EDC has been requested by a kernel parameter.
+	 * For Carrizo, EDC is the best/safest mode WRT error handling.
+	 */
+	if (!(amdgpu_ecc_flags
+	      & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
+		DRM_INFO("EDC support has not been requested.\n");
 		return 0;
 	}
 
@@ -1892,6 +1902,20 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 		goto fail;
 	}
 
+	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt */
+	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
+	/* Do not propagate the errors to the next block. */
+	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
+	WREG32(mmGB_EDC_MODE, tmp);
+
+	tmp = RREG32(mmCC_GC_EDC_CONFIG);
+
+	/*
+	 * Clear EDC_DISABLE bit so the counters are available.
+	 */
+	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
+	WREG32(mmCC_GC_EDC_CONFIG, tmp);
+
 	gfx_v8_0_edc_clear_counters(adev);
 
 fail:
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 2ccf44e..c4fd013 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -179,6 +179,20 @@ struct amd_pp_profile {
 #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
 #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
 
+/*
+ * ECC flags
+ * Allows the user to choose what kind of error detection/correction is used.
+ * Currently, EDC is supported on Carrizo.
+ *
+ * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the driver
+ * set what it thinks is best/safest mode.  This may not be the same as the
+ * default, depending on the GPU and the application.
+ * Using a single bit makes it easy to request the best support without
+ * needing to know all currently supported modes.
+ */
+#define AMD_ECC_SUPPORT_BEST			(1 << 0)
+#define AMD_ECC_SUPPORT_EDC			(1 << 1)
+
 enum amd_pm_state_type {
 	/* not used for dpm */
 	POWER_STATE_TYPE_DEFAULT,
-- 
2.7.4


[-- Attachment #3: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: CZ EDC param and support
       [not found] ` <BN6PR12MB1889CED9632BAE0144E8AE1795100-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-04-28  8:05   ` Christian König
       [not found]     ` <40321e5a-36c5-6e07-8176-bcd596e7e982-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2017-04-28  8:05 UTC (permalink / raw)
  To: Panariti, David, gpudriverdevsupport,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 19081 bytes --]

You somehow messed up the attachment.

Instead of individual files everything is squashed together as 
all-edc.patch.

Please fix that otherwise proper review won't be possible.

Christian.

Am 28.04.2017 um 00:13 schrieb Panariti, David:
>
> The changes in the workarounds function use DRM_INFO rather than 
> DRM_DEBUG because CZs with EDC are often used in embedded environments 
> and any info can be useful especially in the case of an intermittent 
> problem.
>
> From e1ce383592c275b58ad95bd80b5479af8c1f9dae Mon Sep 17 00:00:00 2001
>
> From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
> Date: Fri, 14 Apr 2017 13:41:52 -0400
>
> Subject: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of
>
> re-redundant prototype.
>
> Will be needed for the rest of the EDC workarounds patch.
>
> Change-Id: Ie586ab38a69e98a91c6cb5747e285ce8bfdd1c86
>
> Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
> ---
>
> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46 
> +++++++++++++++++------------------
>
> 1 file changed, 23 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> index 2ff5f19..27b57cb 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> @@ -1500,6 +1500,29 @@ static int gfx_v8_0_kiq_init(struct 
> amdgpu_device *adev)
>
>                return 0;
>
> }
>
> +static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
>
> +   u32 se_num, u32 sh_num, u32 instance)
>
> +{
>
> +             u32 data;
>
> +
>
> +             if (instance == 0xffffffff)
>
> +                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, 
> INSTANCE_BROADCAST_WRITES, 1);
>
> +             else
>
> +                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, 
> INSTANCE_INDEX, instance);
>
> +
>
> +             if (se_num == 0xffffffff)
>
> +                             data = REG_SET_FIELD(data, 
> GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
>
> +             else
>
> +                             data = REG_SET_FIELD(data, 
> GRBM_GFX_INDEX, SE_INDEX, se_num);
>
> +
>
> +             if (sh_num == 0xffffffff)
>
> +                             data = REG_SET_FIELD(data, 
> GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
>
> +             else
>
> +                             data = REG_SET_FIELD(data, 
> GRBM_GFX_INDEX, SH_INDEX, sh_num);
>
> +
>
> +             WREG32(mmGRBM_GFX_INDEX, data);
>
> +}
>
> +
>
> static const u32 vgpr_init_compute_shader[] =
>
> {
>
>                0x7e000209, 0x7e020208,
>
> @@ -3556,29 +3579,6 @@ static void 
> gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev)
>
>                }
>
> }
>
> -static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
>
> -   u32 se_num, u32 sh_num, u32 instance)
>
> -{
>
> -              u32 data;
>
> -
>
> -              if (instance == 0xffffffff)
>
> -                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, 
> INSTANCE_BROADCAST_WRITES, 1);
>
> -              else
>
> -                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, 
> INSTANCE_INDEX, instance);
>
> -
>
> -              if (se_num == 0xffffffff)
>
> -                              data = REG_SET_FIELD(data, 
> GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
>
> -              else
>
> -                              data = REG_SET_FIELD(data, 
> GRBM_GFX_INDEX, SE_INDEX, se_num);
>
> -
>
> -              if (sh_num == 0xffffffff)
>
> -                              data = REG_SET_FIELD(data, 
> GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
>
> -              else
>
> -                              data = REG_SET_FIELD(data, 
> GRBM_GFX_INDEX, SH_INDEX, sh_num);
>
> -
>
> -              WREG32(mmGRBM_GFX_INDEX, data);
>
> -}
>
> -
>
> static u32 gfx_v8_0_create_bitmask(u32 bit_width)
>
> {
>
>                return (u32)((1ULL << bit_width) - 1);
>
> -- 
>
> 2.7.4
>
> From 38fac8cab73dbc07e0ee7599b52106bc09dd32ea Mon Sep 17 00:00:00 2001
>
> From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
> Date: Mon, 24 Apr 2017 11:05:45 -0400
>
> Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and
>
> Correction) workarounds.
>
> The workarounds are unconditionally performed on CZs with EDC enabled.
>
> EDC detects uncorrected ECC errors and uses data poisoning to prevent
>
> corrupted compute results from being used (read).
>
> EDC enabled CZs are often used in embedded environments.
>
> Change-Id: I84c261785329beeb797f11efbe0ec35790f2996c
>
> Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
> ---
>
> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 148 
> ++++++++++++++++++++++++----------
>
> 1 file changed, 106 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> index 27b57cb..2f5bf5f 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> @@ -1645,35 +1645,92 @@ static const u32 sgpr2_init_regs[] =
>
>                mmCOMPUTE_USER_DATA_9, 0xedcedc09,
>
> };
>
> -static const u32 sec_ded_counter_registers[] =
>
> -{
>
> -              mmCPC_EDC_ATC_CNT,
>
> -              mmCPC_EDC_SCRATCH_CNT,
>
> -              mmCPC_EDC_UCODE_CNT,
>
> -              mmCPF_EDC_ATC_CNT,
>
> -              mmCPF_EDC_ROQ_CNT,
>
> -              mmCPF_EDC_TAG_CNT,
>
> -              mmCPG_EDC_ATC_CNT,
>
> -              mmCPG_EDC_DMA_CNT,
>
> -              mmCPG_EDC_TAG_CNT,
>
> -              mmDC_EDC_CSINVOC_CNT,
>
> -              mmDC_EDC_RESTORE_CNT,
>
> -              mmDC_EDC_STATE_CNT,
>
> -              mmGDS_EDC_CNT,
>
> -              mmGDS_EDC_GRBM_CNT,
>
> -              mmGDS_EDC_OA_DED,
>
> -              mmSPI_EDC_CNT,
>
> -              mmSQC_ATC_EDC_GATCL1_CNT,
>
> -              mmSQC_EDC_CNT,
>
> -              mmSQ_EDC_DED_CNT,
>
> -              mmSQ_EDC_INFO,
>
> -              mmSQ_EDC_SEC_CNT,
>
> -              mmTCC_EDC_CNT,
>
> -              mmTCP_ATC_EDC_GATCL1_CNT,
>
> -              mmTCP_EDC_CNT,
>
> -              mmTD_EDC_CNT
>
> +struct reg32_counter_name_map {
>
> +             uint32_t rnmap_addr;   /* Counter register address */
>
> +             const char *rnmap_name;           /* Name of the counter */
>
> +             size_t rnmap_num_instances; /* Number of block instances */
>
> };
>
> +#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {             \
>
> +             .rnmap_addr = mm##reg, \
>
> +             .rnmap_name = #reg, \
>
> +             .rnmap_num_instances = 
> num_instances                                            \
>
> +}
>
> +
>
> +/* See GRBM_GFX_INDEX, et. al. registers. */
>
> +static const struct reg32_counter_name_map 
> sec_ded_counter_registers[] = {
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
>
> +
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
>
> +
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
>
> +
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
>
> + DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
>
> +};
>
> +
>
> +static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
>
> +{
>
> +             int ci, se, sh, i;
>
> +             uint32_t count;
>
> +             int r = 0;
>
> +
>
> + mutex_lock(&adev->grbm_idx_mutex);
>
> +
>
> +             for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); 
> ++ci) {
>
> +                             const struct reg32_counter_name_map *cp =
>
> + sec_ded_counter_registers + ci;
>
> +                             const char *name = cp->rnmap_name;
>
> +
>
> +                             for (se = 0; se < 
> adev->gfx.config.max_shader_engines; ++se) {
>
> + for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
>
> + for (i = 0; i < cp->rnmap_num_instances; ++i) {
>
> + gfx_v8_0_select_se_sh(adev, se, sh, i);
>
> + count = RREG32(cp->rnmap_addr);
>
> + count = RREG32(cp->rnmap_addr);
>
> + if (count != 0) {
>
> + /*
>
> + * Workaround failed.
>
> + * If people are interested
>
> + * in EDC at all, they will
>
> + * want to know which
>
> + * counters had problems.
>
> + */
>
> + DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
>
> + name, count);
>
> + r = -EINVAL;
>
> + goto ret;
>
> + }
>
> + }
>
> + }
>
> +                             }
>
> +             }
>
> +
>
> +ret:
>
> +             gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 
> 0xffffffff);
>
> + mutex_unlock(&adev->grbm_idx_mutex);
>
> +
>
> +             return r;
>
> +}
>
> +
>
> static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
> {
>
>                struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
>
> @@ -1681,18 +1738,36 @@ static int 
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>                struct fence *f = NULL;
>
>                int r, i;
>
>                u32 tmp;
>
> +             u32 dis_bit;
>
>                unsigned total_size, vgpr_offset, sgpr_offset;
>
>                u64 gpu_addr;
>
> -              /* only supported on CZ */
>
> -              if (adev->asic_type != CHIP_CARRIZO)
>
> +             if (adev->asic_type != CHIP_CARRIZO) {
>
> +                             /* EDC is only supported on CZ */
>
> +                             return 0;
>
> +             }
>
> +
>
> +        DRM_INFO("Detected Carrizo.\n");
>
> +
>
> +             tmp = RREG32(mmCC_GC_EDC_CONFIG);
>
> +             dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
>
> +             if (dis_bit) {
>
> +                             /* On Carrizo, EDC may be disabled by a 
> fuse. */
>
> +                             DRM_INFO("EDC hardware is disabled, 
> GC_EDC_CONFIG: 0x%08x.\n",
>
> + tmp);
>
>                                return 0;
>
> +             }
>
>                 /* bail if the compute ring is not ready */
>
>                if (!ring->ready)
>
>                                return 0;
>
> -              tmp = RREG32(mmGB_EDC_MODE);
>
> +             DRM_INFO("Applying EDC workarounds.\n");
>
> +
>
> +             /*
>
> +             * Interested parties can enable EDC using debugfs 
> register reads and
>
> +             * writes.
>
> +             */
>
>                WREG32(mmGB_EDC_MODE, 0);
>
>                 total_size =
>
> @@ -1817,18 +1892,7 @@ static int 
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>                                goto fail;
>
>                }
>
> -              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
>
> -              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
>
> -              WREG32(mmGB_EDC_MODE, tmp);
>
> -
>
> -              tmp = RREG32(mmCC_GC_EDC_CONFIG);
>
> -              tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
>
> -              WREG32(mmCC_GC_EDC_CONFIG, tmp);
>
> -
>
> -
>
> -              /* read back registers to clear the counters */
>
> -              for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
>
> - RREG32(sec_ded_counter_registers[i]);
>
> + gfx_v8_0_edc_clear_counters(adev);
>
>  fail:
>
>                amdgpu_ib_free(adev, &ib, NULL);
>
> -- 
>
> 2.7.4
>
> From ec4803205582c1011f5ced1ead70ee244268b4b8 Mon Sep 17 00:00:00 2001
>
> From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
> Date: Wed, 26 Apr 2017 10:13:06 -0400
>
> Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of
>
> ECC/EDC.
>
> Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be 
> enabled
>
> or disabled.  By default, all features are disabled.
>
> EDC is Error Detection and Correction.  It can detect ECC errors and 
> do 0 or
>
> more of: count SEC (single error corrected) and DED (double error 
> detected,
>
> i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
>
> Currently, only counting errors is supported.
>
> Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
> ---
>
> drivers/gpu/drm/amd/amdgpu/amdgpu.h      | 1 +
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  | 4 ++++
>
> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 34 
> +++++++++++++++++++++++++++-----
>
> drivers/gpu/drm/amd/include/amd_shared.h | 14 +++++++++++++
>
> 4 files changed, 48 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
> index 4a16e3c..0322392 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
> @@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
>
> extern int amdgpu_pos_buf_per_se;
>
> extern int amdgpu_cntl_sb_buf_per_se;
>
> extern int amdgpu_param_buf_per_se;
>
> +extern unsigned amdgpu_ecc_flags;
>
>  #define AMDGPU_DEFAULT_GTT_SIZE_MB                      3072ULL /* 
> 3GB by default */
>
> #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS                   3000
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>
> index ead00d7..00e16ac 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>
> @@ -110,6 +110,7 @@ int amdgpu_prim_buf_per_se = 0;
>
> int amdgpu_pos_buf_per_se = 0;
>
> int amdgpu_cntl_sb_buf_per_se = 0;
>
> int amdgpu_param_buf_per_se = 0;
>
> +unsigned amdgpu_ecc_flags = 0;
>
>  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
>
> module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
>
> @@ -235,6 +236,9 @@ module_param_named(cntl_sb_buf_per_se, 
> amdgpu_cntl_sb_buf_per_se, int, 0444);
>
> MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater 
> Cache per Shader Engine (default depending on gfx)");
>
> module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
>
> +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable 
> ECC/EDC (default))");
>
> +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
>
> +
>
>  static const struct pci_device_id pciidlist[] = {
>
> #ifdef  CONFIG_DRM_AMDGPU_SI
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> index 2f5bf5f..05cab7e 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
> @@ -1708,7 +1708,7 @@ static int gfx_v8_0_edc_clear_counters(struct 
> amdgpu_device *adev)
>
> count = RREG32(cp->rnmap_addr);
>
> if (count != 0) {
>
> /*
>
> - * Workaround failed.
>
> + * EDC workaround failed.
>
>  * If people are interested
>
>  * in EDC at all, they will
>
>  * want to know which
>
> @@ -1747,14 +1747,24 @@ static int 
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>                                return 0;
>
>                }
>
> -        DRM_INFO("Detected Carrizo.\n");
>
> +             DRM_INFO("Detected Carrizo.\n");
>
>                 tmp = RREG32(mmCC_GC_EDC_CONFIG);
>
>                dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
>
>                if (dis_bit) {
>
> -                              /* On Carrizo, EDC may be disabled by a 
> fuse. */
>
> - DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
>
> - tmp);
>
> +                             /* On Carrizo, EDC may be disabled 
> permanently by a fuse. */
>
> + DRM_INFO("Carrizo EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
>
> + tmp);
>
> +                             return 0;
>
> +             }
>
> +
>
> +             /*
>
> +             * Check if EDC has been requested by a kernel parameter.
>
> +             * For Carrizo, EDC is the best/safest mode WRT error 
> handling.
>
> +             */
>
> +             if (!(amdgpu_ecc_flags
>
> +                   & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
>
> +                             DRM_INFO("EDC support has not been 
> requested.\n");
>
>                                return 0;
>
>                }
>
> @@ -1892,6 +1902,20 @@ static int 
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>                                goto fail;
>
>                }
>
> +             /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not 
> halt */
>
> +             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
>
> +             /* Do not propagate the errors to the next block. */
>
> +             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
>
> +             WREG32(mmGB_EDC_MODE, tmp);
>
> +
>
> +             tmp = RREG32(mmCC_GC_EDC_CONFIG);
>
> +
>
> +             /*
>
> +             * Clear EDC_DISABLE bit so the counters are available.
>
> +             */
>
> +             tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
>
> +             WREG32(mmCC_GC_EDC_CONFIG, tmp);
>
> +
>
> gfx_v8_0_edc_clear_counters(adev);
>
>  fail:
>
> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h 
> b/drivers/gpu/drm/amd/include/amd_shared.h
>
> index 2ccf44e..c4fd013 100644
>
> --- a/drivers/gpu/drm/amd/include/amd_shared.h
>
> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
>
> @@ -179,6 +179,20 @@ struct amd_pp_profile {
>
> #define AMD_PG_SUPPORT_GFX_QUICK_MG                                (1 
> << 11)
>
> #define AMD_PG_SUPPORT_GFX_PIPELINE                     (1 << 12)
>
> +/*
>
> + * ECC flags
>
> + * Allows the user to choose what kind of error detection/correction 
> is used.
>
> + * Currently, EDC is supported on Carrizo.
>
> + *
>
> + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the 
> driver
>
> + * set what it thinks is best/safest mode.  This may not be the same 
> as the
>
> + * default, depending on the GPU and the application.
>
> + * Using a single bit makes it easy to request the best support without
>
> + * needing to know all currently supported modes.
>
> + */
>
> +#define AMD_ECC_SUPPORT_BEST                                     (1 << 0)
>
> +#define AMD_ECC_SUPPORT_EDC                                       (1 
> << 1)
>
> +
>
> enum amd_pm_state_type {
>
>                /* not used for dpm */
>
>                POWER_STATE_TYPE_DEFAULT,
>
> -- 
>
> 2.7.4
>


[-- Attachment #1.2: Type: text/html, Size: 44608 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: CZ EDC param and support
       [not found]     ` <40321e5a-36c5-6e07-8176-bcd596e7e982-5C7GfCeVMHo@public.gmane.org>
@ 2017-04-28 14:18       ` Panariti, David
       [not found]         ` <BN6PR12MB1889C8C94B10AA6E3DF6BD6495130-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Panariti, David @ 2017-04-28 14:18 UTC (permalink / raw)
  To: Koenig, Christian, gpudriverdevsupport,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 21754 bytes --]

Actually, the attachment was an oversight.
It's easier for me to attach, open the attachment and then delete the attachment.
I got only 2/3 this time.
I've gotten a comment that inline patches are preferred.

Sorry for the inconvenience.

davep

From: Koenig, Christian
Sent: Friday, April 28, 2017 4:06 AM
To: Panariti, David <David.Panariti-5C7GfCeVMHo@public.gmane.org>; gpudriverdevsupport <gpudriverdevsupport-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: Re: CZ EDC param and support

You somehow messed up the attachment.

Instead of individual files everything is squashed together as all-edc.patch.

Please fix that otherwise proper review won't be possible.

Christian.

Am 28.04.2017 um 00:13 schrieb Panariti, David:
The changes in the workarounds function use DRM_INFO rather than DRM_DEBUG because CZs with EDC are often used in embedded environments and any info can be useful especially in the case of an intermittent problem.

>From e1ce383592c275b58ad95bd80b5479af8c1f9dae Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org><mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
Date: Fri, 14 Apr 2017 13:41:52 -0400
Subject: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of
re-redundant prototype.

Will be needed for the rest of the EDC workarounds patch.

Change-Id: Ie586ab38a69e98a91c6cb5747e285ce8bfdd1c86
Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org><mailto:David.Panariti@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46 +++++++++++++++++------------------
1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2ff5f19..27b57cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1500,6 +1500,29 @@ static int gfx_v8_0_kiq_init(struct amdgpu_device *adev)
               return 0;
}
+static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
+                                                               u32 se_num, u32 sh_num, u32 instance)
+{
+             u32 data;
+
+             if (instance == 0xffffffff)
+                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
+
+             if (se_num == 0xffffffff)
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
+
+             if (sh_num == 0xffffffff)
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
+
+             WREG32(mmGRBM_GFX_INDEX, data);
+}
+
static const u32 vgpr_init_compute_shader[] =
{
               0x7e000209, 0x7e020208,
@@ -3556,29 +3579,6 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev)
               }
}
-static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
-                                                                u32 se_num, u32 sh_num, u32 instance)
-{
-              u32 data;
-
-              if (instance == 0xffffffff)
-                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
-
-              if (se_num == 0xffffffff)
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
-
-              if (sh_num == 0xffffffff)
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
-
-              WREG32(mmGRBM_GFX_INDEX, data);
-}
-
static u32 gfx_v8_0_create_bitmask(u32 bit_width)
{
               return (u32)((1ULL << bit_width) - 1);
--
2.7.4


>From 38fac8cab73dbc07e0ee7599b52106bc09dd32ea Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org><mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
Date: Mon, 24 Apr 2017 11:05:45 -0400
Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and
Correction) workarounds.

The workarounds are unconditionally performed on CZs with EDC enabled.
EDC detects uncorrected ECC errors and uses data poisoning to prevent
corrupted compute results from being used (read).
EDC enabled CZs are often used in embedded environments.

Change-Id: I84c261785329beeb797f11efbe0ec35790f2996c
Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org><mailto:David.Panariti@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 148 ++++++++++++++++++++++++----------
1 file changed, 106 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 27b57cb..2f5bf5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1645,35 +1645,92 @@ static const u32 sgpr2_init_regs[] =
               mmCOMPUTE_USER_DATA_9, 0xedcedc09,
};
-static const u32 sec_ded_counter_registers[] =
-{
-              mmCPC_EDC_ATC_CNT,
-              mmCPC_EDC_SCRATCH_CNT,
-              mmCPC_EDC_UCODE_CNT,
-              mmCPF_EDC_ATC_CNT,
-              mmCPF_EDC_ROQ_CNT,
-              mmCPF_EDC_TAG_CNT,
-              mmCPG_EDC_ATC_CNT,
-              mmCPG_EDC_DMA_CNT,
-              mmCPG_EDC_TAG_CNT,
-              mmDC_EDC_CSINVOC_CNT,
-              mmDC_EDC_RESTORE_CNT,
-              mmDC_EDC_STATE_CNT,
-              mmGDS_EDC_CNT,
-              mmGDS_EDC_GRBM_CNT,
-              mmGDS_EDC_OA_DED,
-              mmSPI_EDC_CNT,
-              mmSQC_ATC_EDC_GATCL1_CNT,
-              mmSQC_EDC_CNT,
-              mmSQ_EDC_DED_CNT,
-              mmSQ_EDC_INFO,
-              mmSQ_EDC_SEC_CNT,
-              mmTCC_EDC_CNT,
-              mmTCP_ATC_EDC_GATCL1_CNT,
-              mmTCP_EDC_CNT,
-              mmTD_EDC_CNT
+struct reg32_counter_name_map {
+             uint32_t rnmap_addr;   /* Counter register address */
+             const char *rnmap_name;           /* Name of the counter */
+             size_t rnmap_num_instances;  /* Number of block instances */
};
+#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {             \
+             .rnmap_addr = mm##reg,                                                                           \
+             .rnmap_name = #reg,                                                                    \
+             .rnmap_num_instances = num_instances                                            \
+}
+
+/* See GRBM_GFX_INDEX, et. al. registers. */
+static const struct reg32_counter_name_map sec_ded_counter_registers[] = {
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
+};
+
+static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
+{
+             int ci, se, sh, i;
+             uint32_t count;
+             int r = 0;
+
+             mutex_lock(&adev->grbm_idx_mutex);
+
+             for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
+                             const struct reg32_counter_name_map *cp =
+                                             sec_ded_counter_registers + ci;
+                             const char *name = cp->rnmap_name;
+
+                             for (se = 0; se < adev->gfx.config.max_shader_engines; ++se) {
+                                             for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
+                                                             for (i = 0; i < cp->rnmap_num_instances; ++i) {
+                                                                             gfx_v8_0_select_se_sh(adev, se, sh, i);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             if (count != 0) {
+                                                                                             /*
+                                                                                             * Workaround failed.
+                                                                                             * If people are interested
+                                                                                             * in EDC at all, they will
+                                                                                             * want to know which
+                                                                                             * counters had problems.
+                                                                                             */
+                                                                                             DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
+                                                                                                             name, count);
+                                                                                             r = -EINVAL;
+                                                                                             goto ret;
+                                                                             }
+                                                             }
+                                             }
+                             }
+             }
+
+ret:
+             gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+             mutex_unlock(&adev->grbm_idx_mutex);
+
+             return r;
+}
+
static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
{
               struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
@@ -1681,18 +1738,36 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
               struct fence *f = NULL;
               int r, i;
               u32 tmp;
+             u32 dis_bit;
               unsigned total_size, vgpr_offset, sgpr_offset;
               u64 gpu_addr;
-              /* only supported on CZ */
-              if (adev->asic_type != CHIP_CARRIZO)
+             if (adev->asic_type != CHIP_CARRIZO) {
+                             /* EDC is only supported on CZ */
+                             return 0;
+             }
+
+        DRM_INFO("Detected Carrizo.\n");
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+             dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
+             if (dis_bit) {
+                             /* On Carrizo, EDC may be disabled by a fuse. */
+                             DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
                               return 0;
+             }
                /* bail if the compute ring is not ready */
               if (!ring->ready)
                               return 0;
-              tmp = RREG32(mmGB_EDC_MODE);
+             DRM_INFO("Applying EDC workarounds.\n");
+
+             /*
+             * Interested parties can enable EDC using debugfs register reads and
+             * writes.
+             */
               WREG32(mmGB_EDC_MODE, 0);
                total_size =
@@ -1817,18 +1892,7 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
-              WREG32(mmGB_EDC_MODE, tmp);
-
-              tmp = RREG32(mmCC_GC_EDC_CONFIG);
-              tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
-              WREG32(mmCC_GC_EDC_CONFIG, tmp);
-
-
-              /* read back registers to clear the counters */
-              for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
-                              RREG32(sec_ded_counter_registers[i]);
+             gfx_v8_0_edc_clear_counters(adev);
 fail:
               amdgpu_ib_free(adev, &ib, NULL);
--
2.7.4


>From ec4803205582c1011f5ced1ead70ee244268b4b8 Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org><mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
Date: Wed, 26 Apr 2017 10:13:06 -0400
Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of
ECC/EDC.

Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be enabled
or disabled.  By default, all features are disabled.

EDC is Error Detection and Correction.  It can detect ECC errors and do 0 or
more of: count SEC (single error corrected) and DED (double error detected,
i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
Currently, only counting errors is supported.

Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org><mailto:David.Panariti@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 34 +++++++++++++++++++++++++++-----
drivers/gpu/drm/amd/include/amd_shared.h | 14 +++++++++++++
4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4a16e3c..0322392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
extern int amdgpu_pos_buf_per_se;
extern int amdgpu_cntl_sb_buf_per_se;
extern int amdgpu_param_buf_per_se;
+extern unsigned amdgpu_ecc_flags;
 #define AMDGPU_DEFAULT_GTT_SIZE_MB                      3072ULL /* 3GB by default */
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS                   3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ead00d7..00e16ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -110,6 +110,7 @@ int amdgpu_prim_buf_per_se = 0;
int amdgpu_pos_buf_per_se = 0;
int amdgpu_cntl_sb_buf_per_se = 0;
int amdgpu_param_buf_per_se = 0;
+unsigned amdgpu_ecc_flags = 0;
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -235,6 +236,9 @@ module_param_named(cntl_sb_buf_per_se, amdgpu_cntl_sb_buf_per_se, int, 0444);
MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater Cache per Shader Engine (default depending on gfx)");
module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
+MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable ECC/EDC (default))");
+module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
+
 static const struct pci_device_id pciidlist[] = {
#ifdef  CONFIG_DRM_AMDGPU_SI
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2f5bf5f..05cab7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1708,7 +1708,7 @@ static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
                                                                               count = RREG32(cp->rnmap_addr);
                                                                               if (count != 0) {
                                                                                               /*
-                                                                                              * Workaround failed.
+                                                                                             * EDC workaround failed.
                                                                                                * If people are interested
                                                                                                * in EDC at all, they will
                                                                                                * want to know which
@@ -1747,14 +1747,24 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               return 0;
               }
-        DRM_INFO("Detected Carrizo.\n");
+             DRM_INFO("Detected Carrizo.\n");
                tmp = RREG32(mmCC_GC_EDC_CONFIG);
               dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
               if (dis_bit) {
-                              /* On Carrizo, EDC may be disabled by a fuse. */
-                              DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
-                                              tmp);
+                             /* On Carrizo, EDC may be disabled permanently by a fuse. */
+                             DRM_INFO("Carrizo EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
+                             return 0;
+             }
+
+             /*
+             * Check if EDC has been requested by a kernel parameter.
+             * For Carrizo, EDC is the best/safest mode WRT error handling.
+             */
+             if (!(amdgpu_ecc_flags
+                   & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
+                             DRM_INFO("EDC support has not been requested.\n");
                               return 0;
               }
@@ -1892,6 +1902,20 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
+             /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
+             /* Do not propagate the errors to the next block. */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
+             WREG32(mmGB_EDC_MODE, tmp);
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+
+             /*
+             * Clear EDC_DISABLE bit so the counters are available.
+             */
+             tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
+             WREG32(mmCC_GC_EDC_CONFIG, tmp);
+
               gfx_v8_0_edc_clear_counters(adev);
 fail:
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 2ccf44e..c4fd013 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -179,6 +179,20 @@ struct amd_pp_profile {
#define AMD_PG_SUPPORT_GFX_QUICK_MG                                (1 << 11)
#define AMD_PG_SUPPORT_GFX_PIPELINE                     (1 << 12)
+/*
+ * ECC flags
+ * Allows the user to choose what kind of error detection/correction is used.
+ * Currently, EDC is supported on Carrizo.
+ *
+ * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the driver
+ * set what it thinks is best/safest mode.  This may not be the same as the
+ * default, depending on the GPU and the application.
+ * Using a single bit makes it easy to request the best support without
+ * needing to know all currently supported modes.
+ */
+#define AMD_ECC_SUPPORT_BEST                                     (1 << 0)
+#define AMD_ECC_SUPPORT_EDC                                       (1 << 1)
+
enum amd_pm_state_type {
               /* not used for dpm */
               POWER_STATE_TYPE_DEFAULT,
--
2.7.4




[-- Attachment #1.2: Type: text/html, Size: 70958 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: CZ EDC param and support
       [not found]         ` <BN6PR12MB1889C8C94B10AA6E3DF6BD6495130-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-05-01 14:40           ` Christian König
       [not found]             ` <502f7d5e-431c-b39a-1281-dbe3b710a75e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2017-05-01 14:40 UTC (permalink / raw)
  To: Panariti, David, Koenig, Christian, gpudriverdevsupport,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 22193 bytes --]

> I’ve gotten a comment that inline patches are preferred.
>
Well, they are. But you should send them with "git send-email" and not 
squashed together all in one mail :)

Otherwise I can't see how we should be able to apply them.

Additional to that at least I'm perfectly fine with attached patches as 
well.

Christian.

Am 28.04.2017 um 16:18 schrieb Panariti, David:
>
> Actually, the attachment was an oversight.
>
> It’s easier for me to attach, open the attachment and then delete the 
> attachment.
>
> I got only 2/3 this time.
>
> I’ve gotten a comment that inline patches are preferred.
>
> Sorry for the inconvenience.
>
> davep
>
> *From:*Koenig, Christian
> *Sent:* Friday, April 28, 2017 4:06 AM
> *To:* Panariti, David <David.Panariti-5C7GfCeVMHo@public.gmane.org>; gpudriverdevsupport 
> <gpudriverdevsupport-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> *Subject:* Re: CZ EDC param and support
>
> You somehow messed up the attachment.
>
> Instead of individual files everything is squashed together as 
> all-edc.patch.
>
> Please fix that otherwise proper review won't be possible.
>
> Christian.
>
> Am 28.04.2017 um 00:13 schrieb Panariti, David:
>
>     The changes in the workarounds function use DRM_INFO rather than
>     DRM_DEBUG because CZs with EDC are often used in embedded
>     environments and any info can be useful especially in the case of
>     an intermittent problem.
>
>     From e1ce383592c275b58ad95bd80b5479af8c1f9dae Mon Sep 17 00:00:00 2001
>
>     From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>     <mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
>     Date: Fri, 14 Apr 2017 13:41:52 -0400
>
>     Subject: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in
>     lieu of
>
>     re-redundant prototype.
>
>     Will be needed for the rest of the EDC workarounds patch.
>
>     Change-Id: Ie586ab38a69e98a91c6cb5747e285ce8bfdd1c86
>
>     Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>     <mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
>     ---
>
>     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46
>     +++++++++++++++++------------------
>
>     1 file changed, 23 insertions(+), 23 deletions(-)
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     index 2ff5f19..27b57cb 100644
>
>     --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     @@ -1500,6 +1500,29 @@ static int gfx_v8_0_kiq_init(struct
>     amdgpu_device *adev)
>
>                    return 0;
>
>     }
>
>     +static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
>
>     +   u32 se_num, u32 sh_num, u32 instance)
>
>     +{
>
>     +             u32 data;
>
>     +
>
>     +             if (instance == 0xffffffff)
>
>     +                             data = REG_SET_FIELD(0,
>     GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
>
>     +             else
>
>     +                             data = REG_SET_FIELD(0,
>     GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
>
>     +
>
>     +             if (se_num == 0xffffffff)
>
>     +                             data = REG_SET_FIELD(data,
>     GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
>
>     +             else
>
>     +                             data = REG_SET_FIELD(data,
>     GRBM_GFX_INDEX, SE_INDEX, se_num);
>
>     +
>
>     +             if (sh_num == 0xffffffff)
>
>     +                             data = REG_SET_FIELD(data,
>     GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
>
>     +             else
>
>     +                             data = REG_SET_FIELD(data,
>     GRBM_GFX_INDEX, SH_INDEX, sh_num);
>
>     +
>
>     +             WREG32(mmGRBM_GFX_INDEX, data);
>
>     +}
>
>     +
>
>     static const u32 vgpr_init_compute_shader[] =
>
>     {
>
>                    0x7e000209, 0x7e020208,
>
>     @@ -3556,29 +3579,6 @@ static void
>     gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev)
>
>                    }
>
>     }
>
>     -static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
>
>     -   u32 se_num, u32 sh_num, u32 instance)
>
>     -{
>
>     -              u32 data;
>
>     -
>
>     -              if (instance == 0xffffffff)
>
>     -                              data = REG_SET_FIELD(0,
>     GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
>
>     -              else
>
>     -                              data = REG_SET_FIELD(0,
>     GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
>
>     -
>
>     -              if (se_num == 0xffffffff)
>
>     -                              data = REG_SET_FIELD(data,
>     GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
>
>     -              else
>
>     -                              data = REG_SET_FIELD(data,
>     GRBM_GFX_INDEX, SE_INDEX, se_num);
>
>     -
>
>     -              if (sh_num == 0xffffffff)
>
>     -                              data = REG_SET_FIELD(data,
>     GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
>
>     -              else
>
>     -                              data = REG_SET_FIELD(data,
>     GRBM_GFX_INDEX, SH_INDEX, sh_num);
>
>     -
>
>     -              WREG32(mmGRBM_GFX_INDEX, data);
>
>     -}
>
>     -
>
>     static u32 gfx_v8_0_create_bitmask(u32 bit_width)
>
>     {
>
>                    return (u32)((1ULL << bit_width) - 1);
>
>     -- 
>
>     2.7.4
>
>     From 38fac8cab73dbc07e0ee7599b52106bc09dd32ea Mon Sep 17 00:00:00 2001
>
>     From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>     <mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
>     Date: Mon, 24 Apr 2017 11:05:45 -0400
>
>     Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error
>     Detection and
>
>     Correction) workarounds.
>
>     The workarounds are unconditionally performed on CZs with EDC enabled.
>
>     EDC detects uncorrected ECC errors and uses data poisoning to prevent
>
>     corrupted compute results from being used (read).
>
>     EDC enabled CZs are often used in embedded environments.
>
>     Change-Id: I84c261785329beeb797f11efbe0ec35790f2996c
>
>     Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>     <mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
>     ---
>
>     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 148
>     ++++++++++++++++++++++++----------
>
>     1 file changed, 106 insertions(+), 42 deletions(-)
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     index 27b57cb..2f5bf5f 100644
>
>     --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     @@ -1645,35 +1645,92 @@ static const u32 sgpr2_init_regs[] =
>
>                    mmCOMPUTE_USER_DATA_9, 0xedcedc09,
>
>     };
>
>     -static const u32 sec_ded_counter_registers[] =
>
>     -{
>
>     -              mmCPC_EDC_ATC_CNT,
>
>     -              mmCPC_EDC_SCRATCH_CNT,
>
>     -              mmCPC_EDC_UCODE_CNT,
>
>     -              mmCPF_EDC_ATC_CNT,
>
>     -              mmCPF_EDC_ROQ_CNT,
>
>     -              mmCPF_EDC_TAG_CNT,
>
>     -              mmCPG_EDC_ATC_CNT,
>
>     -              mmCPG_EDC_DMA_CNT,
>
>     -              mmCPG_EDC_TAG_CNT,
>
>     -              mmDC_EDC_CSINVOC_CNT,
>
>     -              mmDC_EDC_RESTORE_CNT,
>
>     -              mmDC_EDC_STATE_CNT,
>
>     -              mmGDS_EDC_CNT,
>
>     -              mmGDS_EDC_GRBM_CNT,
>
>     -              mmGDS_EDC_OA_DED,
>
>     -              mmSPI_EDC_CNT,
>
>     - mmSQC_ATC_EDC_GATCL1_CNT,
>
>     -              mmSQC_EDC_CNT,
>
>     -              mmSQ_EDC_DED_CNT,
>
>     -              mmSQ_EDC_INFO,
>
>     -              mmSQ_EDC_SEC_CNT,
>
>     -              mmTCC_EDC_CNT,
>
>     - mmTCP_ATC_EDC_GATCL1_CNT,
>
>     -              mmTCP_EDC_CNT,
>
>     -              mmTD_EDC_CNT
>
>     +struct reg32_counter_name_map {
>
>     +             uint32_t rnmap_addr;   /* Counter register address */
>
>     +             const char *rnmap_name;           /* Name of the
>     counter */
>
>     +             size_t rnmap_num_instances;  /* Number of block
>     instances */
>
>     };
>
>     +#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances)
>     {             \
>
>     +             .rnmap_addr = mm##reg, \
>
>     +             .rnmap_name = #reg, \
>
>     +             .rnmap_num_instances =
>     num_instances                                            \
>
>     +}
>
>     +
>
>     +/* See GRBM_GFX_INDEX, et. al. registers. */
>
>     +static const struct reg32_counter_name_map
>     sec_ded_counter_registers[] = {
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
>
>     +
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
>
>     +
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
>
>     +
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
>
>     + DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
>
>     +};
>
>     +
>
>     +static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
>
>     +{
>
>     +             int ci, se, sh, i;
>
>     +             uint32_t count;
>
>     +             int r = 0;
>
>     +
>
>     + mutex_lock(&adev->grbm_idx_mutex);
>
>     +
>
>     +             for (ci = 0; ci <
>     ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
>
>     +                             const struct reg32_counter_name_map
>     *cp =
>
>     + sec_ded_counter_registers + ci;
>
>     +                             const char *name = cp->rnmap_name;
>
>     +
>
>     +                             for (se = 0; se <
>     adev->gfx.config.max_shader_engines; ++se) {
>
>     + for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
>
>     + for (i = 0; i < cp->rnmap_num_instances; ++i) {
>
>     + gfx_v8_0_select_se_sh(adev, se, sh, i);
>
>     + count = RREG32(cp->rnmap_addr);
>
>     + count = RREG32(cp->rnmap_addr);
>
>     + if (count != 0) {
>
>     + /*
>
>     + * Workaround failed.
>
>     + * If people are interested
>
>     + * in EDC at all, they will
>
>     + * want to know which
>
>     + * counters had problems.
>
>     + */
>
>     + DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
>
>     + name, count);
>
>     + r = -EINVAL;
>
>     + goto ret;
>
>     + }
>
>     + }
>
>     + }
>
>     +                             }
>
>     +             }
>
>     +
>
>     +ret:
>
>     + gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
>
>     + mutex_unlock(&adev->grbm_idx_mutex);
>
>     +
>
>     +             return r;
>
>     +}
>
>     +
>
>     static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>     {
>
>                    struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
>
>     @@ -1681,18 +1738,36 @@ static int
>     gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>                    struct fence *f = NULL;
>
>                    int r, i;
>
>                    u32 tmp;
>
>     +             u32 dis_bit;
>
>                    unsigned total_size, vgpr_offset, sgpr_offset;
>
>                    u64 gpu_addr;
>
>     -              /* only supported on CZ */
>
>     -              if (adev->asic_type != CHIP_CARRIZO)
>
>     +             if (adev->asic_type != CHIP_CARRIZO) {
>
>     +                             /* EDC is only supported on CZ */
>
>     +                             return 0;
>
>     +             }
>
>     +
>
>     +        DRM_INFO("Detected Carrizo.\n");
>
>     +
>
>     +             tmp = RREG32(mmCC_GC_EDC_CONFIG);
>
>     +             dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
>
>     +             if (dis_bit) {
>
>     +                             /* On Carrizo, EDC may be disabled
>     by a fuse. */
>
>     + DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
>
>     + tmp);
>
>                                    return 0;
>
>     +             }
>
>                     /* bail if the compute ring is not ready */
>
>                    if (!ring->ready)
>
>                                    return 0;
>
>     -              tmp = RREG32(mmGB_EDC_MODE);
>
>     +             DRM_INFO("Applying EDC workarounds.\n");
>
>     +
>
>     +             /*
>
>     +             * Interested parties can enable EDC using debugfs
>     register reads and
>
>     +             * writes.
>
>     +             */
>
>                    WREG32(mmGB_EDC_MODE, 0);
>
>                     total_size =
>
>     @@ -1817,18 +1892,7 @@ static int
>     gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>                                    goto fail;
>
>                    }
>
>     -              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
>
>     -              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
>
>     -              WREG32(mmGB_EDC_MODE, tmp);
>
>     -
>
>     -              tmp = RREG32(mmCC_GC_EDC_CONFIG);
>
>     -              tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC,
>     0) | 1;
>
>     - WREG32(mmCC_GC_EDC_CONFIG, tmp);
>
>     -
>
>     -
>
>     -              /* read back registers to clear the counters */
>
>     -              for (i = 0; i <
>     ARRAY_SIZE(sec_ded_counter_registers); i++)
>
>     - RREG32(sec_ded_counter_registers[i]);
>
>     + gfx_v8_0_edc_clear_counters(adev);
>
>      fail:
>
>                    amdgpu_ib_free(adev, &ib, NULL);
>
>     -- 
>
>     2.7.4
>
>     From ec4803205582c1011f5ced1ead70ee244268b4b8 Mon Sep 17 00:00:00 2001
>
>     From: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>     <mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
>     Date: Wed, 26 Apr 2017 10:13:06 -0400
>
>     Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control
>     use of
>
>     ECC/EDC.
>
>     Allow various kinds of memory integrity methods (e.g. ECC/EDC) to
>     be enabled
>
>     or disabled.  By default, all features are disabled.
>
>     EDC is Error Detection and Correction. It can detect ECC errors
>     and do 0 or
>
>     more of: count SEC (single error corrected) and DED (double error
>     detected,
>
>     i.e. uncorrected ECC error), halt the affected block, interrupt
>     the CPU.
>
>     Currently, only counting errors is supported.
>
>     Signed-off-by: David Panariti <David.Panariti-5C7GfCeVMHo@public.gmane.org>
>     <mailto:David.Panariti-5C7GfCeVMHo@public.gmane.org>
>
>     ---
>
>     drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
>
>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  4 ++++
>
>     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 34
>     +++++++++++++++++++++++++++-----
>
>     drivers/gpu/drm/amd/include/amd_shared.h | 14 +++++++++++++
>
>     4 files changed, 48 insertions(+), 5 deletions(-)
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>     index 4a16e3c..0322392 100644
>
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>     @@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
>
>     extern int amdgpu_pos_buf_per_se;
>
>     extern int amdgpu_cntl_sb_buf_per_se;
>
>     extern int amdgpu_param_buf_per_se;
>
>     +extern unsigned amdgpu_ecc_flags;
>
>      #define AMDGPU_DEFAULT_GTT_SIZE_MB                      3072ULL
>     /* 3GB by default */
>
>     #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS                   3000
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>
>     index ead00d7..00e16ac 100644
>
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>
>     @@ -110,6 +110,7 @@ int amdgpu_prim_buf_per_se = 0;
>
>     int amdgpu_pos_buf_per_se = 0;
>
>     int amdgpu_cntl_sb_buf_per_se = 0;
>
>     int amdgpu_param_buf_per_se = 0;
>
>     +unsigned amdgpu_ecc_flags = 0;
>
>      MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
>     megabytes");
>
>     module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
>
>     @@ -235,6 +236,9 @@ module_param_named(cntl_sb_buf_per_se,
>     amdgpu_cntl_sb_buf_per_se, int, 0444);
>
>     MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater
>     Cache per Shader Engine (default depending on gfx)");
>
>     module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int,
>     0444);
>
>     +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
>     ECC/EDC (default))");
>
>     +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
>
>     +
>
>      static const struct pci_device_id pciidlist[] = {
>
>     #ifdef  CONFIG_DRM_AMDGPU_SI
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     index 2f5bf5f..05cab7e 100644
>
>     --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>
>     @@ -1708,7 +1708,7 @@ static int
>     gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
>
>     count = RREG32(cp->rnmap_addr);
>
>     if (count != 0) {
>
>     /*
>
>     - * Workaround failed.
>
>     + * EDC workaround failed.
>
>      * If people are interested
>
>      * in EDC at all, they will
>
>      * want to know which
>
>     @@ -1747,14 +1747,24 @@ static int
>     gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>                                    return 0;
>
>                    }
>
>     -        DRM_INFO("Detected Carrizo.\n");
>
>     +             DRM_INFO("Detected Carrizo.\n");
>
>                     tmp = RREG32(mmCC_GC_EDC_CONFIG);
>
>                    dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG,
>     DIS_EDC);
>
>                    if (dis_bit) {
>
>     -                              /* On Carrizo, EDC may be disabled
>     by a fuse. */
>
>     - DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
>
>     - tmp);
>
>     +                             /* On Carrizo, EDC may be disabled
>     permanently by a fuse. */
>
>     + DRM_INFO("Carrizo EDC hardware is disabled, GC_EDC_CONFIG:
>     0x%08x.\n",
>
>     + tmp);
>
>     +                             return 0;
>
>     +             }
>
>     +
>
>     +             /*
>
>     +             * Check if EDC has been requested by a kernel parameter.
>
>     +             * For Carrizo, EDC is the best/safest mode WRT error
>     handling.
>
>     +             */
>
>     +             if (!(amdgpu_ecc_flags
>
>     +                   & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
>
>     + DRM_INFO("EDC support has not been requested.\n");
>
>                                    return 0;
>
>                    }
>
>     @@ -1892,6 +1902,20 @@ static int
>     gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
>                                    goto fail;
>
>                    }
>
>     +             /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do
>     not halt */
>
>     +             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
>
>     +             /* Do not propagate the errors to the next block. */
>
>     +             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
>
>     +             WREG32(mmGB_EDC_MODE, tmp);
>
>     +
>
>     +             tmp = RREG32(mmCC_GC_EDC_CONFIG);
>
>     +
>
>     +             /*
>
>     +             * Clear EDC_DISABLE bit so the counters are available.
>
>     +             */
>
>     +             tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
>
>     + WREG32(mmCC_GC_EDC_CONFIG, tmp);
>
>     +
>
>     gfx_v8_0_edc_clear_counters(adev);
>
>      fail:
>
>     diff --git a/drivers/gpu/drm/amd/include/amd_shared.h
>     b/drivers/gpu/drm/amd/include/amd_shared.h
>
>     index 2ccf44e..c4fd013 100644
>
>     --- a/drivers/gpu/drm/amd/include/amd_shared.h
>
>     +++ b/drivers/gpu/drm/amd/include/amd_shared.h
>
>     @@ -179,6 +179,20 @@ struct amd_pp_profile {
>
>     #define AMD_PG_SUPPORT_GFX_QUICK_MG (1 << 11)
>
>     #define AMD_PG_SUPPORT_GFX_PIPELINE                     (1 << 12)
>
>     +/*
>
>     + * ECC flags
>
>     + * Allows the user to choose what kind of error
>     detection/correction is used.
>
>     + * Currently, EDC is supported on Carrizo.
>
>     + *
>
>     + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have
>     the driver
>
>     + * set what it thinks is best/safest mode.  This may not be the
>     same as the
>
>     + * default, depending on the GPU and the application.
>
>     + * Using a single bit makes it easy to request the best support
>     without
>
>     + * needing to know all currently supported modes.
>
>     + */
>
>     +#define AMD_ECC_SUPPORT_BEST (1 << 0)
>
>     +#define AMD_ECC_SUPPORT_EDC (1 << 1)
>
>     +
>
>     enum amd_pm_state_type {
>
>                    /* not used for dpm */
>
>     POWER_STATE_TYPE_DEFAULT,
>
>     -- 
>
>     2.7.4
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 57044 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: CZ EDC param and support
       [not found]             ` <502f7d5e-431c-b39a-1281-dbe3b710a75e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-01 14:44               ` Panariti, David
  0 siblings, 0 replies; 5+ messages in thread
From: Panariti, David @ 2017-05-01 14:44 UTC (permalink / raw)
  To: Christian König, Koenig, Christian, gpudriverdevsupport,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 22613 bytes --]

Thanks.
I’ve been using my Windwoes box way too much.  I need to set up email on my linux boxen.

davep

From: Christian König [mailto:deathsimple@vodafone.de]
Sent: Monday, May 01, 2017 10:40 AM
To: Panariti, David <David.Panariti@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; gpudriverdevsupport <gpudriverdevsupport@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: CZ EDC param and support

I’ve gotten a comment that inline patches are preferred.
Well, they are. But you should send them with "git send-email" and not squashed together all in one mail :)

Otherwise I can't see how we should be able to apply them.

Additional to that at least I'm perfectly fine with attached patches as well.

Christian.

Am 28.04.2017 um 16:18 schrieb Panariti, David:
Actually, the attachment was an oversight.
It’s easier for me to attach, open the attachment and then delete the attachment.
I got only 2/3 this time.
I’ve gotten a comment that inline patches are preferred.

Sorry for the inconvenience.

davep

From: Koenig, Christian
Sent: Friday, April 28, 2017 4:06 AM
To: Panariti, David <David.Panariti@amd.com><mailto:David.Panariti@amd.com>; gpudriverdevsupport <gpudriverdevsupport@amd.com><mailto:gpudriverdevsupport@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: Re: CZ EDC param and support

You somehow messed up the attachment.

Instead of individual files everything is squashed together as all-edc.patch.

Please fix that otherwise proper review won't be possible.

Christian.

Am 28.04.2017 um 00:13 schrieb Panariti, David:
The changes in the workarounds function use DRM_INFO rather than DRM_DEBUG because CZs with EDC are often used in embedded environments and any info can be useful especially in the case of an intermittent problem.

From e1ce383592c275b58ad95bd80b5479af8c1f9dae Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@amd.com><mailto:David.Panariti@amd.com>
Date: Fri, 14 Apr 2017 13:41:52 -0400
Subject: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of
re-redundant prototype.

Will be needed for the rest of the EDC workarounds patch.

Change-Id: Ie586ab38a69e98a91c6cb5747e285ce8bfdd1c86
Signed-off-by: David Panariti <David.Panariti@amd.com><mailto:David.Panariti@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46 +++++++++++++++++------------------
1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2ff5f19..27b57cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1500,6 +1500,29 @@ static int gfx_v8_0_kiq_init(struct amdgpu_device *adev)
               return 0;
}
+static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
+                                                               u32 se_num, u32 sh_num, u32 instance)
+{
+             u32 data;
+
+             if (instance == 0xffffffff)
+                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
+
+             if (se_num == 0xffffffff)
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
+
+             if (sh_num == 0xffffffff)
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
+
+             WREG32(mmGRBM_GFX_INDEX, data);
+}
+
static const u32 vgpr_init_compute_shader[] =
{
               0x7e000209, 0x7e020208,
@@ -3556,29 +3579,6 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev)
               }
}
-static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
-                                                                u32 se_num, u32 sh_num, u32 instance)
-{
-              u32 data;
-
-              if (instance == 0xffffffff)
-                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
-
-              if (se_num == 0xffffffff)
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
-
-              if (sh_num == 0xffffffff)
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
-
-              WREG32(mmGRBM_GFX_INDEX, data);
-}
-
static u32 gfx_v8_0_create_bitmask(u32 bit_width)
{
               return (u32)((1ULL << bit_width) - 1);
--
2.7.4


From 38fac8cab73dbc07e0ee7599b52106bc09dd32ea Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@amd.com><mailto:David.Panariti@amd.com>
Date: Mon, 24 Apr 2017 11:05:45 -0400
Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and
Correction) workarounds.

The workarounds are unconditionally performed on CZs with EDC enabled.
EDC detects uncorrected ECC errors and uses data poisoning to prevent
corrupted compute results from being used (read).
EDC enabled CZs are often used in embedded environments.

Change-Id: I84c261785329beeb797f11efbe0ec35790f2996c
Signed-off-by: David Panariti <David.Panariti@amd.com><mailto:David.Panariti@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 148 ++++++++++++++++++++++++----------
1 file changed, 106 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 27b57cb..2f5bf5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1645,35 +1645,92 @@ static const u32 sgpr2_init_regs[] =
               mmCOMPUTE_USER_DATA_9, 0xedcedc09,
};
-static const u32 sec_ded_counter_registers[] =
-{
-              mmCPC_EDC_ATC_CNT,
-              mmCPC_EDC_SCRATCH_CNT,
-              mmCPC_EDC_UCODE_CNT,
-              mmCPF_EDC_ATC_CNT,
-              mmCPF_EDC_ROQ_CNT,
-              mmCPF_EDC_TAG_CNT,
-              mmCPG_EDC_ATC_CNT,
-              mmCPG_EDC_DMA_CNT,
-              mmCPG_EDC_TAG_CNT,
-              mmDC_EDC_CSINVOC_CNT,
-              mmDC_EDC_RESTORE_CNT,
-              mmDC_EDC_STATE_CNT,
-              mmGDS_EDC_CNT,
-              mmGDS_EDC_GRBM_CNT,
-              mmGDS_EDC_OA_DED,
-              mmSPI_EDC_CNT,
-              mmSQC_ATC_EDC_GATCL1_CNT,
-              mmSQC_EDC_CNT,
-              mmSQ_EDC_DED_CNT,
-              mmSQ_EDC_INFO,
-              mmSQ_EDC_SEC_CNT,
-              mmTCC_EDC_CNT,
-              mmTCP_ATC_EDC_GATCL1_CNT,
-              mmTCP_EDC_CNT,
-              mmTD_EDC_CNT
+struct reg32_counter_name_map {
+             uint32_t rnmap_addr;   /* Counter register address */
+             const char *rnmap_name;           /* Name of the counter */
+             size_t rnmap_num_instances;  /* Number of block instances */
};
+#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {             \
+             .rnmap_addr = mm##reg,                                                                           \
+             .rnmap_name = #reg,                                                                    \
+             .rnmap_num_instances = num_instances                                            \
+}
+
+/* See GRBM_GFX_INDEX, et. al. registers. */
+static const struct reg32_counter_name_map sec_ded_counter_registers[] = {
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
+};
+
+static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
+{
+             int ci, se, sh, i;
+             uint32_t count;
+             int r = 0;
+
+             mutex_lock(&adev->grbm_idx_mutex);
+
+             for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
+                             const struct reg32_counter_name_map *cp =
+                                             sec_ded_counter_registers + ci;
+                             const char *name = cp->rnmap_name;
+
+                             for (se = 0; se < adev->gfx.config.max_shader_engines; ++se) {
+                                             for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
+                                                             for (i = 0; i < cp->rnmap_num_instances; ++i) {
+                                                                             gfx_v8_0_select_se_sh(adev, se, sh, i);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             if (count != 0) {
+                                                                                             /*
+                                                                                             * Workaround failed.
+                                                                                             * If people are interested
+                                                                                             * in EDC at all, they will
+                                                                                             * want to know which
+                                                                                             * counters had problems.
+                                                                                             */
+                                                                                             DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
+                                                                                                             name, count);
+                                                                                             r = -EINVAL;
+                                                                                             goto ret;
+                                                                             }
+                                                             }
+                                             }
+                             }
+             }
+
+ret:
+             gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+             mutex_unlock(&adev->grbm_idx_mutex);
+
+             return r;
+}
+
static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
{
               struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
@@ -1681,18 +1738,36 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
               struct fence *f = NULL;
               int r, i;
               u32 tmp;
+             u32 dis_bit;
               unsigned total_size, vgpr_offset, sgpr_offset;
               u64 gpu_addr;
-              /* only supported on CZ */
-              if (adev->asic_type != CHIP_CARRIZO)
+             if (adev->asic_type != CHIP_CARRIZO) {
+                             /* EDC is only supported on CZ */
+                             return 0;
+             }
+
+        DRM_INFO("Detected Carrizo.\n");
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+             dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
+             if (dis_bit) {
+                             /* On Carrizo, EDC may be disabled by a fuse. */
+                             DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
                               return 0;
+             }
                /* bail if the compute ring is not ready */
               if (!ring->ready)
                               return 0;
-              tmp = RREG32(mmGB_EDC_MODE);
+             DRM_INFO("Applying EDC workarounds.\n");
+
+             /*
+             * Interested parties can enable EDC using debugfs register reads and
+             * writes.
+             */
               WREG32(mmGB_EDC_MODE, 0);
                total_size =
@@ -1817,18 +1892,7 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
-              WREG32(mmGB_EDC_MODE, tmp);
-
-              tmp = RREG32(mmCC_GC_EDC_CONFIG);
-              tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
-              WREG32(mmCC_GC_EDC_CONFIG, tmp);
-
-
-              /* read back registers to clear the counters */
-              for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
-                              RREG32(sec_ded_counter_registers[i]);
+             gfx_v8_0_edc_clear_counters(adev);
 fail:
               amdgpu_ib_free(adev, &ib, NULL);
--
2.7.4


From ec4803205582c1011f5ced1ead70ee244268b4b8 Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@amd.com><mailto:David.Panariti@amd.com>
Date: Wed, 26 Apr 2017 10:13:06 -0400
Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of
ECC/EDC.

Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be enabled
or disabled.  By default, all features are disabled.

EDC is Error Detection and Correction.  It can detect ECC errors and do 0 or
more of: count SEC (single error corrected) and DED (double error detected,
i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
Currently, only counting errors is supported.

Signed-off-by: David Panariti <David.Panariti@amd.com><mailto:David.Panariti@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 34 +++++++++++++++++++++++++++-----
drivers/gpu/drm/amd/include/amd_shared.h | 14 +++++++++++++
4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4a16e3c..0322392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
extern int amdgpu_pos_buf_per_se;
extern int amdgpu_cntl_sb_buf_per_se;
extern int amdgpu_param_buf_per_se;
+extern unsigned amdgpu_ecc_flags;
 #define AMDGPU_DEFAULT_GTT_SIZE_MB                      3072ULL /* 3GB by default */
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS                   3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ead00d7..00e16ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -110,6 +110,7 @@ int amdgpu_prim_buf_per_se = 0;
int amdgpu_pos_buf_per_se = 0;
int amdgpu_cntl_sb_buf_per_se = 0;
int amdgpu_param_buf_per_se = 0;
+unsigned amdgpu_ecc_flags = 0;
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -235,6 +236,9 @@ module_param_named(cntl_sb_buf_per_se, amdgpu_cntl_sb_buf_per_se, int, 0444);
MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater Cache per Shader Engine (default depending on gfx)");
module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
+MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable ECC/EDC (default))");
+module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
+
 static const struct pci_device_id pciidlist[] = {
#ifdef  CONFIG_DRM_AMDGPU_SI
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2f5bf5f..05cab7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1708,7 +1708,7 @@ static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
                                                                               count = RREG32(cp->rnmap_addr);
                                                                               if (count != 0) {
                                                                                               /*
-                                                                                              * Workaround failed.
+                                                                                             * EDC workaround failed.
                                                                                                * If people are interested
                                                                                                * in EDC at all, they will
                                                                                                * want to know which
@@ -1747,14 +1747,24 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               return 0;
               }
-        DRM_INFO("Detected Carrizo.\n");
+             DRM_INFO("Detected Carrizo.\n");
                tmp = RREG32(mmCC_GC_EDC_CONFIG);
               dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
               if (dis_bit) {
-                              /* On Carrizo, EDC may be disabled by a fuse. */
-                              DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
-                                              tmp);
+                             /* On Carrizo, EDC may be disabled permanently by a fuse. */
+                             DRM_INFO("Carrizo EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
+                             return 0;
+             }
+
+             /*
+             * Check if EDC has been requested by a kernel parameter.
+             * For Carrizo, EDC is the best/safest mode WRT error handling.
+             */
+             if (!(amdgpu_ecc_flags
+                   & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
+                             DRM_INFO("EDC support has not been requested.\n");
                               return 0;
               }
@@ -1892,6 +1902,20 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
+             /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
+             /* Do not propagate the errors to the next block. */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
+             WREG32(mmGB_EDC_MODE, tmp);
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+
+             /*
+             * Clear EDC_DISABLE bit so the counters are available.
+             */
+             tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
+             WREG32(mmCC_GC_EDC_CONFIG, tmp);
+
               gfx_v8_0_edc_clear_counters(adev);
 fail:
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 2ccf44e..c4fd013 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -179,6 +179,20 @@ struct amd_pp_profile {
#define AMD_PG_SUPPORT_GFX_QUICK_MG                                (1 << 11)
#define AMD_PG_SUPPORT_GFX_PIPELINE                     (1 << 12)
+/*
+ * ECC flags
+ * Allows the user to choose what kind of error detection/correction is used.
+ * Currently, EDC is supported on Carrizo.
+ *
+ * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the driver
+ * set what it thinks is best/safest mode.  This may not be the same as the
+ * default, depending on the GPU and the application.
+ * Using a single bit makes it easy to request the best support without
+ * needing to know all currently supported modes.
+ */
+#define AMD_ECC_SUPPORT_BEST                                     (1 << 0)
+#define AMD_ECC_SUPPORT_EDC                                       (1 << 1)
+
enum amd_pm_state_type {
               /* not used for dpm */
               POWER_STATE_TYPE_DEFAULT,
--
2.7.4







_______________________________________________

amd-gfx mailing list

amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 73773 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-05-01 14:44 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-27 22:13 CZ EDC param and support Panariti, David
     [not found] ` <BN6PR12MB1889CED9632BAE0144E8AE1795100-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-04-28  8:05   ` Christian König
     [not found]     ` <40321e5a-36c5-6e07-8176-bcd596e7e982-5C7GfCeVMHo@public.gmane.org>
2017-04-28 14:18       ` Panariti, David
     [not found]         ` <BN6PR12MB1889C8C94B10AA6E3DF6BD6495130-/b2+HYfkarQsUGYmqVGGPAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-05-01 14:40           ` Christian König
     [not found]             ` <502f7d5e-431c-b39a-1281-dbe3b710a75e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-01 14:44               ` Panariti, David

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.