All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3
@ 2023-01-13  6:23 YiPeng Chai
  2023-01-13  6:23 ` [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers YiPeng Chai
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13  6:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao Zhou, Hawking.Zhang, YiPeng Chai, Candice.Li, yipechai

Add gfx ras function on gfx v11_0_3.

V2:
 1. Add separate source files for gfx v11_0_3.
 2. Create a common function to initialize gfx ras block.

V3:
 1. Rename amdgpu_gfx_ras_block_init to amdgpu_gfx_ras_sw_init.
 2. Adjust the calling position of amdgpu_gfx_ras_sw_init.
 3. Remove gfx_v11_0_3_ras_ops.

V4:
 Revert changes in amdgpu_ras_interrupt_poison_consumption_handler.

V5:
 1. Remove invalid include file in gfx_v11_0_3.c.
 2. Reduce the number of parameters of amdgpu_gfx_ras_sw_init.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/Makefile      |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 35 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   | 13 +++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 27 ++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h | 29 ++++++++++++++++++++
 6 files changed, 106 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 332cf8bda7a2..5df603192cdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -137,6 +137,7 @@ amdgpu-y += \
 	gfx_v10_0.o \
 	imu_v11_0.o \
 	gfx_v11_0.o \
+	gfx_v11_0_3.o \
 	imu_v11_0_3.o
 
 # add async DMA block
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 42a939cd2eac..09c42c00e43c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -696,6 +696,41 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
 	return r;
 }
 
+int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
+{
+	int err = 0;
+	struct amdgpu_gfx_ras *ras = NULL;
+
+	/* A NULL adev->gfx.ras means gfx does not support the
+	 * ras function, so there is nothing to do here.
+	 */
+	if (!adev->gfx.ras)
+		return 0;
+
+	ras = adev->gfx.ras;
+
+	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+	if (err) {
+		dev_err(adev->dev, "Failed to register gfx ras block!\n");
+		return err;
+	}
+
+	strcpy(ras->ras_block.ras_comm.name, "gfx");
+	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__GFX;
+	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+	adev->gfx.ras_if = &ras->ras_block.ras_comm;
+
+	/* If no special ras_late_init function is defined, use the default ras_late_init */
+	if (!ras->ras_block.ras_late_init)
+		ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+	/* If no special ras_cb function is defined, use the default ras_cb */
+	if (!ras->ras_block.ras_cb)
+		ras->ras_block.ras_cb = amdgpu_gfx_process_ras_data_cb;
+
+	return 0;
+}
+
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
 		void *err_data,
 		struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index b3df4787877e..6b26597217ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -432,4 +432,5 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
 void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t ucode_id);
 
+int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 259ebf0356db..82beb46788cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -46,6 +46,7 @@
 #include "clearstate_gfx11.h"
 #include "v11_structs.h"
 #include "gfx_v11_0.h"
+#include "gfx_v11_0_3.h"
 #include "nbio_v4_3.h"
 #include "mes_v11_0.h"
 
@@ -852,7 +853,14 @@ static int gfx_v11_0_gpu_early_init(struct amdgpu_device *adev)
 	switch (adev->ip_versions[GC_HWIP][0]) {
 	case IP_VERSION(11, 0, 0):
 	case IP_VERSION(11, 0, 2):
+		adev->gfx.config.max_hw_contexts = 8;
+		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
+		adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
+		adev->gfx.config.sc_hiz_tile_fifo_size = 0;
+		adev->gfx.config.sc_earlyz_tile_fifo_size = 0x4C0;
+		break;
 	case IP_VERSION(11, 0, 3):
+		adev->gfx.ras = &gfx_v11_0_3_ras;
 		adev->gfx.config.max_hw_contexts = 8;
 		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
 		adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
@@ -1422,6 +1430,11 @@ static int gfx_v11_0_sw_init(void *handle)
 	if (r)
 		return r;
 
+	if (amdgpu_gfx_ras_sw_init(adev)) {
+		dev_err(adev->dev, "Failed to initialize gfx ras block!\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
new file mode 100644
index 000000000000..5966d984a30a
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu.h"
+
+
+struct amdgpu_gfx_ras gfx_v11_0_3_ras;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h
new file mode 100644
index 000000000000..7095abddcbc0
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __GFX_V11_0_3_H__
+#define __GFX_V11_0_3_H__
+
+extern struct amdgpu_gfx_ras gfx_v11_0_3_ras;
+
+#endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers
  2023-01-13  6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
@ 2023-01-13  6:23 ` YiPeng Chai
  2023-01-13  6:23 ` [PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3 YiPeng Chai
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13  6:23 UTC (permalink / raw)
  To: amd-gfx
  Cc: Tao Zhou, YiPeng Chai, yipechai, Alex Deucher, Candice.Li, Hawking.Zhang

V2:
   Add RLC_RLCS_FED_STATUS_0 and RLC_RLCS_FED_STATUS_1 register
   offsets and shift/mask definitions.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../include/asic_reg/gc/gc_11_0_3_offset.h    |  8 +++
 .../include/asic_reg/gc/gc_11_0_3_sh_mask.h   | 50 +++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
index 3b95a59b196c..56e00252bff8 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
@@ -3593,6 +3593,14 @@
 #define regGCL2TLB_PERFCOUNTER_RSLT_CNTL_BASE_IDX                                                       1
 
 
+// addressBlock: gc_rlcsdec
+// base address: 0x3b980
+#define regRLC_RLCS_FED_STATUS_0                                                                        0x4eff
+#define regRLC_RLCS_FED_STATUS_0_BASE_IDX                                                               1
+#define regRLC_RLCS_FED_STATUS_1                                                                        0x4f00
+#define regRLC_RLCS_FED_STATUS_1_BASE_IDX                                                               1
+
+
 // addressBlock: gc_gcvml2pspdec
 // base address: 0x3f900
 #define regGCUTCL2_TRANSLATION_BYPASS_BY_VMID                                                           0x5e41
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
index ae3ef8a9e702..658e88a8e2ac 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
@@ -37642,6 +37642,56 @@
 #define RLC_RLCG_DOORBELL_RANGE__LOWER_ADDR_MASK                                                              0x00000FFCL
 #define RLC_RLCG_DOORBELL_RANGE__UPPER_ADDR_RESERVED_MASK                                                     0x00030000L
 #define RLC_RLCG_DOORBELL_RANGE__UPPER_ADDR_MASK                                                              0x0FFC0000L
+//RLC_RLCS_FED_STATUS_0
+#define RLC_RLCS_FED_STATUS_0__RLC_FED_ERR__SHIFT                                                             0x0
+#define RLC_RLCS_FED_STATUS_0__UTCL2_FED_ERR__SHIFT                                                           0x1
+#define RLC_RLCS_FED_STATUS_0__GE_FED_ERR__SHIFT                                                              0x2
+#define RLC_RLCS_FED_STATUS_0__CPC_FED_ERR__SHIFT                                                             0x3
+#define RLC_RLCS_FED_STATUS_0__CPF_FED_ERR__SHIFT                                                             0x4
+#define RLC_RLCS_FED_STATUS_0__CPG_FED_ERR__SHIFT                                                             0x5
+#define RLC_RLCS_FED_STATUS_0__SDMA0_FED_ERR__SHIFT                                                           0x6
+#define RLC_RLCS_FED_STATUS_0__SDMA1_FED_ERR__SHIFT                                                           0x7
+#define RLC_RLCS_FED_STATUS_0__RLC_FED_ERR_MASK                                                               0x00000001L
+#define RLC_RLCS_FED_STATUS_0__UTCL2_FED_ERR_MASK                                                             0x00000002L
+#define RLC_RLCS_FED_STATUS_0__GE_FED_ERR_MASK                                                                0x00000004L
+#define RLC_RLCS_FED_STATUS_0__CPC_FED_ERR_MASK                                                               0x00000008L
+#define RLC_RLCS_FED_STATUS_0__CPF_FED_ERR_MASK                                                               0x00000010L
+#define RLC_RLCS_FED_STATUS_0__CPG_FED_ERR_MASK                                                               0x00000020L
+#define RLC_RLCS_FED_STATUS_0__SDMA0_FED_ERR_MASK                                                             0x00000040L
+#define RLC_RLCS_FED_STATUS_0__SDMA1_FED_ERR_MASK                                                             0x00000080L
+//RLC_RLCS_FED_STATUS_1
+#define RLC_RLCS_FED_STATUS_1__GL2C0_FED_ERR__SHIFT                                                           0x0
+#define RLC_RLCS_FED_STATUS_1__GL2C1_FED_ERR__SHIFT                                                           0x1
+#define RLC_RLCS_FED_STATUS_1__GL2C2_FED_ERR__SHIFT                                                           0x2
+#define RLC_RLCS_FED_STATUS_1__GL2C3_FED_ERR__SHIFT                                                           0x3
+#define RLC_RLCS_FED_STATUS_1__GL2C4_FED_ERR__SHIFT                                                           0x4
+#define RLC_RLCS_FED_STATUS_1__GL2C5_FED_ERR__SHIFT                                                           0x5
+#define RLC_RLCS_FED_STATUS_1__GL2C6_FED_ERR__SHIFT                                                           0x6
+#define RLC_RLCS_FED_STATUS_1__GL2C7_FED_ERR__SHIFT                                                           0x7
+#define RLC_RLCS_FED_STATUS_1__GL2C8_FED_ERR__SHIFT                                                           0x8
+#define RLC_RLCS_FED_STATUS_1__GL2C9_FED_ERR__SHIFT                                                           0x9
+#define RLC_RLCS_FED_STATUS_1__GL2C10_FED_ERR__SHIFT                                                          0xa
+#define RLC_RLCS_FED_STATUS_1__GL2C11_FED_ERR__SHIFT                                                          0xb
+#define RLC_RLCS_FED_STATUS_1__GL2C12_FED_ERR__SHIFT                                                          0xc
+#define RLC_RLCS_FED_STATUS_1__GL2C13_FED_ERR__SHIFT                                                          0xd
+#define RLC_RLCS_FED_STATUS_1__GL2C14_FED_ERR__SHIFT                                                          0xe
+#define RLC_RLCS_FED_STATUS_1__GL2C15_FED_ERR__SHIFT                                                          0xf
+#define RLC_RLCS_FED_STATUS_1__GL2C0_FED_ERR_MASK                                                             0x00000001L
+#define RLC_RLCS_FED_STATUS_1__GL2C1_FED_ERR_MASK                                                             0x00000002L
+#define RLC_RLCS_FED_STATUS_1__GL2C2_FED_ERR_MASK                                                             0x00000004L
+#define RLC_RLCS_FED_STATUS_1__GL2C3_FED_ERR_MASK                                                             0x00000008L
+#define RLC_RLCS_FED_STATUS_1__GL2C4_FED_ERR_MASK                                                             0x00000010L
+#define RLC_RLCS_FED_STATUS_1__GL2C5_FED_ERR_MASK                                                             0x00000020L
+#define RLC_RLCS_FED_STATUS_1__GL2C6_FED_ERR_MASK                                                             0x00000040L
+#define RLC_RLCS_FED_STATUS_1__GL2C7_FED_ERR_MASK                                                             0x00000080L
+#define RLC_RLCS_FED_STATUS_1__GL2C8_FED_ERR_MASK                                                             0x00000100L
+#define RLC_RLCS_FED_STATUS_1__GL2C9_FED_ERR_MASK                                                             0x00000200L
+#define RLC_RLCS_FED_STATUS_1__GL2C10_FED_ERR_MASK                                                            0x00000400L
+#define RLC_RLCS_FED_STATUS_1__GL2C11_FED_ERR_MASK                                                            0x00000800L
+#define RLC_RLCS_FED_STATUS_1__GL2C12_FED_ERR_MASK                                                            0x00001000L
+#define RLC_RLCS_FED_STATUS_1__GL2C13_FED_ERR_MASK                                                            0x00002000L
+#define RLC_RLCS_FED_STATUS_1__GL2C14_FED_ERR_MASK                                                            0x00004000L
+#define RLC_RLCS_FED_STATUS_1__GL2C15_FED_ERR_MASK                                                            0x00008000L
 //RLC_CGTT_MGCG_OVERRIDE
 #define RLC_CGTT_MGCG_OVERRIDE__RLC_REPEATER_FGCG_OVERRIDE__SHIFT                                             0x0
 #define RLC_CGTT_MGCG_OVERRIDE__RLC_CGTT_SCLK_OVERRIDE__SHIFT                                                 0x1
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3
  2023-01-13  6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
  2023-01-13  6:23 ` [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers YiPeng Chai
@ 2023-01-13  6:23 ` YiPeng Chai
  2023-01-13  6:23 ` [PATCH 4/5] drm/amdgpu: Add gfx cp ecc error " YiPeng Chai
  2023-01-13  6:23 ` [PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3 YiPeng Chai
  3 siblings, 0 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13  6:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao Zhou, Hawking.Zhang, YiPeng Chai, Candice.Li, yipechai

Add gfx ras poison consumption irq handling on gfx v11_0_3.

V2:
  Move ras poison consumption irq handling code of gfx
     v11_0_3 to gfx_v11_0_3.c.
V5:
  Create dedicated irq handler for RLC_GC_FED_INTERRUPT.

V6:
  Remove invalid function call.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  4 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c        | 24 +++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c      | 50 ++++++++++++++++++-
 .../include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h  |  2 +
 4 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 6b26597217ed..0b39fe3cd624 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -210,6 +210,9 @@ struct amdgpu_gfx_ras {
 	struct amdgpu_ras_block_object  ras_block;
 	void (*enable_watchdog_timer)(struct amdgpu_device *adev);
 	bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
+	int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
+				struct amdgpu_irq_src *source,
+				struct amdgpu_iv_entry *entry);
 };
 
 struct amdgpu_gfx_funcs {
@@ -323,6 +326,7 @@ struct amdgpu_gfx {
 	struct amdgpu_irq_src		priv_inst_irq;
 	struct amdgpu_irq_src		cp_ecc_error_irq;
 	struct amdgpu_irq_src		sq_irq;
+	struct amdgpu_irq_src		rlc_gc_fed_irq;
 	struct sq_work			sq_work;
 
 	/* gfx status */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 82beb46788cf..cc634cae77d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1338,6 +1338,13 @@ static int gfx_v11_0_sw_init(void *handle)
 	if (r)
 		return r;
 
+	/* FED error */
+	r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GFX,
+				  GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT,
+				  &adev->gfx.rlc_gc_fed_irq);
+	if (r)
+		return r;
+
 	adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE;
 
 	if (adev->gfx.imu.funcs) {
@@ -6034,6 +6041,16 @@ static int gfx_v11_0_priv_inst_irq(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int gfx_v11_0_rlc_gc_fed_irq(struct amdgpu_device *adev,
+				  struct amdgpu_irq_src *source,
+				  struct amdgpu_iv_entry *entry)
+{
+	if (adev->gfx.ras && adev->gfx.ras->rlc_gc_fed_irq)
+		return adev->gfx.ras->rlc_gc_fed_irq(adev, source, entry);
+
+	return 0;
+}
+
 #if 0
 static int gfx_v11_0_kiq_set_interrupt_state(struct amdgpu_device *adev,
 					     struct amdgpu_irq_src *src,
@@ -6264,6 +6281,10 @@ static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_inst_irq_funcs = {
 	.process = gfx_v11_0_priv_inst_irq,
 };
 
+static const struct amdgpu_irq_src_funcs gfx_v11_0_rlc_gc_fed_irq_funcs = {
+	.process = gfx_v11_0_rlc_gc_fed_irq,
+};
+
 static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
 {
 	adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
@@ -6274,6 +6295,9 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
 
 	adev->gfx.priv_inst_irq.num_types = 1;
 	adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs;
+
+	adev->gfx.rlc_gc_fed_irq.num_types = 1; /* 0x80 FED error */
+	adev->gfx.rlc_gc_fed_irq.funcs = &gfx_v11_0_rlc_gc_fed_irq_funcs;
 }
 
 static void gfx_v11_0_set_imu_funcs(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 5966d984a30a..a18e09de31dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -22,6 +22,54 @@
  */
 
 #include "amdgpu.h"
+#include "soc21.h"
+#include "gc/gc_11_0_3_offset.h"
+#include "gc/gc_11_0_3_sh_mask.h"
+#include "ivsrcid/gfx/irqsrcs_gfx_11_0_0.h"
+#include "soc15.h"
+#include "soc15d.h"
+#include "gfx_v11_0.h"
 
 
-struct amdgpu_gfx_ras gfx_v11_0_3_ras;
+static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
+				  struct amdgpu_irq_src *source,
+				  struct amdgpu_iv_entry *entry)
+{
+	uint32_t rlc_status0 = 0, rlc_status1 = 0;
+	struct ras_common_if *ras_if = NULL;
+	struct ras_dispatch_if ih_data = {
+		.entry = entry,
+	};
+
+	rlc_status0 = RREG32(SOC15_REG_OFFSET(GC, 0, regRLC_RLCS_FED_STATUS_0));
+	rlc_status1 = RREG32(SOC15_REG_OFFSET(GC, 0, regRLC_RLCS_FED_STATUS_1));
+
+	if (!rlc_status0 && !rlc_status1) {
+		dev_warn(adev->dev, "RLC_GC_FED irq is generated, but rlc_status0 and rlc_status1 are empty!\n");
+		return 0;
+	}
+
+	/* Use RLC_RLCS_FED_STATUS_0/1 to distinguish FED error block. */
+	if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA0_FED_ERR) ||
+	    REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA1_FED_ERR))
+		ras_if = adev->sdma.ras_if;
+	else
+		ras_if = adev->gfx.ras_if;
+
+	if (!ras_if) {
+		dev_err(adev->dev, "Gfx or sdma ras block not initialized, rlc_status0:0x%x.\n",
+				rlc_status0);
+		return -EINVAL;
+	}
+
+	ih_data.head = *ras_if;
+
+	dev_warn(adev->dev, "RLC %s FED IRQ\n", ras_if->name);
+	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
+
+	return 0;
+}
+
+struct amdgpu_gfx_ras gfx_v11_0_3_ras = {
+	.rlc_gc_fed_irq = gfx_v11_0_3_rlc_gc_fed_irq,
+};
diff --git a/drivers/gpu/drm/amd/include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h b/drivers/gpu/drm/amd/include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h
index 9e8ed9f4bb15..3a4670bc4449 100644
--- a/drivers/gpu/drm/amd/include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h
+++ b/drivers/gpu/drm/amd/include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h
@@ -49,6 +49,8 @@
 #define GFX_11_0_0__SRCID__SDMA_SEM_INCOMPLETE_TIMEOUT          65      // 0x41 GPF(Sem incomplete timeout)
 #define GFX_11_0_0__SRCID__SDMA_SEM_WAIT_FAIL_TIMEOUT           66      // 0x42 Semaphore wait fail timeout
 
+#define GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT                 128     // 0x80 FED Interrupt (for data poisoning)
+
 #define GFX_11_0_0__SRCID__CP_GENERIC_INT				        177		// 0xB1 CP_GENERIC int
 #define GFX_11_0_0__SRCID__CP_PM4_PKT_RSVD_BIT_ERROR		    180		// 0xB4 PM4 Pkt Rsvd Bits Error
 #define GFX_11_0_0__SRCID__CP_EOP_INTERRUPT					    181		// 0xB5 End-of-Pipe Interrupt
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/5] drm/amdgpu: Add gfx cp ecc error irq handling on gfx v11_0_3
  2023-01-13  6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
  2023-01-13  6:23 ` [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers YiPeng Chai
  2023-01-13  6:23 ` [PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3 YiPeng Chai
@ 2023-01-13  6:23 ` YiPeng Chai
  2023-01-13  6:23 ` [PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3 YiPeng Chai
  3 siblings, 0 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13  6:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao Zhou, Hawking.Zhang, YiPeng Chai, Candice.Li, yipechai

V2:
  Optimize gfx_v11_0_set_cp_ecc_error_state function.

V3:
  Define macro constant for me pipe instance address interval.

V5:
  Register and handle gfx cp ecc error irq on gfx v11_0_3.

V6:
  Remove invalid intermediate function call.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 47 ++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index cc634cae77d7..39f48227bcf8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1338,6 +1338,13 @@ static int gfx_v11_0_sw_init(void *handle)
 	if (r)
 		return r;
 
+	/* ECC error */
+	r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP,
+				  GFX_11_0_0__SRCID__CP_ECC_ERROR,
+				  &adev->gfx.cp_ecc_error_irq);
+	if (r)
+		return r;
+
 	/* FED error */
 	r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GFX,
 				  GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT,
@@ -4434,6 +4441,7 @@ static int gfx_v11_0_hw_fini(void *handle)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int r;
 
+	amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
 	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
 	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 
@@ -5865,6 +5873,36 @@ static void gfx_v11_0_set_compute_eop_interrupt_state(struct amdgpu_device *adev
 	}
 }
 
+#define CP_ME1_PIPE_INST_ADDR_INTERVAL  0x1
+#define SET_ECC_ME_PIPE_STATE(reg_addr, state) \
+	do { \
+		uint32_t tmp = RREG32_SOC15_IP(GC, reg_addr); \
+		tmp = REG_SET_FIELD(tmp, CP_ME1_PIPE0_INT_CNTL, CP_ECC_ERROR_INT_ENABLE, state); \
+		WREG32_SOC15_IP(GC, reg_addr, tmp); \
+	} while (0)
+
+static int gfx_v11_0_set_cp_ecc_error_state(struct amdgpu_device *adev,
+							struct amdgpu_irq_src *source,
+							unsigned type,
+							enum amdgpu_interrupt_state state)
+{
+	uint32_t ecc_irq_state = 0;
+	uint32_t pipe0_int_cntl_addr = 0;
+	int i = 0;
+
+	ecc_irq_state = (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0;
+
+	pipe0_int_cntl_addr = SOC15_REG_OFFSET(GC, 0, regCP_ME1_PIPE0_INT_CNTL);
+
+	WREG32_FIELD15_PREREG(GC, 0, CP_INT_CNTL_RING0, CP_ECC_ERROR_INT_ENABLE, ecc_irq_state);
+
+	for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++)
+		SET_ECC_ME_PIPE_STATE(pipe0_int_cntl_addr + i * CP_ME1_PIPE_INST_ADDR_INTERVAL,
+					ecc_irq_state);
+
+	return 0;
+}
+
 static int gfx_v11_0_set_eop_interrupt_state(struct amdgpu_device *adev,
 					    struct amdgpu_irq_src *src,
 					    unsigned type,
@@ -6281,6 +6319,11 @@ static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_inst_irq_funcs = {
 	.process = gfx_v11_0_priv_inst_irq,
 };
 
+static const struct amdgpu_irq_src_funcs gfx_v11_0_cp_ecc_error_irq_funcs = {
+	.set = gfx_v11_0_set_cp_ecc_error_state,
+	.process = amdgpu_gfx_cp_ecc_error_irq,
+};
+
 static const struct amdgpu_irq_src_funcs gfx_v11_0_rlc_gc_fed_irq_funcs = {
 	.process = gfx_v11_0_rlc_gc_fed_irq,
 };
@@ -6296,8 +6339,12 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
 	adev->gfx.priv_inst_irq.num_types = 1;
 	adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs;
 
+	adev->gfx.cp_ecc_error_irq.num_types = 1; /* CP ECC error */
+	adev->gfx.cp_ecc_error_irq.funcs = &gfx_v11_0_cp_ecc_error_irq_funcs;
+
 	adev->gfx.rlc_gc_fed_irq.num_types = 1; /* 0x80 FED error */
 	adev->gfx.rlc_gc_fed_irq.funcs = &gfx_v11_0_rlc_gc_fed_irq_funcs;
+
 }
 
 static void gfx_v11_0_set_imu_funcs(struct amdgpu_device *adev)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3
  2023-01-13  6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
                   ` (2 preceding siblings ...)
  2023-01-13  6:23 ` [PATCH 4/5] drm/amdgpu: Add gfx cp ecc error " YiPeng Chai
@ 2023-01-13  6:23 ` YiPeng Chai
  3 siblings, 0 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13  6:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao Zhou, Hawking.Zhang, YiPeng Chai, Candice.Li, yipechai

Perform gpu reset after gfx finishes processing
ras poison consumption on gfx_v11_0_3.

V2:
 Move gfx poison consumption handler from hw_ops to ip
 function level.

V3:
 Adjust the calling position of amdgpu_gfx_poison_consumption_handler.

V4:
 Since gfx v11_0_3 does not have a .hw_ops instance, the .hw_ops null
 pointer check in amdgpu_ras_interrupt_poison_consumption_handler
 needs to be adjusted.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++++---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 13 +++++++++++++
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 09c42c00e43c..caf7fd3adcbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -731,6 +731,15 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
 	return 0;
 }
 
+int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
+						struct amdgpu_iv_entry *entry)
+{
+	if (adev->gfx.ras && adev->gfx.ras->poison_consumption_handler)
+		return adev->gfx.ras->poison_consumption_handler(adev, entry);
+
+	return 0;
+}
+
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
 		void *err_data,
 		struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 0b39fe3cd624..86ec9d0d12c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -213,6 +213,8 @@ struct amdgpu_gfx_ras {
 	int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
 				struct amdgpu_irq_src *source,
 				struct amdgpu_iv_entry *entry);
+	int (*poison_consumption_handler)(struct amdgpu_device *adev,
+						struct amdgpu_iv_entry *entry);
 };
 
 struct amdgpu_gfx_funcs {
@@ -437,4 +439,6 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
 void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t ucode_id);
 
 int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
+						struct amdgpu_iv_entry *entry);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d06beb884a16..0a95d1c1e7ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1620,14 +1620,14 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
-	if (!block_obj || !block_obj->hw_ops)
+	if (!block_obj)
 		return;
 
 	/* both query_poison_status and handle_poison_consumption are optional,
 	 * but at least one of them should be implemented if we need poison
 	 * consumption handler
 	 */
-	if (block_obj->hw_ops->query_poison_status) {
+	if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
 		poison_stat = block_obj->hw_ops->query_poison_status(adev);
 		if (!poison_stat) {
 			/* Not poison consumption interrupt, no need to handle it */
@@ -1641,7 +1641,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	if (!adev->gmc.xgmi.connected_to_cpu)
 		amdgpu_umc_poison_handler(adev, false);
 
-	if (block_obj->hw_ops->handle_poison_consumption)
+	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
 
 	/* gpu reset is fallback for failed and default cases */
@@ -1649,6 +1649,8 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
 				block_obj->ras_comm.name);
 		amdgpu_ras_reset_gpu(adev);
+	} else {
+		amdgpu_gfx_poison_consumption_handler(adev, entry);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index a18e09de31dd..b07a72ca25d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -70,6 +70,19 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
+					struct amdgpu_iv_entry *entry)
+{
+	/* Workaround: when vmid and pasid are both zero, trigger gpu reset in KGD. */
+	if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
+	    (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
+	     !entry->vmid && !entry->pasid)
+		amdgpu_ras_reset_gpu(adev);
+
+	return 0;
+}
+
 struct amdgpu_gfx_ras gfx_v11_0_3_ras = {
 	.rlc_gc_fed_irq = gfx_v11_0_3_rlc_gc_fed_irq,
+	.poison_consumption_handler = gfx_v11_0_3_poison_consumption_handler,
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-01-13  6:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-13  6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
2023-01-13  6:23 ` [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers YiPeng Chai
2023-01-13  6:23 ` [PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3 YiPeng Chai
2023-01-13  6:23 ` [PATCH 4/5] drm/amdgpu: Add gfx cp ecc error " YiPeng Chai
2023-01-13  6:23 ` [PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3 YiPeng Chai

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.