* [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3
@ 2023-01-13 6:23 YiPeng Chai
2023-01-13 6:23 ` [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers YiPeng Chai
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13 6:23 UTC (permalink / raw)
To: amd-gfx; +Cc: Tao Zhou, Hawking.Zhang, YiPeng Chai, Candice.Li, yipechai
Add gfx ras function on gfx v11_0_3.
V2:
1. Add separate source files for gfx v11_0_3.
2. Create a common function to initialize gfx ras block.
V3:
1. Rename amdgpu_gfx_ras_block_init to amdgpu_gfx_ras_sw_init.
2. Adjust the calling position of amdgpu_gfx_ras_sw_init.
3. Remove gfx_v11_0_3_ras_ops.
V4:
Revert changes in amdgpu_ras_interrupt_poison_consumption_handler.
V5:
1. Remove invalid include file in gfx_v11_0_3.c.
2. Reduce the number of parameters of amdgpu_gfx_ras_sw_init.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/Makefile | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 35 ++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 +
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 13 +++++++++
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 27 ++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h | 29 ++++++++++++++++++++
6 files changed, 106 insertions(+)
create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 332cf8bda7a2..5df603192cdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -137,6 +137,7 @@ amdgpu-y += \
gfx_v10_0.o \
imu_v11_0.o \
gfx_v11_0.o \
+ gfx_v11_0_3.o \
imu_v11_0_3.o
# add async DMA block
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 42a939cd2eac..09c42c00e43c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -696,6 +696,41 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
return r;
}
+int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
+{
+ int err = 0;
+ struct amdgpu_gfx_ras *ras = NULL;
+
+ /* adev->gfx.ras is NULL, which means gfx does not
+ * support ras function, then do nothing here.
+ */
+ if (!adev->gfx.ras)
+ return 0;
+
+ ras = adev->gfx.ras;
+
+ err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+ if (err) {
+ dev_err(adev->dev, "Failed to register gfx ras block!\n");
+ return err;
+ }
+
+ strcpy(ras->ras_block.ras_comm.name, "gfx");
+ ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__GFX;
+ ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+ adev->gfx.ras_if = &ras->ras_block.ras_comm;
+
+ /* If not define special ras_late_init function, use gfx default ras_late_init */
+ if (!ras->ras_block.ras_late_init)
+ ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+ /* If not defined special ras_cb function, use default ras_cb */
+ if (!ras->ras_block.ras_cb)
+ ras->ras_block.ras_cb = amdgpu_gfx_process_ras_data_cb;
+
+ return 0;
+}
+
int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index b3df4787877e..6b26597217ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -432,4 +432,5 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t ucode_id);
+int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 259ebf0356db..82beb46788cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -46,6 +46,7 @@
#include "clearstate_gfx11.h"
#include "v11_structs.h"
#include "gfx_v11_0.h"
+#include "gfx_v11_0_3.h"
#include "nbio_v4_3.h"
#include "mes_v11_0.h"
@@ -852,7 +853,14 @@ static int gfx_v11_0_gpu_early_init(struct amdgpu_device *adev)
switch (adev->ip_versions[GC_HWIP][0]) {
case IP_VERSION(11, 0, 0):
case IP_VERSION(11, 0, 2):
+ adev->gfx.config.max_hw_contexts = 8;
+ adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
+ adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
+ adev->gfx.config.sc_hiz_tile_fifo_size = 0;
+ adev->gfx.config.sc_earlyz_tile_fifo_size = 0x4C0;
+ break;
case IP_VERSION(11, 0, 3):
+ adev->gfx.ras = &gfx_v11_0_3_ras;
adev->gfx.config.max_hw_contexts = 8;
adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
@@ -1422,6 +1430,11 @@ static int gfx_v11_0_sw_init(void *handle)
if (r)
return r;
+ if (amdgpu_gfx_ras_sw_init(adev)) {
+ dev_err(adev->dev, "Failed to initialize gfx ras block!\n");
+ return -EINVAL;
+ }
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
new file mode 100644
index 000000000000..5966d984a30a
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu.h"
+
+
+struct amdgpu_gfx_ras gfx_v11_0_3_ras;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h
new file mode 100644
index 000000000000..7095abddcbc0
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2023 dvanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __GFX_V11_0_3_H__
+#define __GFX_V11_0_3_H__
+
+extern struct amdgpu_gfx_ras gfx_v11_0_3_ras;
+
+#endif
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers
2023-01-13 6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
@ 2023-01-13 6:23 ` YiPeng Chai
2023-01-13 6:23 ` [PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3 YiPeng Chai
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13 6:23 UTC (permalink / raw)
To: amd-gfx
Cc: Tao Zhou, YiPeng Chai, yipechai, Alex Deucher, Candice.Li, Hawking.Zhang
V2:
Add RLC_RLCS_FED_STATUS_0 and RLC_RLCS_FED_STATUS_1 register
offset and shift masks.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
.../include/asic_reg/gc/gc_11_0_3_offset.h | 8 +++
.../include/asic_reg/gc/gc_11_0_3_sh_mask.h | 50 +++++++++++++++++++
2 files changed, 58 insertions(+)
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
index 3b95a59b196c..56e00252bff8 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
@@ -3593,6 +3593,14 @@
#define regGCL2TLB_PERFCOUNTER_RSLT_CNTL_BASE_IDX 1
+// addressBlock: gc_rlcsdec
+// base address: 0x3b980
+#define regRLC_RLCS_FED_STATUS_0 0x4eff
+#define regRLC_RLCS_FED_STATUS_0_BASE_IDX 1
+#define regRLC_RLCS_FED_STATUS_1 0x4f00
+#define regRLC_RLCS_FED_STATUS_1_BASE_IDX 1
+
+
// addressBlock: gc_gcvml2pspdec
// base address: 0x3f900
#define regGCUTCL2_TRANSLATION_BYPASS_BY_VMID 0x5e41
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
index ae3ef8a9e702..658e88a8e2ac 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
@@ -37642,6 +37642,56 @@
#define RLC_RLCG_DOORBELL_RANGE__LOWER_ADDR_MASK 0x00000FFCL
#define RLC_RLCG_DOORBELL_RANGE__UPPER_ADDR_RESERVED_MASK 0x00030000L
#define RLC_RLCG_DOORBELL_RANGE__UPPER_ADDR_MASK 0x0FFC0000L
+//RLC_RLCS_FED_STATUS_0
+#define RLC_RLCS_FED_STATUS_0__RLC_FED_ERR__SHIFT 0x0
+#define RLC_RLCS_FED_STATUS_0__UTCL2_FED_ERR__SHIFT 0x1
+#define RLC_RLCS_FED_STATUS_0__GE_FED_ERR__SHIFT 0x2
+#define RLC_RLCS_FED_STATUS_0__CPC_FED_ERR__SHIFT 0x3
+#define RLC_RLCS_FED_STATUS_0__CPF_FED_ERR__SHIFT 0x4
+#define RLC_RLCS_FED_STATUS_0__CPG_FED_ERR__SHIFT 0x5
+#define RLC_RLCS_FED_STATUS_0__SDMA0_FED_ERR__SHIFT 0x6
+#define RLC_RLCS_FED_STATUS_0__SDMA1_FED_ERR__SHIFT 0x7
+#define RLC_RLCS_FED_STATUS_0__RLC_FED_ERR_MASK 0x00000001L
+#define RLC_RLCS_FED_STATUS_0__UTCL2_FED_ERR_MASK 0x00000002L
+#define RLC_RLCS_FED_STATUS_0__GE_FED_ERR_MASK 0x00000004L
+#define RLC_RLCS_FED_STATUS_0__CPC_FED_ERR_MASK 0x00000008L
+#define RLC_RLCS_FED_STATUS_0__CPF_FED_ERR_MASK 0x00000010L
+#define RLC_RLCS_FED_STATUS_0__CPG_FED_ERR_MASK 0x00000020L
+#define RLC_RLCS_FED_STATUS_0__SDMA0_FED_ERR_MASK 0x00000040L
+#define RLC_RLCS_FED_STATUS_0__SDMA1_FED_ERR_MASK 0x00000080L
+//RLC_RLCS_FED_STATUS_1
+#define RLC_RLCS_FED_STATUS_1__GL2C0_FED_ERR__SHIFT 0x0
+#define RLC_RLCS_FED_STATUS_1__GL2C1_FED_ERR__SHIFT 0x1
+#define RLC_RLCS_FED_STATUS_1__GL2C2_FED_ERR__SHIFT 0x2
+#define RLC_RLCS_FED_STATUS_1__GL2C3_FED_ERR__SHIFT 0x3
+#define RLC_RLCS_FED_STATUS_1__GL2C4_FED_ERR__SHIFT 0x4
+#define RLC_RLCS_FED_STATUS_1__GL2C5_FED_ERR__SHIFT 0x5
+#define RLC_RLCS_FED_STATUS_1__GL2C6_FED_ERR__SHIFT 0x6
+#define RLC_RLCS_FED_STATUS_1__GL2C7_FED_ERR__SHIFT 0x7
+#define RLC_RLCS_FED_STATUS_1__GL2C8_FED_ERR__SHIFT 0x8
+#define RLC_RLCS_FED_STATUS_1__GL2C9_FED_ERR__SHIFT 0x9
+#define RLC_RLCS_FED_STATUS_1__GL2C10_FED_ERR__SHIFT 0xa
+#define RLC_RLCS_FED_STATUS_1__GL2C11_FED_ERR__SHIFT 0xb
+#define RLC_RLCS_FED_STATUS_1__GL2C12_FED_ERR__SHIFT 0xc
+#define RLC_RLCS_FED_STATUS_1__GL2C13_FED_ERR__SHIFT 0xd
+#define RLC_RLCS_FED_STATUS_1__GL2C14_FED_ERR__SHIFT 0xe
+#define RLC_RLCS_FED_STATUS_1__GL2C15_FED_ERR__SHIFT 0xf
+#define RLC_RLCS_FED_STATUS_1__GL2C0_FED_ERR_MASK 0x00000001L
+#define RLC_RLCS_FED_STATUS_1__GL2C1_FED_ERR_MASK 0x00000002L
+#define RLC_RLCS_FED_STATUS_1__GL2C2_FED_ERR_MASK 0x00000004L
+#define RLC_RLCS_FED_STATUS_1__GL2C3_FED_ERR_MASK 0x00000008L
+#define RLC_RLCS_FED_STATUS_1__GL2C4_FED_ERR_MASK 0x00000010L
+#define RLC_RLCS_FED_STATUS_1__GL2C5_FED_ERR_MASK 0x00000020L
+#define RLC_RLCS_FED_STATUS_1__GL2C6_FED_ERR_MASK 0x00000040L
+#define RLC_RLCS_FED_STATUS_1__GL2C7_FED_ERR_MASK 0x00000080L
+#define RLC_RLCS_FED_STATUS_1__GL2C8_FED_ERR_MASK 0x00000100L
+#define RLC_RLCS_FED_STATUS_1__GL2C9_FED_ERR_MASK 0x00000200L
+#define RLC_RLCS_FED_STATUS_1__GL2C10_FED_ERR_MASK 0x00000400L
+#define RLC_RLCS_FED_STATUS_1__GL2C11_FED_ERR_MASK 0x00000800L
+#define RLC_RLCS_FED_STATUS_1__GL2C12_FED_ERR_MASK 0x00001000L
+#define RLC_RLCS_FED_STATUS_1__GL2C13_FED_ERR_MASK 0x00002000L
+#define RLC_RLCS_FED_STATUS_1__GL2C14_FED_ERR_MASK 0x00004000L
+#define RLC_RLCS_FED_STATUS_1__GL2C15_FED_ERR_MASK 0x00008000L
//RLC_CGTT_MGCG_OVERRIDE
#define RLC_CGTT_MGCG_OVERRIDE__RLC_REPEATER_FGCG_OVERRIDE__SHIFT 0x0
#define RLC_CGTT_MGCG_OVERRIDE__RLC_CGTT_SCLK_OVERRIDE__SHIFT 0x1
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3
2023-01-13 6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
2023-01-13 6:23 ` [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers YiPeng Chai
@ 2023-01-13 6:23 ` YiPeng Chai
2023-01-13 6:23 ` [PATCH 4/5] drm/amdgpu: Add gfx cp ecc error " YiPeng Chai
2023-01-13 6:23 ` [PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3 YiPeng Chai
3 siblings, 0 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13 6:23 UTC (permalink / raw)
To: amd-gfx; +Cc: Tao Zhou, Hawking.Zhang, YiPeng Chai, Candice.Li, yipechai
Add gfx ras poison consumption irq handling on gfx v11_0_3.
V2:
Move ras poison consumption irq handling code of gfx
v11_0_3 to gfx_v11_0_3.c.
V5:
Create dedicated irq handler for RLC_GC_FED_INTERRUPT.
V6:
Remove invalid function call.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 24 +++++++++
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 50 ++++++++++++++++++-
.../include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h | 2 +
4 files changed, 79 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 6b26597217ed..0b39fe3cd624 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -210,6 +210,9 @@ struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
+ int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry);
};
struct amdgpu_gfx_funcs {
@@ -323,6 +326,7 @@ struct amdgpu_gfx {
struct amdgpu_irq_src priv_inst_irq;
struct amdgpu_irq_src cp_ecc_error_irq;
struct amdgpu_irq_src sq_irq;
+ struct amdgpu_irq_src rlc_gc_fed_irq;
struct sq_work sq_work;
/* gfx status */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 82beb46788cf..cc634cae77d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1338,6 +1338,13 @@ static int gfx_v11_0_sw_init(void *handle)
if (r)
return r;
+ /* FED error */
+ r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GFX,
+ GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT,
+ &adev->gfx.rlc_gc_fed_irq);
+ if (r)
+ return r;
+
adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE;
if (adev->gfx.imu.funcs) {
@@ -6034,6 +6041,16 @@ static int gfx_v11_0_priv_inst_irq(struct amdgpu_device *adev,
return 0;
}
+static int gfx_v11_0_rlc_gc_fed_irq(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+{
+ if (adev->gfx.ras && adev->gfx.ras->rlc_gc_fed_irq)
+ return adev->gfx.ras->rlc_gc_fed_irq(adev, source, entry);
+
+ return 0;
+}
+
#if 0
static int gfx_v11_0_kiq_set_interrupt_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *src,
@@ -6264,6 +6281,10 @@ static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_inst_irq_funcs = {
.process = gfx_v11_0_priv_inst_irq,
};
+static const struct amdgpu_irq_src_funcs gfx_v11_0_rlc_gc_fed_irq_funcs = {
+ .process = gfx_v11_0_rlc_gc_fed_irq,
+};
+
static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
{
adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
@@ -6274,6 +6295,9 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
adev->gfx.priv_inst_irq.num_types = 1;
adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs;
+
+ adev->gfx.rlc_gc_fed_irq.num_types = 1; /* 0x80 FED error */
+ adev->gfx.rlc_gc_fed_irq.funcs = &gfx_v11_0_rlc_gc_fed_irq_funcs;
}
static void gfx_v11_0_set_imu_funcs(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 5966d984a30a..a18e09de31dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -22,6 +22,54 @@
*/
#include "amdgpu.h"
+#include "soc21.h"
+#include "gc/gc_11_0_3_offset.h"
+#include "gc/gc_11_0_3_sh_mask.h"
+#include "ivsrcid/gfx/irqsrcs_gfx_11_0_0.h"
+#include "soc15.h"
+#include "soc15d.h"
+#include "gfx_v11_0.h"
-struct amdgpu_gfx_ras gfx_v11_0_3_ras;
+static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+{
+ uint32_t rlc_status0 = 0, rlc_status1 = 0;
+ struct ras_common_if *ras_if = NULL;
+ struct ras_dispatch_if ih_data = {
+ .entry = entry,
+ };
+
+ rlc_status0 = RREG32(SOC15_REG_OFFSET(GC, 0, regRLC_RLCS_FED_STATUS_0));
+ rlc_status1 = RREG32(SOC15_REG_OFFSET(GC, 0, regRLC_RLCS_FED_STATUS_1));
+
+ if (!rlc_status0 && !rlc_status1) {
+ dev_warn(adev->dev, "RLC_GC_FED irq is generated, but rlc_status0 and rlc_status1 are empty!\n");
+ return 0;
+ }
+
+ /* Use RLC_RLCS_FED_STATUS_0/1 to distinguish FED error block. */
+ if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA0_FED_ERR) ||
+ REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA1_FED_ERR))
+ ras_if = adev->sdma.ras_if;
+ else
+ ras_if = adev->gfx.ras_if;
+
+ if (!ras_if) {
+ dev_err(adev->dev, "Gfx or sdma ras block not initialized, rlc_status0:0x%x.\n",
+ rlc_status0);
+ return -EINVAL;
+ }
+
+ ih_data.head = *ras_if;
+
+ dev_warn(adev->dev, "RLC %s FED IRQ\n", ras_if->name);
+ amdgpu_ras_interrupt_dispatch(adev, &ih_data);
+
+ return 0;
+}
+
+struct amdgpu_gfx_ras gfx_v11_0_3_ras = {
+ .rlc_gc_fed_irq = gfx_v11_0_3_rlc_gc_fed_irq,
+};
diff --git a/drivers/gpu/drm/amd/include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h b/drivers/gpu/drm/amd/include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h
index 9e8ed9f4bb15..3a4670bc4449 100644
--- a/drivers/gpu/drm/amd/include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h
+++ b/drivers/gpu/drm/amd/include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h
@@ -49,6 +49,8 @@
#define GFX_11_0_0__SRCID__SDMA_SEM_INCOMPLETE_TIMEOUT 65 // 0x41 GPF(Sem incomplete timeout)
#define GFX_11_0_0__SRCID__SDMA_SEM_WAIT_FAIL_TIMEOUT 66 // 0x42 Semaphore wait fail timeout
+#define GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT 128 // 0x80 FED Interrupt (for data poisoning)
+
#define GFX_11_0_0__SRCID__CP_GENERIC_INT 177 // 0xB1 CP_GENERIC int
#define GFX_11_0_0__SRCID__CP_PM4_PKT_RSVD_BIT_ERROR 180 // 0xB4 PM4 Pkt Rsvd Bits Error
#define GFX_11_0_0__SRCID__CP_EOP_INTERRUPT 181 // 0xB5 End-of-Pipe Interrupt
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 4/5] drm/amdgpu: Add gfx cp ecc error irq handling on gfx v11_0_3
2023-01-13 6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
2023-01-13 6:23 ` [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers YiPeng Chai
2023-01-13 6:23 ` [PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3 YiPeng Chai
@ 2023-01-13 6:23 ` YiPeng Chai
2023-01-13 6:23 ` [PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3 YiPeng Chai
3 siblings, 0 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13 6:23 UTC (permalink / raw)
To: amd-gfx; +Cc: Tao Zhou, Hawking.Zhang, YiPeng Chai, Candice.Li, yipechai
V2:
Optimize gfx_v11_0_set_cp_ecc_error_state function.
V3:
Define macro constant for me pipe instance address interval.
V5:
Register and handle gfx cp ecc error irq on gfx v11_0_3.
V6:
Remove invalid intermediate function call.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 47 ++++++++++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index cc634cae77d7..39f48227bcf8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1338,6 +1338,13 @@ static int gfx_v11_0_sw_init(void *handle)
if (r)
return r;
+ /* ECC error */
+ r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP,
+ GFX_11_0_0__SRCID__CP_ECC_ERROR,
+ &adev->gfx.cp_ecc_error_irq);
+ if (r)
+ return r;
+
/* FED error */
r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GFX,
GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT,
@@ -4434,6 +4441,7 @@ static int gfx_v11_0_hw_fini(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
int r;
+ amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
@@ -5865,6 +5873,36 @@ static void gfx_v11_0_set_compute_eop_interrupt_state(struct amdgpu_device *adev
}
}
+#define CP_ME1_PIPE_INST_ADDR_INTERVAL 0x1
+#define SET_ECC_ME_PIPE_STATE(reg_addr, state) \
+ do { \
+ uint32_t tmp = RREG32_SOC15_IP(GC, reg_addr); \
+ tmp = REG_SET_FIELD(tmp, CP_ME1_PIPE0_INT_CNTL, CP_ECC_ERROR_INT_ENABLE, state); \
+ WREG32_SOC15_IP(GC, reg_addr, tmp); \
+ } while (0)
+
+static int gfx_v11_0_set_cp_ecc_error_state(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ unsigned type,
+ enum amdgpu_interrupt_state state)
+{
+ uint32_t ecc_irq_state = 0;
+ uint32_t pipe0_int_cntl_addr = 0;
+ int i = 0;
+
+ ecc_irq_state = (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0;
+
+ pipe0_int_cntl_addr = SOC15_REG_OFFSET(GC, 0, regCP_ME1_PIPE0_INT_CNTL);
+
+ WREG32_FIELD15_PREREG(GC, 0, CP_INT_CNTL_RING0, CP_ECC_ERROR_INT_ENABLE, ecc_irq_state);
+
+ for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++)
+ SET_ECC_ME_PIPE_STATE(pipe0_int_cntl_addr + i * CP_ME1_PIPE_INST_ADDR_INTERVAL,
+ ecc_irq_state);
+
+ return 0;
+}
+
static int gfx_v11_0_set_eop_interrupt_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *src,
unsigned type,
@@ -6281,6 +6319,11 @@ static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_inst_irq_funcs = {
.process = gfx_v11_0_priv_inst_irq,
};
+static const struct amdgpu_irq_src_funcs gfx_v11_0_cp_ecc_error_irq_funcs = {
+ .set = gfx_v11_0_set_cp_ecc_error_state,
+ .process = amdgpu_gfx_cp_ecc_error_irq,
+};
+
static const struct amdgpu_irq_src_funcs gfx_v11_0_rlc_gc_fed_irq_funcs = {
.process = gfx_v11_0_rlc_gc_fed_irq,
};
@@ -6296,8 +6339,12 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
adev->gfx.priv_inst_irq.num_types = 1;
adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs;
+ adev->gfx.cp_ecc_error_irq.num_types = 1; /* CP ECC error */
+ adev->gfx.cp_ecc_error_irq.funcs = &gfx_v11_0_cp_ecc_error_irq_funcs;
+
adev->gfx.rlc_gc_fed_irq.num_types = 1; /* 0x80 FED error */
adev->gfx.rlc_gc_fed_irq.funcs = &gfx_v11_0_rlc_gc_fed_irq_funcs;
+
}
static void gfx_v11_0_set_imu_funcs(struct amdgpu_device *adev)
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3
2023-01-13 6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
` (2 preceding siblings ...)
2023-01-13 6:23 ` [PATCH 4/5] drm/amdgpu: Add gfx cp ecc error " YiPeng Chai
@ 2023-01-13 6:23 ` YiPeng Chai
3 siblings, 0 replies; 5+ messages in thread
From: YiPeng Chai @ 2023-01-13 6:23 UTC (permalink / raw)
To: amd-gfx; +Cc: Tao Zhou, Hawking.Zhang, YiPeng Chai, Candice.Li, yipechai
Perform gpu reset after gfx finishes processing
ras poison consumption on gfx_v11_0_3.
V2:
Move gfx poison consumption handler from hw_ops to ip
function level.
V3:
Adjust the calling position of amdgpu_gfx_poison_consumation_handler.
V4:
Since gfx v11_0_3 does not have .hw_ops instance, the .hw_ops null
pointer check in amdgpu_ras_interrupt_poison_consumption_handler
needs to be adjusted.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 9 +++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 +++++---
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 13 +++++++++++++
4 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 09c42c00e43c..caf7fd3adcbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -731,6 +731,15 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
return 0;
}
+int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
+ struct amdgpu_iv_entry *entry)
+{
+ if (adev->gfx.ras && adev->gfx.ras->poison_consumption_handler)
+ return adev->gfx.ras->poison_consumption_handler(adev, entry);
+
+ return 0;
+}
+
int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 0b39fe3cd624..86ec9d0d12c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -213,6 +213,8 @@ struct amdgpu_gfx_ras {
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
+ int (*poison_consumption_handler)(struct amdgpu_device *adev,
+ struct amdgpu_iv_entry *entry);
};
struct amdgpu_gfx_funcs {
@@ -437,4 +439,6 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t ucode_id);
int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
+ struct amdgpu_iv_entry *entry);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d06beb884a16..0a95d1c1e7ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1620,14 +1620,14 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
- if (!block_obj || !block_obj->hw_ops)
+ if (!block_obj)
return;
/* both query_poison_status and handle_poison_consumption are optional,
* but at least one of them should be implemented if we need poison
* consumption handler
*/
- if (block_obj->hw_ops->query_poison_status) {
+ if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
poison_stat = block_obj->hw_ops->query_poison_status(adev);
if (!poison_stat) {
/* Not poison consumption interrupt, no need to handle it */
@@ -1641,7 +1641,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (!adev->gmc.xgmi.connected_to_cpu)
amdgpu_umc_poison_handler(adev, false);
- if (block_obj->hw_ops->handle_poison_consumption)
+ if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
/* gpu reset is fallback for failed and default cases */
@@ -1649,6 +1649,8 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
+ } else {
+ amdgpu_gfx_poison_consumption_handler(adev, entry);
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index a18e09de31dd..b07a72ca25d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -70,6 +70,19 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
return 0;
}
+static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
+ struct amdgpu_iv_entry *entry)
+{
+ /* Workaround: when vmid and pasid are both zero, trigger gpu reset in KGD. */
+ if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
+ (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
+ !entry->vmid && !entry->pasid)
+ amdgpu_ras_reset_gpu(adev);
+
+ return 0;
+}
+
struct amdgpu_gfx_ras gfx_v11_0_3_ras = {
.rlc_gc_fed_irq = gfx_v11_0_3_rlc_gc_fed_irq,
+ .poison_consumption_handler = gfx_v11_0_3_poison_consumption_handler,
};
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2023-01-13 6:25 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-13 6:23 [PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3 YiPeng Chai
2023-01-13 6:23 ` [PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers YiPeng Chai
2023-01-13 6:23 ` [PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3 YiPeng Chai
2023-01-13 6:23 ` [PATCH 4/5] drm/amdgpu: Add gfx cp ecc error " YiPeng Chai
2023-01-13 6:23 ` [PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3 YiPeng Chai
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.