[PATCH v2 1/4] drm/amdgpu: Set fatal error detected flag earlier

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 1/4] drm/amdgpu: Set fatal error detected flag earlier
@ 2024-03-28  2:35 Lijo Lazar
  2024-03-28  2:35 ` [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags Lijo Lazar
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Lijo Lazar @ 2024-03-28  2:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Alexander.Deucher, kevinyang.wang

In case of fatal errors, set the FED (fatal error detected) status as soon
as the interrupt is received. Set the flag on the other devices in the hive
before the RAS recovery work proceeds.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
v2: Avoid accessing hive in interrupt handler as it may take mutex path (Kevin)

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 41 +++++++++++++++++--------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b8c7d0bf8fb1..352ce16a0963 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2399,6 +2399,19 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 	return ret;
 }
 
+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+				   struct amdgpu_hive_info *hive, bool status)
+{
+	struct amdgpu_device *tmp_adev;
+
+	if (hive) {
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+			amdgpu_ras_set_fed(tmp_adev, status);
+	} else {
+		amdgpu_ras_set_fed(adev, status);
+	}
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
 	struct amdgpu_ras *ras =
@@ -2408,8 +2421,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	struct list_head device_list, *device_list_handle =  NULL;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
-	if (hive)
+	if (hive) {
 		atomic_set(&hive->ras_recovery, 1);
+
+		/* If any device which is part of the hive received RAS fatal
+		 * error interrupt, set fatal error status on all. This
+		 * condition will need a recovery, and flag will be cleared
+		 * as part of recovery.
+		 */
+		list_for_each_entry(remote_adev, &hive->device_list,
+				    gmc.xgmi.head)
+			if (amdgpu_ras_get_fed_status(remote_adev)) {
+				amdgpu_ras_set_fed_all(adev, hive, true);
+				break;
+			}
+	}
 	if (!ras->disable_ras_err_cnt_harvest) {
 
 		/* Build list of devices to query RAS related errors */
@@ -2454,18 +2480,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
 				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
-				/* For any RAS error that needs a full reset to
-				 * recover, set the fatal error status
-				 */
-				if (hive) {
-					list_for_each_entry(remote_adev,
-							    &hive->device_list,
-							    gmc.xgmi.head)
-						amdgpu_ras_set_fed(remote_adev,
-								   true);
-				} else {
-					amdgpu_ras_set_fed(adev, true);
-				}
 				psp_fatal_error_recovery_quirk(&adev->psp);
 			}
 		}
@@ -3550,6 +3564,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 		RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
 			      "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
 
+		amdgpu_ras_set_fed(adev, true);
 		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
 		amdgpu_ras_reset_gpu(adev);
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags
  2024-03-28  2:35 [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Lijo Lazar
@ 2024-03-28  2:35 ` Lijo Lazar
  2024-03-28  3:27   ` Wang, Yang(Kevin)
  2024-04-01 11:15   ` Kamal, Asad
  2024-03-28  2:35 ` [PATCH v2 3/4] drm/amd/pm: Add special handling for RAS messages Lijo Lazar
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 10+ messages in thread
From: Lijo Lazar @ 2024-03-28  2:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Alexander.Deucher, kevinyang.wang

Add flags to categorize messages and PMFW capabilities.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 ++-
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  | 7 +++++++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a870bdd49a4e..aa835df7ba1a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -458,7 +458,7 @@ struct smu_umd_pstate_table {
 struct cmn2asic_msg_mapping {
 	int	valid_mapping;
 	int	map_to;
-	int	valid_in_vf;
+	uint32_t flags;
 };
 
 struct cmn2asic_mapping {
@@ -538,6 +538,7 @@ struct smu_context {
 	uint32_t smc_driver_if_version;
 	uint32_t smc_fw_if_version;
 	uint32_t smc_fw_version;
+	uint32_t smc_fw_caps;
 
 	bool uploading_custom_pp_table;
 	bool dc_controlled_by_gpio;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index af427cc7dbb8..c48214e3dc8e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -445,4 +445,11 @@ enum smu_feature_mask {
 	SMU_FEATURE_COUNT,
 };
 
+/* Message category flags */
+#define SMU_MSG_VF_FLAG			(1U << 0)
+#define SMU_MSG_RAS_PRI			(1U << 1)
+
+/* Firmware capability flags */
+#define SMU_FW_CAP_RAS_PRI		(1U << 0)
+
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index b8dbd4e25348..3227e514e8ae 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -437,7 +437,7 @@ int smu_cmn_to_asic_specific_index(struct smu_context *smu,
 			return -EINVAL;
 
 		if (amdgpu_sriov_vf(smu->adev) &&
-		    !msg_mapping.valid_in_vf)
+		    !(msg_mapping.flags & SMU_MSG_VF_FLAG))
 			return -EACCES;
 
 		return msg_mapping.map_to;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 3/4] drm/amd/pm: Add special handling for RAS messages
  2024-03-28  2:35 [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Lijo Lazar
  2024-03-28  2:35 ` [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags Lijo Lazar
@ 2024-03-28  2:35 ` Lijo Lazar
  2024-03-28  2:35 ` [PATCH v2 4/4] drm/amd/pm: Categorize RAS messages on SMUv13.0.6 Lijo Lazar
  2024-04-01 11:33 ` [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Kamal, Asad
  3 siblings, 0 replies; 10+ messages in thread
From: Lijo Lazar @ 2024-03-28  2:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Alexander.Deucher, kevinyang.wang

When a RAS fatal error is detected, PMFW will only process priority
messages. Other messages won't be taken up for processing and therefore
won't get any response in such a state.

Add logic to filter out non-priority messages when a RAS error is
detected. Also, don't poll the response status register before
sending priority messages. Use the firmware capability flag to determine
whether this priority-based filtering applies.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 65 +++++++++++++++++++++++---
 1 file changed, 59 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 3227e514e8ae..6d1c3af927ca 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -235,6 +235,50 @@ static void __smu_cmn_send_msg(struct smu_context *smu,
 	WREG32(smu->msg_reg, msg);
 }
 
+static inline uint32_t __smu_cmn_get_msg_flags(struct smu_context *smu,
+					       enum smu_message_type msg)
+{
+	return smu->message_map[msg].flags;
+}
+
+static int __smu_cmn_ras_filter_msg(struct smu_context *smu,
+				    enum smu_message_type msg, bool *poll)
+{
+	struct amdgpu_device *adev = smu->adev;
+	uint32_t flags, resp;
+	bool fed_status;
+
+	flags = __smu_cmn_get_msg_flags(smu, msg);
+	*poll = true;
+
+	/* When there is RAS fatal error, FW won't process non-RAS priority
+	 * messages. Don't allow any messages other than RAS priority messages.
+	 */
+	fed_status = amdgpu_ras_get_fed_status(adev);
+	if (fed_status) {
+		if (!(flags & SMU_MSG_RAS_PRI)) {
+			dev_dbg(adev->dev,
+				"RAS error detected, skip sending %s",
+				smu_get_message_name(smu, msg));
+			return -EACCES;
+		}
+
+		/* FW will ignore non-priority messages when a RAS fatal error
+		 * is detected. Hence it is possible that a previous message
+		 * wouldn't have got response. Allow to continue without polling
+		 * for response status for priority messages.
+		 */
+		resp = RREG32(smu->resp_reg);
+		dev_dbg(adev->dev,
+			"Sending RAS priority message %s response status: %x",
+			smu_get_message_name(smu, msg), resp);
+		if (resp == 0)
+			*poll = false;
+	}
+
+	return 0;
+}
+
 static int __smu_cmn_send_debug_msg(struct smu_context *smu,
 			       u32 msg,
 			       u32 param)
@@ -354,6 +398,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 {
 	struct amdgpu_device *adev = smu->adev;
 	int res, index;
+	bool poll = true;
 	u32 reg;
 
 	if (adev->no_hw_access)
@@ -366,12 +411,20 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 		return index == -EACCES ? 0 : index;
 
 	mutex_lock(&smu->message_lock);
-	reg = __smu_cmn_poll_stat(smu);
-	res = __smu_cmn_reg2errno(smu, reg);
-	if (reg == SMU_RESP_NONE ||
-	    res == -EREMOTEIO) {
-		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
-		goto Out;
+
+	if (smu->smc_fw_caps & SMU_FW_CAP_RAS_PRI) {
+		res = __smu_cmn_ras_filter_msg(smu, msg, &poll);
+		if (res)
+			goto Out;
+	}
+
+	if (poll) {
+		reg = __smu_cmn_poll_stat(smu);
+		res = __smu_cmn_reg2errno(smu, reg);
+		if (reg == SMU_RESP_NONE || res == -EREMOTEIO) {
+			__smu_cmn_reg_print_error(smu, reg, index, param, msg);
+			goto Out;
+		}
 	}
 	__smu_cmn_send_msg(smu, (uint16_t) index, param);
 	reg = __smu_cmn_poll_stat(smu);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 4/4] drm/amd/pm: Categorize RAS messages on SMUv13.0.6
  2024-03-28  2:35 [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Lijo Lazar
  2024-03-28  2:35 ` [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags Lijo Lazar
  2024-03-28  2:35 ` [PATCH v2 3/4] drm/amd/pm: Add special handling for RAS messages Lijo Lazar
@ 2024-03-28  2:35 ` Lijo Lazar
  2024-03-28  8:01   ` Zhang, Hawking
  2024-04-01 11:33 ` [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Kamal, Asad
  3 siblings, 1 reply; 10+ messages in thread
From: Lijo Lazar @ 2024-03-28  2:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Alexander.Deucher, kevinyang.wang

Set the RAS priority handling capability for SMUv13.0.6 SOCs and mark the
messages that are allowed as RAS priority messages.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
v2: Move setting FW capability flags to IP specific code (Kevin)

 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 443233563a52..6e06729fb2e3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -144,7 +144,7 @@ static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COU
 	MSG_MAP(GetDpmFreqByIndex,		     PPSMC_MSG_GetDpmFreqByIndex,		1),
 	MSG_MAP(SetPptLimit,			     PPSMC_MSG_SetPptLimit,			0),
 	MSG_MAP(GetPptLimit,			     PPSMC_MSG_GetPptLimit,			1),
-	MSG_MAP(GfxDeviceDriverReset,		     PPSMC_MSG_GfxDriverReset,			0),
+	MSG_MAP(GfxDeviceDriverReset,		     PPSMC_MSG_GfxDriverReset,			SMU_MSG_RAS_PRI),
 	MSG_MAP(DramLogSetDramAddrHigh,		     PPSMC_MSG_DramLogSetDramAddrHigh,		0),
 	MSG_MAP(DramLogSetDramAddrLow,		     PPSMC_MSG_DramLogSetDramAddrLow,		0),
 	MSG_MAP(DramLogSetDramSize,		     PPSMC_MSG_DramLogSetDramSize,		0),
@@ -167,10 +167,10 @@ static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COU
 	MSG_MAP(GetCTFLimit,                         PPSMC_MSG_GetCTFLimit,                     0),
 	MSG_MAP(GetThermalLimit,                     PPSMC_MSG_ReadThrottlerLimit,              0),
 	MSG_MAP(ClearMcaOnRead,	                     PPSMC_MSG_ClearMcaOnRead,                  0),
-	MSG_MAP(QueryValidMcaCount,                  PPSMC_MSG_QueryValidMcaCount,              0),
-	MSG_MAP(QueryValidMcaCeCount,                PPSMC_MSG_QueryValidMcaCeCount,            0),
-	MSG_MAP(McaBankDumpDW,                       PPSMC_MSG_McaBankDumpDW,                   0),
-	MSG_MAP(McaBankCeDumpDW,                     PPSMC_MSG_McaBankCeDumpDW,                 0),
+	MSG_MAP(QueryValidMcaCount,                  PPSMC_MSG_QueryValidMcaCount,              SMU_MSG_RAS_PRI),
+	MSG_MAP(QueryValidMcaCeCount,                PPSMC_MSG_QueryValidMcaCeCount,            SMU_MSG_RAS_PRI),
+	MSG_MAP(McaBankDumpDW,                       PPSMC_MSG_McaBankDumpDW,                   SMU_MSG_RAS_PRI),
+	MSG_MAP(McaBankCeDumpDW,                     PPSMC_MSG_McaBankCeDumpDW,                 SMU_MSG_RAS_PRI),
 	MSG_MAP(SelectPLPDMode,                      PPSMC_MSG_SelectPLPDMode,                  0),
 	MSG_MAP(RmaDueToBadPageThreshold,            PPSMC_MSG_RmaDueToBadPageThreshold,        0),
 };
@@ -3218,6 +3218,7 @@ void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
 	smu->feature_map = smu_v13_0_6_feature_mask_map;
 	smu->table_map = smu_v13_0_6_table_map;
 	smu->smc_driver_if_version = SMU13_0_6_DRIVER_IF_VERSION;
+	smu->smc_fw_caps |= SMU_FW_CAP_RAS_PRI;
 	smu_v13_0_set_smu_mailbox_registers(smu);
 	amdgpu_mca_smu_init_funcs(smu->adev, &smu_v13_0_6_mca_smu_funcs);
 	amdgpu_aca_set_smu_funcs(smu->adev, &smu_v13_0_6_aca_smu_funcs);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags
  2024-03-28  2:35 ` [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags Lijo Lazar
@ 2024-03-28  3:27   ` Wang, Yang(Kevin)
  2024-03-28  3:33     ` Lazar, Lijo
  2024-04-01 11:15   ` Kamal, Asad
  1 sibling, 1 reply; 10+ messages in thread
From: Wang, Yang(Kevin) @ 2024-03-28  3:27 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Zhang, Hawking, Deucher, Alexander

[AMD Official Use Only - General]

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lijo Lazar
Sent: Thursday, March 28, 2024 10:36 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
Subject: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags

Add flags to categorize messages and PMFW capabilities.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 ++-
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  | 7 +++++++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a870bdd49a4e..aa835df7ba1a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -458,7 +458,7 @@ struct smu_umd_pstate_table {  struct cmn2asic_msg_mapping {
        int     valid_mapping;
        int     map_to;
-       int     valid_in_vf;
+       uint32_t flags;
 };

 struct cmn2asic_mapping {
@@ -538,6 +538,7 @@ struct smu_context {
        uint32_t smc_driver_if_version;
        uint32_t smc_fw_if_version;
        uint32_t smc_fw_version;
+       uint32_t smc_fw_caps;

        bool uploading_custom_pp_table;
        bool dc_controlled_by_gpio;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index af427cc7dbb8..c48214e3dc8e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -445,4 +445,11 @@ enum smu_feature_mask {
        SMU_FEATURE_COUNT,
 };

+/* Message category flags */
+#define SMU_MSG_VF_FLAG                        (1U << 0)
+#define SMU_MSG_RAS_PRI                        (1U << 1)
+
+/* Firmware capability flags */
+#define SMU_FW_CAP_RAS_PRI             (1U << 0)
+
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index b8dbd4e25348..3227e514e8ae 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -437,7 +437,7 @@ int smu_cmn_to_asic_specific_index(struct smu_context *smu,
                        return -EINVAL;

                if (amdgpu_sriov_vf(smu->adev) &&
-                   !msg_mapping.valid_in_vf)
+                   !(msg_mapping.flags & SMU_MSG_VF_FLAG))
                        return -EACCES;
[kevin]:

Is it possible to use smc_fw_caps uniformly to handle sriov cases (like FED/RAS msg.flags), which would look more reasonable?
+       smu->smc_fw_caps |= SMU_FW_CAP_VF ?

Best Regards,
Kevin

                return msg_mapping.map_to;
--
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags
  2024-03-28  3:27   ` Wang, Yang(Kevin)
@ 2024-03-28  3:33     ` Lazar, Lijo
  0 siblings, 0 replies; 10+ messages in thread
From: Lazar, Lijo @ 2024-03-28  3:33 UTC (permalink / raw)
  To: Wang, Yang(Kevin), amd-gfx; +Cc: Zhang, Hawking, Deucher, Alexander



On 3/28/2024 8:57 AM, Wang, Yang(Kevin) wrote:
> [AMD Official Use Only - General]
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lijo Lazar
> Sent: Thursday, March 28, 2024 10:36 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
> Subject: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags
> 
> Add flags to categorize messages and PMFW capabilities.
> 
> Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
> ---
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 ++-  drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  | 7 +++++++
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 2 +-
>  3 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index a870bdd49a4e..aa835df7ba1a 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -458,7 +458,7 @@ struct smu_umd_pstate_table {  struct cmn2asic_msg_mapping {
>         int     valid_mapping;
>         int     map_to;
> -       int     valid_in_vf;
> +       uint32_t flags;
>  };
> 
>  struct cmn2asic_mapping {
> @@ -538,6 +538,7 @@ struct smu_context {
>         uint32_t smc_driver_if_version;
>         uint32_t smc_fw_if_version;
>         uint32_t smc_fw_version;
> +       uint32_t smc_fw_caps;
> 
>         bool uploading_custom_pp_table;
>         bool dc_controlled_by_gpio;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index af427cc7dbb8..c48214e3dc8e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -445,4 +445,11 @@ enum smu_feature_mask {
>         SMU_FEATURE_COUNT,
>  };
> 
> +/* Message category flags */
> +#define SMU_MSG_VF_FLAG                        (1U << 0)
> +#define SMU_MSG_RAS_PRI                        (1U << 1)
> +
> +/* Firmware capability flags */
> +#define SMU_FW_CAP_RAS_PRI             (1U << 0)
> +
>  #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index b8dbd4e25348..3227e514e8ae 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -437,7 +437,7 @@ int smu_cmn_to_asic_specific_index(struct smu_context *smu,
>                         return -EINVAL;
> 
>                 if (amdgpu_sriov_vf(smu->adev) &&
> -                   !msg_mapping.valid_in_vf)
> +                   !(msg_mapping.flags & SMU_MSG_VF_FLAG))
>                         return -EACCES;
> [kevin]:
> 
> Is it possible to use smc_fw_caps uniformly to handle sriov cases (likes FED/ras msg.flags) , which would look more reasonable?
> +       smu->smc_fw_caps |= SMU_FW_CAP_VF ?

Presently, most FW supports handling VF messages, which is why another
capability is not added.

Messages need to be categorized even if the FW capability is there.

Thanks,
Lijo

> 
> Best Regards,
> Kevin
> 
>                 return msg_mapping.map_to;
> --
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH v2 4/4] drm/amd/pm: Categorize RAS messages on SMUv13.0.6
  2024-03-28  2:35 ` [PATCH v2 4/4] drm/amd/pm: Categorize RAS messages on SMUv13.0.6 Lijo Lazar
@ 2024-03-28  8:01   ` Zhang, Hawking
  0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2024-03-28  8:01 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Deucher, Alexander, Wang, Yang(Kevin)

[AMD Official Use Only - General]

Series is

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com>
Sent: Thursday, March 28, 2024 10:36
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
Subject: [PATCH v2 4/4] drm/amd/pm: Categorize RAS messages on SMUv13.0.6

Set RAS priority handling capability for SMUv13.0.6 SOCs and categorize RAS priority messages allowed.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
v2: Move setting FW capability flags to IP specific code (Kevin)

 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 443233563a52..6e06729fb2e3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -144,7 +144,7 @@ static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COU
        MSG_MAP(GetDpmFreqByIndex,                   PPSMC_MSG_GetDpmFreqByIndex,               1),
        MSG_MAP(SetPptLimit,                         PPSMC_MSG_SetPptLimit,                     0),
        MSG_MAP(GetPptLimit,                         PPSMC_MSG_GetPptLimit,                     1),
-       MSG_MAP(GfxDeviceDriverReset,                PPSMC_MSG_GfxDriverReset,                  0),
+       MSG_MAP(GfxDeviceDriverReset,                PPSMC_MSG_GfxDriverReset,                  SMU_MSG_RAS_PRI),
        MSG_MAP(DramLogSetDramAddrHigh,              PPSMC_MSG_DramLogSetDramAddrHigh,          0),
        MSG_MAP(DramLogSetDramAddrLow,               PPSMC_MSG_DramLogSetDramAddrLow,           0),
        MSG_MAP(DramLogSetDramSize,                  PPSMC_MSG_DramLogSetDramSize,              0),
@@ -167,10 +167,10 @@ static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COU
        MSG_MAP(GetCTFLimit,                         PPSMC_MSG_GetCTFLimit,                     0),
        MSG_MAP(GetThermalLimit,                     PPSMC_MSG_ReadThrottlerLimit,              0),
        MSG_MAP(ClearMcaOnRead,                      PPSMC_MSG_ClearMcaOnRead,                  0),
-       MSG_MAP(QueryValidMcaCount,                  PPSMC_MSG_QueryValidMcaCount,              0),
-       MSG_MAP(QueryValidMcaCeCount,                PPSMC_MSG_QueryValidMcaCeCount,            0),
-       MSG_MAP(McaBankDumpDW,                       PPSMC_MSG_McaBankDumpDW,                   0),
-       MSG_MAP(McaBankCeDumpDW,                     PPSMC_MSG_McaBankCeDumpDW,                 0),
+       MSG_MAP(QueryValidMcaCount,                  PPSMC_MSG_QueryValidMcaCount,              SMU_MSG_RAS_PRI),
+       MSG_MAP(QueryValidMcaCeCount,                PPSMC_MSG_QueryValidMcaCeCount,            SMU_MSG_RAS_PRI),
+       MSG_MAP(McaBankDumpDW,                       PPSMC_MSG_McaBankDumpDW,                   SMU_MSG_RAS_PRI),
+       MSG_MAP(McaBankCeDumpDW,                     PPSMC_MSG_McaBankCeDumpDW,                 SMU_MSG_RAS_PRI),
        MSG_MAP(SelectPLPDMode,                      PPSMC_MSG_SelectPLPDMode,                  0),
        MSG_MAP(RmaDueToBadPageThreshold,            PPSMC_MSG_RmaDueToBadPageThreshold,        0),
 };
@@ -3218,6 +3218,7 @@ void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
        smu->feature_map = smu_v13_0_6_feature_mask_map;
        smu->table_map = smu_v13_0_6_table_map;
        smu->smc_driver_if_version = SMU13_0_6_DRIVER_IF_VERSION;
+       smu->smc_fw_caps |= SMU_FW_CAP_RAS_PRI;
        smu_v13_0_set_smu_mailbox_registers(smu);
        amdgpu_mca_smu_init_funcs(smu->adev, &smu_v13_0_6_mca_smu_funcs);
        amdgpu_aca_set_smu_funcs(smu->adev, &smu_v13_0_6_aca_smu_funcs);
--
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags
  2024-03-28  2:35 ` [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags Lijo Lazar
  2024-03-28  3:27   ` Wang, Yang(Kevin)
@ 2024-04-01 11:15   ` Kamal, Asad
  2024-04-01 11:20     ` Lazar, Lijo
  1 sibling, 1 reply; 10+ messages in thread
From: Kamal, Asad @ 2024-04-01 11:15 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx
  Cc: Zhang, Hawking, Deucher, Alexander, Wang, Yang(Kevin)

[AMD Official Use Only - General]

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lijo Lazar
Sent: Thursday, March 28, 2024 8:06 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
Subject: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags

Add flags to categorize messages and PMFW capabilities.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 ++-
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  | 7 +++++++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a870bdd49a4e..aa835df7ba1a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -458,7 +458,7 @@ struct smu_umd_pstate_table {  struct cmn2asic_msg_mapping {
        int     valid_mapping;
        int     map_to;
-       int     valid_in_vf;
+       uint32_t flags;
 };

[Kamal, Asad] Do we need to change the following macro to take flags rather than valid_in_vf?
#define MSG_MAP(msg, index, valid_in_vf) \
        [SMU_MSG_##msg] = {1, (index), (valid_in_vf)}

Thanks & Regards
Asad

 struct cmn2asic_mapping {
@@ -538,6 +538,7 @@ struct smu_context {
        uint32_t smc_driver_if_version;
        uint32_t smc_fw_if_version;
        uint32_t smc_fw_version;
+       uint32_t smc_fw_caps;

        bool uploading_custom_pp_table;
        bool dc_controlled_by_gpio;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index af427cc7dbb8..c48214e3dc8e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -445,4 +445,11 @@ enum smu_feature_mask {
        SMU_FEATURE_COUNT,
 };

+/* Message category flags */
+#define SMU_MSG_VF_FLAG                        (1U << 0)
+#define SMU_MSG_RAS_PRI                        (1U << 1)
+
+/* Firmware capability flags */
+#define SMU_FW_CAP_RAS_PRI             (1U << 0)
+
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index b8dbd4e25348..3227e514e8ae 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -437,7 +437,7 @@ int smu_cmn_to_asic_specific_index(struct smu_context *smu,
                        return -EINVAL;

                if (amdgpu_sriov_vf(smu->adev) &&
-                   !msg_mapping.valid_in_vf)
+                   !(msg_mapping.flags & SMU_MSG_VF_FLAG))
                        return -EACCES;

                return msg_mapping.map_to;
--
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags
  2024-04-01 11:15   ` Kamal, Asad
@ 2024-04-01 11:20     ` Lazar, Lijo
  0 siblings, 0 replies; 10+ messages in thread
From: Lazar, Lijo @ 2024-04-01 11:20 UTC (permalink / raw)
  To: Kamal, Asad, amd-gfx
  Cc: Zhang, Hawking, Deucher, Alexander, Wang, Yang(Kevin)



On 4/1/2024 4:45 PM, Kamal, Asad wrote:
> [AMD Official Use Only - General]
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lijo Lazar
> Sent: Thursday, March 28, 2024 8:06 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
> Subject: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags
> 
> Add flags to categorize messages and PMFW capabilities.
> 
> Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
> ---
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 ++-  drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  | 7 +++++++
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 2 +-
>  3 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index a870bdd49a4e..aa835df7ba1a 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -458,7 +458,7 @@ struct smu_umd_pstate_table {  struct cmn2asic_msg_mapping {
>         int     valid_mapping;
>         int     map_to;
> -       int     valid_in_vf;
> +       uint32_t flags;
>  };
> 
> [Kamal, Asad] Do we need to change the following macro , to have flags rather than valid_in_vf
> #define MSG_MAP(msg, index, valid_in_vf) \
>         [SMU_MSG_##msg] = {1, (index), (valid_in_vf)}
> 

Yes, will rename this to flags before submitting.

Thanks,
Lijo

> Thanks & Regards
> Asad
> 
>  struct cmn2asic_mapping {
> @@ -538,6 +538,7 @@ struct smu_context {
>         uint32_t smc_driver_if_version;
>         uint32_t smc_fw_if_version;
>         uint32_t smc_fw_version;
> +       uint32_t smc_fw_caps;
> 
>         bool uploading_custom_pp_table;
>         bool dc_controlled_by_gpio;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index af427cc7dbb8..c48214e3dc8e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -445,4 +445,11 @@ enum smu_feature_mask {
>         SMU_FEATURE_COUNT,
>  };
> 
> +/* Message category flags */
> +#define SMU_MSG_VF_FLAG                        (1U << 0)
> +#define SMU_MSG_RAS_PRI                        (1U << 1)
> +
> +/* Firmware capability flags */
> +#define SMU_FW_CAP_RAS_PRI             (1U << 0)
> +
>  #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index b8dbd4e25348..3227e514e8ae 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -437,7 +437,7 @@ int smu_cmn_to_asic_specific_index(struct smu_context *smu,
>                         return -EINVAL;
> 
>                 if (amdgpu_sriov_vf(smu->adev) &&
> -                   !msg_mapping.valid_in_vf)
> +                   !(msg_mapping.flags & SMU_MSG_VF_FLAG))
>                         return -EACCES;
> 
>                 return msg_mapping.map_to;
> --
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier
  2024-03-28  2:35 [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Lijo Lazar
                   ` (2 preceding siblings ...)
  2024-03-28  2:35 ` [PATCH v2 4/4] drm/amd/pm: Categorize RAS messages on SMUv13.0.6 Lijo Lazar
@ 2024-04-01 11:33 ` Kamal, Asad
  3 siblings, 0 replies; 10+ messages in thread
From: Kamal, Asad @ 2024-04-01 11:33 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx
  Cc: Zhang, Hawking, Deucher, Alexander, Wang, Yang(Kevin)

[AMD Official Use Only - General]

Series is
Reviewed-by: Asad Kamal <asad.kamal@amd.com>

Thanks & Regards
Asad


-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lijo Lazar
Sent: Thursday, March 28, 2024 8:06 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>
Subject: [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier

In case of fatal errors, set the FED (fatal error detected) status as soon as the interrupt is received. Set the flag on the other devices in the hive before RAS recovery work starts.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
v2: Avoid accessing hive in interrupt handler as it may take mutex path (Kevin)

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 41 +++++++++++++++++--------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b8c7d0bf8fb1..352ce16a0963 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2399,6 +2399,19 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
        return ret;
 }

+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+                                  struct amdgpu_hive_info *hive, bool status)
+{
+       struct amdgpu_device *tmp_adev;
+
+       if (hive) {
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+                       amdgpu_ras_set_fed(tmp_adev, status);
+       } else {
+               amdgpu_ras_set_fed(adev, status);
+       }
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
        struct amdgpu_ras *ras =
@@ -2408,8 +2421,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
        struct list_head device_list, *device_list_handle =  NULL;
        struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

-       if (hive)
+       if (hive) {
                atomic_set(&hive->ras_recovery, 1);
+
+               /* If any device which is part of the hive received RAS fatal
+                * error interrupt, set fatal error status on all. This
+                * condition will need a recovery, and flag will be cleared
+                * as part of recovery.
+                */
+               list_for_each_entry(remote_adev, &hive->device_list,
+                                   gmc.xgmi.head)
+                       if (amdgpu_ras_get_fed_status(remote_adev)) {
+                               amdgpu_ras_set_fed_all(adev, hive, true);
+                               break;
+                       }
+       }
        if (!ras->disable_ras_err_cnt_harvest) {

                /* Build list of devices to query RAS related errors */
@@ -2454,18 +2480,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                                ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                                set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

-                               /* For any RAS error that needs a full reset to
-                                * recover, set the fatal error status
-                                */
-                               if (hive) {
-                                       list_for_each_entry(remote_adev,
-                                                           &hive->device_list,
-                                                           gmc.xgmi.head)
-                                               amdgpu_ras_set_fed(remote_adev,
-                                                                  true);
-                               } else {
-                                       amdgpu_ras_set_fed(adev, true);
-                               }
                                psp_fatal_error_recovery_quirk(&adev->psp);
                        }
                }
@@ -3550,6 +3564,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
                RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
                              "(ERREVENT_ATHUB_INTERRUPT) detected!\n");

+               amdgpu_ras_set_fed(adev, true);
                ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                amdgpu_ras_reset_gpu(adev);
        }
--
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-04-01 11:33 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-28  2:35 [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Lijo Lazar
2024-03-28  2:35 ` [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags Lijo Lazar
2024-03-28  3:27   ` Wang, Yang(Kevin)
2024-03-28  3:33     ` Lazar, Lijo
2024-04-01 11:15   ` Kamal, Asad
2024-04-01 11:20     ` Lazar, Lijo
2024-03-28  2:35 ` [PATCH v2 3/4] drm/amd/pm: Add special handling for RAS messages Lijo Lazar
2024-03-28  2:35 ` [PATCH v2 4/4] drm/amd/pm: Categorize RAS messages on SMUv13.0.6 Lijo Lazar
2024-03-28  8:01   ` Zhang, Hawking
2024-04-01 11:33 ` [PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier Kamal, Asad

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.