[PATCH] drm/amdgpu: add SMU debug option support

* [PATCH] drm/amdgpu: add SMU debug option support
@ 2021-11-30  5:17 Lang Yu
  2021-11-30 11:18 ` Christian König
  2021-12-01  5:14 ` Lazar, Lijo
  0 siblings, 2 replies; 12+ messages in thread
From: Lang Yu @ 2021-11-30  5:17 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Lijo Lazar, Huang Rui, Lang Yu, Christian Koenig

Add SMU debug option support. When an SMU error occurs, this option
preserves the system error state, which aids in debugging SMU
firmware issues.

It can be enabled or disabled via the amdgpu_smu_debug
debugfs file. When enabled, it makes SMU errors fatal.
It is disabled by default.

== Command Guide ==

1. Enable the SMU debug option

 # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug

2. Disable the SMU debug option

 # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug

v2:
 - Resend the command on timeout. (Lijo)
 - Use debugfs file instead of module parameter.

Signed-off-by: Lang Yu <lang.yu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++--
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..f9412de86599 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -39,6 +39,8 @@
 
 #if defined(CONFIG_DEBUG_FS)
 
+extern int amdgpu_smu_debug;
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf,
 	return result;
 }
 
+
+
 static const struct file_operations amdgpu_debugfs_regs2_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl,
@@ -1609,6 +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
 DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
 			amdgpu_debugfs_sclk_set, "%llu\n");
 
+static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val)
+{
+	*val = amdgpu_smu_debug;
+	return 0;
+}
+
+static int amdgpu_debugfs_smu_debug_set(void *data, u64 val)
+{
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	amdgpu_smu_debug = val;
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
+			 amdgpu_debugfs_smu_debug_get,
+			 amdgpu_debugfs_smu_debug_set,
+			 "%llu\n");
+
 int amdgpu_debugfs_init(struct amdgpu_device *adev)
 {
 	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 		return PTR_ERR(ent);
 	}
 
+	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
+				  &fops_smu_debug);
+	if (IS_ERR(ent)) {
+		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n");
+		return PTR_ERR(ent);
+	}
+
+
 	/* Register debugfs entries for amdgpu_ttm */
 	amdgpu_ttm_debugfs_init(adev);
 	amdgpu_debugfs_pm_init(adev);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 048ca1673863..b3969d7933d3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -55,6 +55,14 @@
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)	#type
+
+/*
+ * Used to enable SMU debug option. When enabled, it makes SMU errors fatal.
+ * This will aid in debugging SMU firmware issues.
+ * (0 = disabled (default), 1 = enabled)
+ */
+int amdgpu_smu_debug;
+
 static const char * const __smu_message_names[] = {
 	SMU_MESSAGE_TYPES
 };
@@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
 	__smu_cmn_send_msg(smu, msg_index, param);
 	res = 0;
 Out:
+	if (unlikely(amdgpu_smu_debug == 1) && res) {
+		mutex_unlock(&smu->message_lock);
+		BUG();
+	}
+
 	return res;
 }
 
@@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
 int smu_cmn_wait_for_response(struct smu_context *smu)
 {
 	u32 reg;
+	int res;
 
 	reg = __smu_cmn_poll_stat(smu);
-	return __smu_cmn_reg2errno(smu, reg);
+	res = __smu_cmn_reg2errno(smu, reg);
+
+	if (unlikely(amdgpu_smu_debug == 1) && res) {
+		mutex_unlock(&smu->message_lock);
+		BUG();
+	}
+
+	return res;
 }
 
 /**
@@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 				    uint32_t param,
 				    uint32_t *read_arg)
 {
+	int retry_count = 0;
 	int res, index;
 	u32 reg;
 
@@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
 		goto Out;
 	}
+retry:
 	__smu_cmn_send_msg(smu, (uint16_t) index, param);
 	reg = __smu_cmn_poll_stat(smu);
 	res = __smu_cmn_reg2errno(smu, reg);
-	if (res != 0)
+	if (res != 0) {
 		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
+		if ((res == -ETIME) && (retry_count++ < 1)) {
+			usleep_range(500, 1000);
+			dev_err(smu->adev->dev,
+				"SMU: resend command: index:%d param:0x%08X message:%s",
+				index, param, smu_get_message_name(smu, msg));
+			goto retry;
+		}
+		goto Out;
+	}
 	if (read_arg)
 		smu_cmn_read_arg(smu, read_arg);
 Out:
 	mutex_unlock(&smu->message_lock);
+
+	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
+
 	return res;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread