All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: add SMU debug option support
@ 2021-11-30  5:17 Lang Yu
  2021-11-30 11:18 ` Christian König
  2021-12-01  5:14 ` Lazar, Lijo
  0 siblings, 2 replies; 12+ messages in thread
From: Lang Yu @ 2021-11-30  5:17 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Lijo Lazar, Huang Rui, Lang Yu, Christian Koenig

To maintain system error state when SMU errors occurred,
which will aid in debugging SMU firmware issues,
add SMU debug option support.

It can be enabled or disabled via amdgpu_smu_debug
debugfs file. When enabled, it makes SMU errors fatal.
It is disabled by default.

== Command Guide ==

1, enable SMU debug option

 # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug

2, disable SMU debug option

 # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug

v2:
 - Resend command when timeout.(Lijo)
 - Use debugfs file instead of module parameter.

Signed-off-by: Lang Yu <lang.yu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++--
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..f9412de86599 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -39,6 +39,8 @@
 
 #if defined(CONFIG_DEBUG_FS)
 
+extern int amdgpu_smu_debug;
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf,
 	return result;
 }
 
+
+
 static const struct file_operations amdgpu_debugfs_regs2_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl,
@@ -1609,6 +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
 DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
 			amdgpu_debugfs_sclk_set, "%llu\n");
 
+static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val)
+{
+	*val = amdgpu_smu_debug;
+	return 0;
+}
+
+static int amdgpu_debugfs_smu_debug_set(void *data, u64 val)
+{
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	amdgpu_smu_debug = val;
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
+			 amdgpu_debugfs_smu_debug_get,
+			 amdgpu_debugfs_smu_debug_set,
+			 "%llu\n");
+
 int amdgpu_debugfs_init(struct amdgpu_device *adev)
 {
 	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 		return PTR_ERR(ent);
 	}
 
+	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
+				  &fops_smu_debug);
+	if (IS_ERR(ent)) {
+		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n");
+		return PTR_ERR(ent);
+	}
+
+
 	/* Register debugfs entries for amdgpu_ttm */
 	amdgpu_ttm_debugfs_init(adev);
 	amdgpu_debugfs_pm_init(adev);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 048ca1673863..b3969d7933d3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -55,6 +55,14 @@
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)	#type
+
+/*
+ * Used to enable SMU debug option. When enabled, it makes SMU errors fatal.
+ * This will aid in debugging SMU firmware issues.
+ * (0 = disabled (default), 1 = enabled)
+ */
+int amdgpu_smu_debug;
+
 static const char * const __smu_message_names[] = {
 	SMU_MESSAGE_TYPES
 };
@@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
 	__smu_cmn_send_msg(smu, msg_index, param);
 	res = 0;
 Out:
+	if (unlikely(amdgpu_smu_debug == 1) && res) {
+		mutex_unlock(&smu->message_lock);
+		BUG();
+	}
+
 	return res;
 }
 
@@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
 int smu_cmn_wait_for_response(struct smu_context *smu)
 {
 	u32 reg;
+	int res;
 
 	reg = __smu_cmn_poll_stat(smu);
-	return __smu_cmn_reg2errno(smu, reg);
+	res = __smu_cmn_reg2errno(smu, reg);
+
+	if (unlikely(amdgpu_smu_debug == 1) && res) {
+		mutex_unlock(&smu->message_lock);
+		BUG();
+	}
+
+	return res;
 }
 
 /**
@@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 				    uint32_t param,
 				    uint32_t *read_arg)
 {
+	int retry_count = 0;
 	int res, index;
 	u32 reg;
 
@@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
 		goto Out;
 	}
+retry:
 	__smu_cmn_send_msg(smu, (uint16_t) index, param);
 	reg = __smu_cmn_poll_stat(smu);
 	res = __smu_cmn_reg2errno(smu, reg);
-	if (res != 0)
+	if (res != 0) {
 		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
+		if ((res == -ETIME) && (retry_count++ < 1)) {
+			usleep_range(500, 1000);
+			dev_err(smu->adev->dev,
+				"SMU: resend command: index:%d param:0x%08X message:%s",
+				index, param, smu_get_message_name(smu, msg));
+			goto retry;
+		}
+		goto Out;
+	}
 	if (read_arg)
 		smu_cmn_read_arg(smu, read_arg);
 Out:
 	mutex_unlock(&smu->message_lock);
+
+	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
+
 	return res;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/amdgpu: add SMU debug option support
  2021-11-30  5:17 [PATCH] drm/amdgpu: add SMU debug option support Lang Yu
@ 2021-11-30 11:18 ` Christian König
  2021-12-01  2:28   ` Lang Yu
  2021-12-01  5:14 ` Lazar, Lijo
  1 sibling, 1 reply; 12+ messages in thread
From: Christian König @ 2021-11-30 11:18 UTC (permalink / raw)
  To: Lang Yu, amd-gfx; +Cc: Alex Deucher, Lijo Lazar, Huang Rui

Am 30.11.21 um 06:17 schrieb Lang Yu:
> To maintain system error state when SMU errors occurred,
> which will aid in debugging SMU firmware issues,
> add SMU debug option support.
>
> It can be enabled or disabled via amdgpu_smu_debug
> debugfs file. When enabled, it makes SMU errors fatal.
> It is disabled by default.
>
> == Command Guide ==
>
> 1, enable SMU debug option
>
>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>
> 2, disable SMU debug option
>
>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>
> v2:
>   - Resend command when timeout.(Lijo)
>   - Use debugfs file instead of module parameter.
>
> Signed-off-by: Lang Yu <lang.yu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++--
>   2 files changed, 69 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..f9412de86599 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -39,6 +39,8 @@
>   
>   #if defined(CONFIG_DEBUG_FS)
>   
> +extern int amdgpu_smu_debug;
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf,
>   	return result;
>   }
>   
> +
> +

Unrelated change.

>   static const struct file_operations amdgpu_debugfs_regs2_fops = {
>   	.owner = THIS_MODULE,
>   	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl,
> @@ -1609,6 +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val)
> +{
> +	*val = amdgpu_smu_debug;
> +	return 0;
> +}
> +
> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val)
> +{
> +	if (val != 0 && val != 1)
> +		return -EINVAL;
> +
> +	amdgpu_smu_debug = val;
> +	return 0;
> +}
> +
> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
> +			 amdgpu_debugfs_smu_debug_get,
> +			 amdgpu_debugfs_smu_debug_set,
> +			 "%llu\n");
> +

That can be done much simpler. Take a look at the debugfs_create_bool() 
function.

>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   		return PTR_ERR(ent);
>   	}
>   
> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
> +				  &fops_smu_debug);
> +	if (IS_ERR(ent)) {
> +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n");
> +		return PTR_ERR(ent);
> +	}
> +
> +
>   	/* Register debugfs entries for amdgpu_ttm */
>   	amdgpu_ttm_debugfs_init(adev);
>   	amdgpu_debugfs_pm_init(adev);
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 048ca1673863..b3969d7933d3 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -55,6 +55,14 @@
>   
>   #undef __SMU_DUMMY_MAP
>   #define __SMU_DUMMY_MAP(type)	#type
> +
> +/*
> + * Used to enable SMU debug option. When enabled, it makes SMU errors fatal.
> + * This will aid in debugging SMU firmware issues.
> + * (0 = disabled (default), 1 = enabled)
> + */
> +int amdgpu_smu_debug;

Probably better to put that into amdgpu_device or similar structure.

Regards,
Christian.

> +
>   static const char * const __smu_message_names[] = {
>   	SMU_MESSAGE_TYPES
>   };
> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>   	__smu_cmn_send_msg(smu, msg_index, param);
>   	res = 0;
>   Out:
> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
> +		mutex_unlock(&smu->message_lock);
> +		BUG();
> +	}
> +
>   	return res;
>   }
>   
> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>   int smu_cmn_wait_for_response(struct smu_context *smu)
>   {
>   	u32 reg;
> +	int res;
>   
>   	reg = __smu_cmn_poll_stat(smu);
> -	return __smu_cmn_reg2errno(smu, reg);
> +	res = __smu_cmn_reg2errno(smu, reg);
> +
> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
> +		mutex_unlock(&smu->message_lock);
> +		BUG();
> +	}
> +
> +	return res;
>   }
>   
>   /**
> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   				    uint32_t param,
>   				    uint32_t *read_arg)
>   {
> +	int retry_count = 0;
>   	int res, index;
>   	u32 reg;
>   
> @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>   		goto Out;
>   	}
> +retry:
>   	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>   	reg = __smu_cmn_poll_stat(smu);
>   	res = __smu_cmn_reg2errno(smu, reg);
> -	if (res != 0)
> +	if (res != 0) {
>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> +		if ((res == -ETIME) && (retry_count++ < 1)) {
> +			usleep_range(500, 1000);
> +			dev_err(smu->adev->dev,
> +				"SMU: resend command: index:%d param:0x%08X message:%s",
> +				index, param, smu_get_message_name(smu, msg));
> +			goto retry;
> +		}
> +		goto Out;
> +	}
>   	if (read_arg)
>   		smu_cmn_read_arg(smu, read_arg);
>   Out:
>   	mutex_unlock(&smu->message_lock);
> +
> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
> +
>   	return res;
>   }
>   


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/amdgpu: add SMU debug option support
  2021-11-30 11:18 ` Christian König
@ 2021-12-01  2:28   ` Lang Yu
  0 siblings, 0 replies; 12+ messages in thread
From: Lang Yu @ 2021-12-01  2:28 UTC (permalink / raw)
  To: Christian KKKnig; +Cc: Alex Deucher, Lijo Lazar, Huang Rui, amd-gfx

On 11/30/ , Christian KKKnig wrote:
> Am 30.11.21 um 06:17 schrieb Lang Yu:
> > To maintain system error state when SMU errors occurred,
> > which will aid in debugging SMU firmware issues,
> > add SMU debug option support.
> > 
> > It can be enabled or disabled via amdgpu_smu_debug
> > debugfs file. When enabled, it makes SMU errors fatal.
> > It is disabled by default.
> > 
> > == Command Guide ==
> > 
> > 1, enable SMU debug option
> > 
> >   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> > 
> > 2, disable SMU debug option
> > 
> >   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> > 
> > v2:
> >   - Resend command when timeout.(Lijo)
> >   - Use debugfs file instead of module parameter.
> > 
> > Signed-off-by: Lang Yu <lang.yu@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
> >   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++--
> >   2 files changed, 69 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > index 164d6a9e9fbb..f9412de86599 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > @@ -39,6 +39,8 @@
> >   #if defined(CONFIG_DEBUG_FS)
> > +extern int amdgpu_smu_debug;
> > +
> >   /**
> >    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
> >    *
> > @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf,
> >   	return result;
> >   }
> > +
> > +
> 
> Unrelated change.
Will remove it.

> >   static const struct file_operations amdgpu_debugfs_regs2_fops = {
> >   	.owner = THIS_MODULE,
> >   	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl,
> > @@ -1609,6 +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
> >   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
> >   			amdgpu_debugfs_sclk_set, "%llu\n");
> > +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val)
> > +{
> > +	*val = amdgpu_smu_debug;
> > +	return 0;
> > +}
> > +
> > +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val)
> > +{
> > +	if (val != 0 && val != 1)
> > +		return -EINVAL;
> > +
> > +	amdgpu_smu_debug = val;
> > +	return 0;
> > +}
> > +
> > +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
> > +			 amdgpu_debugfs_smu_debug_get,
> > +			 amdgpu_debugfs_smu_debug_set,
> > +			 "%llu\n");
> > +
> 
> That can be done much simpler. Take a look at the debugfs_create_bool()
> function.
Thanks for your advice. Will modify that.

> >   int amdgpu_debugfs_init(struct amdgpu_device *adev)
> >   {
> >   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> > @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
> >   		return PTR_ERR(ent);
> >   	}
> > +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
> > +				  &fops_smu_debug);
> > +	if (IS_ERR(ent)) {
> > +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n");
> > +		return PTR_ERR(ent);
> > +	}
> > +
> > +
> >   	/* Register debugfs entries for amdgpu_ttm */
> >   	amdgpu_ttm_debugfs_init(adev);
> >   	amdgpu_debugfs_pm_init(adev);
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > index 048ca1673863..b3969d7933d3 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > @@ -55,6 +55,14 @@
> >   #undef __SMU_DUMMY_MAP
> >   #define __SMU_DUMMY_MAP(type)	#type
> > +
> > +/*
> > + * Used to enable SMU debug option. When enabled, it makes SMU errors fatal.
> > + * This will aid in debugging SMU firmware issues.
> > + * (0 = disabled (default), 1 = enabled)
> > + */
> > +int amdgpu_smu_debug;
> 
> Probably better to put that into amdgpu_device or similar structure.
Ok. Thanks for your advice.

Regards,
Lang

> Regards,
> Christian.
> 
> > +
> >   static const char * const __smu_message_names[] = {
> >   	SMU_MESSAGE_TYPES
> >   };
> > @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> >   	__smu_cmn_send_msg(smu, msg_index, param);
> >   	res = 0;
> >   Out:
> > +	if (unlikely(amdgpu_smu_debug == 1) && res) {
> > +		mutex_unlock(&smu->message_lock);
> > +		BUG();
> > +	}
> > +
> >   	return res;
> >   }
> > @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> >   int smu_cmn_wait_for_response(struct smu_context *smu)
> >   {
> >   	u32 reg;
> > +	int res;
> >   	reg = __smu_cmn_poll_stat(smu);
> > -	return __smu_cmn_reg2errno(smu, reg);
> > +	res = __smu_cmn_reg2errno(smu, reg);
> > +
> > +	if (unlikely(amdgpu_smu_debug == 1) && res) {
> > +		mutex_unlock(&smu->message_lock);
> > +		BUG();
> > +	}
> > +
> > +	return res;
> >   }
> >   /**
> > @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> >   				    uint32_t param,
> >   				    uint32_t *read_arg)
> >   {
> > +	int retry_count = 0;
> >   	int res, index;
> >   	u32 reg;
> > @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> >   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> >   		goto Out;
> >   	}
> > +retry:
> >   	__smu_cmn_send_msg(smu, (uint16_t) index, param);
> >   	reg = __smu_cmn_poll_stat(smu);
> >   	res = __smu_cmn_reg2errno(smu, reg);
> > -	if (res != 0)
> > +	if (res != 0) {
> >   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> > +		if ((res == -ETIME) && (retry_count++ < 1)) {
> > +			usleep_range(500, 1000);
> > +			dev_err(smu->adev->dev,
> > +				"SMU: resend command: index:%d param:0x%08X message:%s",
> > +				index, param, smu_get_message_name(smu, msg));
> > +			goto retry;
> > +		}
> > +		goto Out;
> > +	}
> >   	if (read_arg)
> >   		smu_cmn_read_arg(smu, read_arg);
> >   Out:
> >   	mutex_unlock(&smu->message_lock);
> > +
> > +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
> > +
> >   	return res;
> >   }
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/amdgpu: add SMU debug option support
  2021-11-30  5:17 [PATCH] drm/amdgpu: add SMU debug option support Lang Yu
  2021-11-30 11:18 ` Christian König
@ 2021-12-01  5:14 ` Lazar, Lijo
  2021-12-01  5:44   ` Lazar, Lijo
  1 sibling, 1 reply; 12+ messages in thread
From: Lazar, Lijo @ 2021-12-01  5:14 UTC (permalink / raw)
  To: Lang Yu, amd-gfx; +Cc: Alex Deucher, Huang Rui, Christian Koenig



On 11/30/2021 10:47 AM, Lang Yu wrote:
> To maintain system error state when SMU errors occurred,
> which will aid in debugging SMU firmware issues,
> add SMU debug option support.
> 
> It can be enabled or disabled via amdgpu_smu_debug
> debugfs file. When enabled, it makes SMU errors fatal.
> It is disabled by default.
> 
> == Command Guide ==
> 
> 1, enable SMU debug option
> 
>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> 2, disable SMU debug option
> 
>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> v2:
>   - Resend command when timeout.(Lijo)
>   - Use debugfs file instead of module parameter.
> 
> Signed-off-by: Lang Yu <lang.yu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++--
>   2 files changed, 69 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..f9412de86599 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -39,6 +39,8 @@
>   
>   #if defined(CONFIG_DEBUG_FS)
>   
> +extern int amdgpu_smu_debug;
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf,
>   	return result;
>   }
>   
> +
> +
>   static const struct file_operations amdgpu_debugfs_regs2_fops = {
>   	.owner = THIS_MODULE,
>   	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl,
> @@ -1609,6 +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val)
> +{
> +	*val = amdgpu_smu_debug;
> +	return 0;
> +}
> +
> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val)
> +{
> +	if (val != 0 && val != 1)
> +		return -EINVAL;
> +
> +	amdgpu_smu_debug = val;
> +	return 0;
> +}
> +
> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
> +			 amdgpu_debugfs_smu_debug_get,
> +			 amdgpu_debugfs_smu_debug_set,
> +			 "%llu\n");
> +
>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   		return PTR_ERR(ent);
>   	}
>   
> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
> +				  &fops_smu_debug);
> +	if (IS_ERR(ent)) {
> +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n");
> +		return PTR_ERR(ent);
> +	}
> +
> +
>   	/* Register debugfs entries for amdgpu_ttm */
>   	amdgpu_ttm_debugfs_init(adev);
>   	amdgpu_debugfs_pm_init(adev);
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 048ca1673863..b3969d7933d3 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -55,6 +55,14 @@
>   
>   #undef __SMU_DUMMY_MAP
>   #define __SMU_DUMMY_MAP(type)	#type
> +
> +/*
> + * Used to enable SMU debug option. When enabled, it makes SMU errors fatal.
> + * This will aid in debugging SMU firmware issues.
> + * (0 = disabled (default), 1 = enabled)
> + */
> +int amdgpu_smu_debug;
> +
>   static const char * const __smu_message_names[] = {
>   	SMU_MESSAGE_TYPES
>   };
> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>   	__smu_cmn_send_msg(smu, msg_index, param);
>   	res = 0;
>   Out:
> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
> +		mutex_unlock(&smu->message_lock);
> +		BUG();
> +	}
> +
>   	return res;
>   }
>   
> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>   int smu_cmn_wait_for_response(struct smu_context *smu)
>   {
>   	u32 reg;
> +	int res;
>   
>   	reg = __smu_cmn_poll_stat(smu);
> -	return __smu_cmn_reg2errno(smu, reg);
> +	res = __smu_cmn_reg2errno(smu, reg);
> +
> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
> +		mutex_unlock(&smu->message_lock);
> +		BUG();
> +	}
> +
> +	return res;
>   }
>   
>   /**
> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   				    uint32_t param,
>   				    uint32_t *read_arg)
>   {
> +	int retry_count = 0;
>   	int res, index;
>   	u32 reg;
>   
> @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>   		goto Out;
>   	}
> +retry:
>   	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>   	reg = __smu_cmn_poll_stat(smu);
>   	res = __smu_cmn_reg2errno(smu, reg);
> -	if (res != 0)
> +	if (res != 0) {
>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> +		if ((res == -ETIME) && (retry_count++ < 1)) {
> +			usleep_range(500, 1000);
> +			dev_err(smu->adev->dev,
> +				"SMU: resend command: index:%d param:0x%08X message:%s",
> +				index, param, smu_get_message_name(smu, msg));
> +			goto retry;
> +		}
> +		goto Out;
> +	}

Sorry, what I meant is to have an extended wait time in debug mode. 
Something like below, not a 'full retry' as in sending the message again.


+#define MAX_DBG_WAIT_CNT 3
+
  /**
   * __smu_cmn_poll_stat -- poll for a status from the SMU
   * smu: a pointer to SMU context
@@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context *smu,
  static u32 __smu_cmn_poll_stat(struct smu_context *smu)
  {
         struct amdgpu_device *adev = smu->adev;
-       int timeout = adev->usec_timeout * 20;
+       int timeout;
         u32 reg;
+       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;

-       for ( ; timeout > 0; timeout--) {
-               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
-               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
-                       break;
+       do {
+               timeout = adev->usec_timeout * 20;
+               for (; timeout > 0; timeout--) {
+                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
+                               break;

-               udelay(1);
-       }
+                       udelay(1);
+               }
+       } while (extended_wait-- >= 0);

+       if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
+               dev_err(adev->dev,
+                       "SMU: Unexpected extended wait for response");
         return reg;
  }

Thanks,
Lijo

>   	if (read_arg)
>   		smu_cmn_read_arg(smu, read_arg);
>   Out:
>   	mutex_unlock(&smu->message_lock);
> +
> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
> +
>   	return res;
>   }
>   
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] drm/amdgpu: add SMU debug option support
  2021-12-01  5:14 ` Lazar, Lijo
@ 2021-12-01  5:44   ` Lazar, Lijo
  2021-12-01  6:27     ` Yu, Lang
  0 siblings, 1 reply; 12+ messages in thread
From: Lazar, Lijo @ 2021-12-01  5:44 UTC (permalink / raw)
  To: Lazar, Lijo, Yu, Lang, amd-gfx
  Cc: Deucher, Alexander, Huang, Ray, Koenig, Christian

Just realized that the patch I pasted won't work. Outer loop exit needs to be like this. 
	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >= 0

Anyway, that patch is only there to communicate what I really meant in the earlier comment.

Thanks,
Lijo

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lazar, Lijo
Sent: Wednesday, December 1, 2021 10:44 AM
To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support



On 11/30/2021 10:47 AM, Lang Yu wrote:
> To maintain system error state when SMU errors occurred, which will 
> aid in debugging SMU firmware issues, add SMU debug option support.
> 
> It can be enabled or disabled via amdgpu_smu_debug debugfs file. When 
> enabled, it makes SMU errors fatal.
> It is disabled by default.
> 
> == Command Guide ==
> 
> 1, enable SMU debug option
> 
>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> 2, disable SMU debug option
> 
>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> v2:
>   - Resend command when timeout.(Lijo)
>   - Use debugfs file instead of module parameter.
> 
> Signed-off-by: Lang Yu <lang.yu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++--
>   2 files changed, 69 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..f9412de86599 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -39,6 +39,8 @@
>   
>   #if defined(CONFIG_DEBUG_FS)
>   
> +extern int amdgpu_smu_debug;
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file *f, char __user *buf,
>   	return result;
>   }
>   
> +
> +
>   static const struct file_operations amdgpu_debugfs_regs2_fops = {
>   	.owner = THIS_MODULE,
>   	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26 
> @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
> +	*val = amdgpu_smu_debug;
> +	return 0;
> +}
> +
> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
> +	if (val != 0 && val != 1)
> +		return -EINVAL;
> +
> +	amdgpu_smu_debug = val;
> +	return 0;
> +}
> +
> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
> +			 amdgpu_debugfs_smu_debug_get,
> +			 amdgpu_debugfs_smu_debug_set,
> +			 "%llu\n");
> +
>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   		return PTR_ERR(ent);
>   	}
>   
> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
> +				  &fops_smu_debug);
> +	if (IS_ERR(ent)) {
> +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n");
> +		return PTR_ERR(ent);
> +	}
> +
> +
>   	/* Register debugfs entries for amdgpu_ttm */
>   	amdgpu_ttm_debugfs_init(adev);
>   	amdgpu_debugfs_pm_init(adev);
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 048ca1673863..b3969d7933d3 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -55,6 +55,14 @@
>   
>   #undef __SMU_DUMMY_MAP
>   #define __SMU_DUMMY_MAP(type)	#type
> +
> +/*
> + * Used to enable SMU debug option. When enabled, it makes SMU errors fatal.
> + * This will aid in debugging SMU firmware issues.
> + * (0 = disabled (default), 1 = enabled)  */ int amdgpu_smu_debug;
> +
>   static const char * const __smu_message_names[] = {
>   	SMU_MESSAGE_TYPES
>   };
> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>   	__smu_cmn_send_msg(smu, msg_index, param);
>   	res = 0;
>   Out:
> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
> +		mutex_unlock(&smu->message_lock);
> +		BUG();
> +	}
> +
>   	return res;
>   }
>   
> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>   int smu_cmn_wait_for_response(struct smu_context *smu)
>   {
>   	u32 reg;
> +	int res;
>   
>   	reg = __smu_cmn_poll_stat(smu);
> -	return __smu_cmn_reg2errno(smu, reg);
> +	res = __smu_cmn_reg2errno(smu, reg);
> +
> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
> +		mutex_unlock(&smu->message_lock);
> +		BUG();
> +	}
> +
> +	return res;
>   }
>   
>   /**
> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   				    uint32_t param,
>   				    uint32_t *read_arg)
>   {
> +	int retry_count = 0;
>   	int res, index;
>   	u32 reg;
>   
> @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>   		goto Out;
>   	}
> +retry:
>   	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>   	reg = __smu_cmn_poll_stat(smu);
>   	res = __smu_cmn_reg2errno(smu, reg);
> -	if (res != 0)
> +	if (res != 0) {
>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> +		if ((res == -ETIME) && (retry_count++ < 1)) {
> +			usleep_range(500, 1000);
> +			dev_err(smu->adev->dev,
> +				"SMU: resend command: index:%d param:0x%08X message:%s",
> +				index, param, smu_get_message_name(smu, msg));
> +			goto retry;
> +		}
> +		goto Out;
> +	}

Sorry, what I meant is to have an extended wait time in debug mode. 
Something like below, not a 'full retry' as in sending the message again.


+#define MAX_DBG_WAIT_CNT 3
+
  /**
   * __smu_cmn_poll_stat -- poll for a status from the SMU
   * smu: a pointer to SMU context
@@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context *smu,
  static u32 __smu_cmn_poll_stat(struct smu_context *smu)
  {
         struct amdgpu_device *adev = smu->adev;
-       int timeout = adev->usec_timeout * 20;
+       int timeout;
         u32 reg;
+       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;

-       for ( ; timeout > 0; timeout--) {
-               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
-               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
-                       break;
+       do {
+               timeout = adev->usec_timeout * 20;
+               for (; timeout > 0; timeout--) {
+                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
+                               break;

-               udelay(1);
-       }
+                       udelay(1);
+               }
+       } while (extended_wait-- >= 0);

+       if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
+               dev_err(adev->dev,
+                       "SMU: Unexpected extended wait for response");
         return reg;
  }

Thanks,
Lijo

>   	if (read_arg)
>   		smu_cmn_read_arg(smu, read_arg);
>   Out:
>   	mutex_unlock(&smu->message_lock);
> +
> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
> +
>   	return res;
>   }
>   
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] drm/amdgpu: add SMU debug option support
  2021-12-01  5:44   ` Lazar, Lijo
@ 2021-12-01  6:27     ` Yu, Lang
  2021-12-01  6:56       ` Lazar, Lijo
  0 siblings, 1 reply; 12+ messages in thread
From: Yu, Lang @ 2021-12-01  6:27 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray, Koenig, Christian

[AMD Official Use Only]

Hi Lijo,

Thanks for your comments.
 
From my understanding, that just increases the timeout threshold and
could hide some potential issues which should be exposed and solved.

If current timeout threshold is not enough for some corner cases,
(1) Do we consider to increase the threshold to cover these cases?
(2) Or do we just expose them and request SMU FW to optimize them?

I think it doesn't make much sense to increase the threshold in debug mode.
How do you think? Thanks!

Regards,
Lang

>-----Original Message-----
>From: Lazar, Lijo <Lijo.Lazar@amd.com>
>Sent: Wednesday, December 1, 2021 1:44 PM
>To: Lazar, Lijo <Lijo.Lazar@amd.com>; Yu, Lang <Lang.Yu@amd.com>; amd-
>gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>
>Just realized that the patch I pasted won't work. Outer loop exit needs to be like
>this.
>	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>0
>
>Anyway, that patch is only there to communicate what I really meant in the
>earlier comment.
>
>Thanks,
>Lijo
>
>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lazar,
>Lijo
>Sent: Wednesday, December 1, 2021 10:44 AM
>To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>
>
>
>On 11/30/2021 10:47 AM, Lang Yu wrote:
>> To maintain system error state when SMU errors occurred, which will
>> aid in debugging SMU firmware issues, add SMU debug option support.
>>
>> It can be enabled or disabled via amdgpu_smu_debug debugfs file. When
>> enabled, it makes SMU errors fatal.
>> It is disabled by default.
>>
>> == Command Guide ==
>>
>> 1, enable SMU debug option
>>
>>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>
>> 2, disable SMU debug option
>>
>>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>
>> v2:
>>   - Resend command when timeout.(Lijo)
>>   - Use debugfs file instead of module parameter.
>>
>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
>>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++-
>-
>>   2 files changed, 69 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 164d6a9e9fbb..f9412de86599 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -39,6 +39,8 @@
>>
>>   #if defined(CONFIG_DEBUG_FS)
>>
>> +extern int amdgpu_smu_debug;
>> +
>>   /**
>>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>>    *
>> @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct
>file *f, char __user *buf,
>>   	return result;
>>   }
>>
>> +
>> +
>>   static const struct file_operations amdgpu_debugfs_regs2_fops = {
>>   	.owner = THIS_MODULE,
>>   	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26
>> @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>   			amdgpu_debugfs_sclk_set, "%llu\n");
>>
>> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
>> +	*val = amdgpu_smu_debug;
>> +	return 0;
>> +}
>> +
>> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
>> +	if (val != 0 && val != 1)
>> +		return -EINVAL;
>> +
>> +	amdgpu_smu_debug = val;
>> +	return 0;
>> +}
>> +
>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
>> +			 amdgpu_debugfs_smu_debug_get,
>> +			 amdgpu_debugfs_smu_debug_set,
>> +			 "%llu\n");
>> +
>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>   {
>>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device
>*adev)
>>   		return PTR_ERR(ent);
>>   	}
>>
>> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
>> +				  &fops_smu_debug);
>> +	if (IS_ERR(ent)) {
>> +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs
>file\n");
>> +		return PTR_ERR(ent);
>> +	}
>> +
>> +
>>   	/* Register debugfs entries for amdgpu_ttm */
>>   	amdgpu_ttm_debugfs_init(adev);
>>   	amdgpu_debugfs_pm_init(adev);
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> index 048ca1673863..b3969d7933d3 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> @@ -55,6 +55,14 @@
>>
>>   #undef __SMU_DUMMY_MAP
>>   #define __SMU_DUMMY_MAP(type)	#type
>> +
>> +/*
>> + * Used to enable SMU debug option. When enabled, it makes SMU errors
>fatal.
>> + * This will aid in debugging SMU firmware issues.
>> + * (0 = disabled (default), 1 = enabled)  */ int amdgpu_smu_debug;
>> +
>>   static const char * const __smu_message_names[] = {
>>   	SMU_MESSAGE_TYPES
>>   };
>> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
>smu_context *smu,
>>   	__smu_cmn_send_msg(smu, msg_index, param);
>>   	res = 0;
>>   Out:
>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>> +		mutex_unlock(&smu->message_lock);
>> +		BUG();
>> +	}
>> +
>>   	return res;
>>   }
>>
>> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
>smu_context *smu,
>>   int smu_cmn_wait_for_response(struct smu_context *smu)
>>   {
>>   	u32 reg;
>> +	int res;
>>
>>   	reg = __smu_cmn_poll_stat(smu);
>> -	return __smu_cmn_reg2errno(smu, reg);
>> +	res = __smu_cmn_reg2errno(smu, reg);
>> +
>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>> +		mutex_unlock(&smu->message_lock);
>> +		BUG();
>> +	}
>> +
>> +	return res;
>>   }
>>
>>   /**
>> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
>smu_context *smu,
>>   				    uint32_t param,
>>   				    uint32_t *read_arg)
>>   {
>> +	int retry_count = 0;
>>   	int res, index;
>>   	u32 reg;
>>
>> @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct
>smu_context *smu,
>>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>   		goto Out;
>>   	}
>> +retry:
>>   	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>>   	reg = __smu_cmn_poll_stat(smu);
>>   	res = __smu_cmn_reg2errno(smu, reg);
>> -	if (res != 0)
>> +	if (res != 0) {
>>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>> +		if ((res == -ETIME) && (retry_count++ < 1)) {
>> +			usleep_range(500, 1000);
>> +			dev_err(smu->adev->dev,
>> +				"SMU: resend command: index:%d
>param:0x%08X message:%s",
>> +				index, param, smu_get_message_name(smu,
>msg));
>> +			goto retry;
>> +		}
>> +		goto Out;
>> +	}
>
>Sorry, what I meant is to have an extended wait time in debug mode.
>Something like below, not a 'full retry' as in sending the message again.
>
>
>+#define MAX_DBG_WAIT_CNT 3
>+
>  /**
>   * __smu_cmn_poll_stat -- poll for a status from the SMU
>   * smu: a pointer to SMU context
>@@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context
>*smu,
>  static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>  {
>         struct amdgpu_device *adev = smu->adev;
>-       int timeout = adev->usec_timeout * 20;
>+       int timeout;
>         u32 reg;
>+       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;
>
>-       for ( ; timeout > 0; timeout--) {
>-               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>-               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>-                       break;
>+       do {
>+               timeout = adev->usec_timeout * 20;
>+               for (; timeout > 0; timeout--) {
>+                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>+                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>+                               break;
>
>-               udelay(1);
>-       }
>+                       udelay(1);
>+               }
>+       } while (extended_wait-- >= 0);
>
>+       if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
>+               dev_err(adev->dev,
>+                       "SMU: Unexpected extended wait for response");
>         return reg;
>  }
>
>Thanks,
>Lijo
>
>>   	if (read_arg)
>>   		smu_cmn_read_arg(smu, read_arg);
>>   Out:
>>   	mutex_unlock(&smu->message_lock);
>> +
>> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
>> +
>>   	return res;
>>   }
>>
>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/amdgpu: add SMU debug option support
  2021-12-01  6:27     ` Yu, Lang
@ 2021-12-01  6:56       ` Lazar, Lijo
  2021-12-01  7:07         ` Yu, Lang
  0 siblings, 1 reply; 12+ messages in thread
From: Lazar, Lijo @ 2021-12-01  6:56 UTC (permalink / raw)
  To: Yu, Lang, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray, Koenig, Christian



On 12/1/2021 11:57 AM, Yu, Lang wrote:
> [AMD Official Use Only]
> 
> Hi Lijo,
> 
> Thanks for your comments.
>   
>  From my understanding, that just increases the timeout threshold and
> could hide some potential issues which should be exposed and solved.
> 
> If current timeout threshold is not enough for some corner cases,
> (1) Do we consider to increase the threshold to cover these cases?
> (2) Or do we just expose them and request SMU FW to optimize them?
> 
> I think it doesn't make much sense to increase the threshold in debug mode.
> How do you think? Thanks!

In normal cases, 2secs would be more than enough. If we hang 
immediately, then check the FW registers later, the response would have 
come. I thought we just need to note those cases and not to fail 
everytime. Just to mark as a red flag in the log to tell us that FW is 
unexpectedly busy processing something else when the message is sent.

There are some issues related to S0ix where we see the FW comes back 
with a response with an increased timeout under certain conditions.

Thanks,
Lijo

> 
> Regards,
> Lang
> 
>> -----Original Message-----
>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>> Sent: Wednesday, December 1, 2021 1:44 PM
>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Yu, Lang <Lang.Yu@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>
>> Just realized that the patch I pasted won't work. Outer loop exit needs to be like
>> this.
>> 	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>> 0
>>
>> Anyway, that patch is only there to communicate what I really meant in the
>> earlier comment.
>>
>> Thanks,
>> Lijo
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lazar,
>> Lijo
>> Sent: Wednesday, December 1, 2021 10:44 AM
>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>
>>
>>
>> On 11/30/2021 10:47 AM, Lang Yu wrote:
>>> To maintain system error state when SMU errors occurred, which will
>>> aid in debugging SMU firmware issues, add SMU debug option support.
>>>
>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file. When
>>> enabled, it makes SMU errors fatal.
>>> It is disabled by default.
>>>
>>> == Command Guide ==
>>>
>>> 1, enable SMU debug option
>>>
>>>    # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>
>>> 2, disable SMU debug option
>>>
>>>    # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>
>>> v2:
>>>    - Resend command when timeout.(Lijo)
>>>    - Use debugfs file instead of module parameter.
>>>
>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
>>>    drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++-
>> -
>>>    2 files changed, 69 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 164d6a9e9fbb..f9412de86599 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -39,6 +39,8 @@
>>>
>>>    #if defined(CONFIG_DEBUG_FS)
>>>
>>> +extern int amdgpu_smu_debug;
>>> +
>>>    /**
>>>     * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>>>     *
>>> @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct
>> file *f, char __user *buf,
>>>    	return result;
>>>    }
>>>
>>> +
>>> +
>>>    static const struct file_operations amdgpu_debugfs_regs2_fops = {
>>>    	.owner = THIS_MODULE,
>>>    	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26
>>> @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>    DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>    			amdgpu_debugfs_sclk_set, "%llu\n");
>>>
>>> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
>>> +	*val = amdgpu_smu_debug;
>>> +	return 0;
>>> +}
>>> +
>>> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
>>> +	if (val != 0 && val != 1)
>>> +		return -EINVAL;
>>> +
>>> +	amdgpu_smu_debug = val;
>>> +	return 0;
>>> +}
>>> +
>>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
>>> +			 amdgpu_debugfs_smu_debug_get,
>>> +			 amdgpu_debugfs_smu_debug_set,
>>> +			 "%llu\n");
>>> +
>>>    int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>    {
>>>    	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device
>> *adev)
>>>    		return PTR_ERR(ent);
>>>    	}
>>>
>>> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
>>> +				  &fops_smu_debug);
>>> +	if (IS_ERR(ent)) {
>>> +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs
>> file\n");
>>> +		return PTR_ERR(ent);
>>> +	}
>>> +
>>> +
>>>    	/* Register debugfs entries for amdgpu_ttm */
>>>    	amdgpu_ttm_debugfs_init(adev);
>>>    	amdgpu_debugfs_pm_init(adev);
>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>> index 048ca1673863..b3969d7933d3 100644
>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>> @@ -55,6 +55,14 @@
>>>
>>>    #undef __SMU_DUMMY_MAP
>>>    #define __SMU_DUMMY_MAP(type)	#type
>>> +
>>> +/*
>>> + * Used to enable SMU debug option. When enabled, it makes SMU errors
>> fatal.
>>> + * This will aid in debugging SMU firmware issues.
>>> + * (0 = disabled (default), 1 = enabled)  */ int amdgpu_smu_debug;
>>> +
>>>    static const char * const __smu_message_names[] = {
>>>    	SMU_MESSAGE_TYPES
>>>    };
>>> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
>> smu_context *smu,
>>>    	__smu_cmn_send_msg(smu, msg_index, param);
>>>    	res = 0;
>>>    Out:
>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>> +		mutex_unlock(&smu->message_lock);
>>> +		BUG();
>>> +	}
>>> +
>>>    	return res;
>>>    }
>>>
>>> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
>> smu_context *smu,
>>>    int smu_cmn_wait_for_response(struct smu_context *smu)
>>>    {
>>>    	u32 reg;
>>> +	int res;
>>>
>>>    	reg = __smu_cmn_poll_stat(smu);
>>> -	return __smu_cmn_reg2errno(smu, reg);
>>> +	res = __smu_cmn_reg2errno(smu, reg);
>>> +
>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>> +		mutex_unlock(&smu->message_lock);
>>> +		BUG();
>>> +	}
>>> +
>>> +	return res;
>>>    }
>>>
>>>    /**
>>> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
>> smu_context *smu,
>>>    				    uint32_t param,
>>>    				    uint32_t *read_arg)
>>>    {
>>> +	int retry_count = 0;
>>>    	int res, index;
>>>    	u32 reg;
>>>
>>> @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct
>> smu_context *smu,
>>>    		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>>    		goto Out;
>>>    	}
>>> +retry:
>>>    	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>>>    	reg = __smu_cmn_poll_stat(smu);
>>>    	res = __smu_cmn_reg2errno(smu, reg);
>>> -	if (res != 0)
>>> +	if (res != 0) {
>>>    		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>> +		if ((res == -ETIME) && (retry_count++ < 1)) {
>>> +			usleep_range(500, 1000);
>>> +			dev_err(smu->adev->dev,
>>> +				"SMU: resend command: index:%d
>> param:0x%08X message:%s",
>>> +				index, param, smu_get_message_name(smu,
>> msg));
>>> +			goto retry;
>>> +		}
>>> +		goto Out;
>>> +	}
>>
>> Sorry, what I meant is to have an extended wait time in debug mode.
>> Something like below, not a 'full retry' as in sending the message again.
>>
>>
>> +#define MAX_DBG_WAIT_CNT 3
>> +
>>   /**
>>    * __smu_cmn_poll_stat -- poll for a status from the SMU
>>    * smu: a pointer to SMU context
>> @@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context
>> *smu,
>>   static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>   {
>>          struct amdgpu_device *adev = smu->adev;
>> -       int timeout = adev->usec_timeout * 20;
>> +       int timeout;
>>          u32 reg;
>> +       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;
>>
>> -       for ( ; timeout > 0; timeout--) {
>> -               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>> -               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>> -                       break;
>> +       do {
>> +               timeout = adev->usec_timeout * 20;
>> +               for (; timeout > 0; timeout--) {
>> +                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>> +                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>> +                               break;
>>
>> -               udelay(1);
>> -       }
>> +                       udelay(1);
>> +               }
>> +       } while (extended_wait-- >= 0);
>>
>> +       if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
>> +               dev_err(adev->dev,
>> +                       "SMU: Unexpected extended wait for response");
>>          return reg;
>>   }
>>
>> Thanks,
>> Lijo
>>
>>>    	if (read_arg)
>>>    		smu_cmn_read_arg(smu, read_arg);
>>>    Out:
>>>    	mutex_unlock(&smu->message_lock);
>>> +
>>> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
>>> +
>>>    	return res;
>>>    }
>>>
>>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] drm/amdgpu: add SMU debug option support
  2021-12-01  6:56       ` Lazar, Lijo
@ 2021-12-01  7:07         ` Yu, Lang
  2021-12-01  7:28           ` Lazar, Lijo
  0 siblings, 1 reply; 12+ messages in thread
From: Yu, Lang @ 2021-12-01  7:07 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray, Koenig, Christian

[AMD Official Use Only]



>-----Original Message-----
>From: Lazar, Lijo <Lijo.Lazar@amd.com>
>Sent: Wednesday, December 1, 2021 2:56 PM
>To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>
>
>
>On 12/1/2021 11:57 AM, Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>> Hi Lijo,
>>
>> Thanks for your comments.
>>
>>  From my understanding, that just increases the timeout threshold and
>> could hide some potential issues which should be exposed and solved.
>>
>> If current timeout threshold is not enough for some corner cases,
>> (1) Do we consider to increase the threshold to cover these cases?
>> (2) Or do we just expose them and request SMU FW to optimize them?
>>
>> I think it doesn't make much sense to increase the threshold in debug mode.
>> How do you think? Thanks!
>
>In normal cases, 2secs would be more than enough. If we hang immediately, then
>check the FW registers later, the response would have come. I thought we just
>need to note those cases and not to fail everytime. Just to mark as a red flag in
>the log to tell us that FW is unexpectedly busy processing something else when
>the message is sent.
>
>There are some issues related to S0ix where we see the FW comes back with a
>response with an increased timeout under certain conditions.

If these issues still exists, could we just blacklist the tests that triggered them 
before solve them? Or we just increase the threshold to cover all the cases?

Regards,
Lang

>
>Thanks,
>Lijo
>
>>
>> Regards,
>> Lang
>>
>>> -----Original Message-----
>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>> Sent: Wednesday, December 1, 2021 1:44 PM
>>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Yu, Lang <Lang.Yu@amd.com>;
>>> amd- gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>>
>>> Just realized that the patch I pasted won't work. Outer loop exit
>>> needs to be like this.
>>> 	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>>> 0
>>>
>>> Anyway, that patch is only there to communicate what I really meant
>>> in the earlier comment.
>>>
>>> Thanks,
>>> Lijo
>>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Lazar, Lijo
>>> Sent: Wednesday, December 1, 2021 10:44 AM
>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>
>>>
>>>
>>> On 11/30/2021 10:47 AM, Lang Yu wrote:
>>>> To maintain system error state when SMU errors occurred, which will
>>>> aid in debugging SMU firmware issues, add SMU debug option support.
>>>>
>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>> When enabled, it makes SMU errors fatal.
>>>> It is disabled by default.
>>>>
>>>> == Command Guide ==
>>>>
>>>> 1, enable SMU debug option
>>>>
>>>>    # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>
>>>> 2, disable SMU debug option
>>>>
>>>>    # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>
>>>> v2:
>>>>    - Resend command when timeout.(Lijo)
>>>>    - Use debugfs file instead of module parameter.
>>>>
>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32
>+++++++++++++++++
>>>>    drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39
>+++++++++++++++++++-
>>> -
>>>>    2 files changed, 69 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index 164d6a9e9fbb..f9412de86599 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -39,6 +39,8 @@
>>>>
>>>>    #if defined(CONFIG_DEBUG_FS)
>>>>
>>>> +extern int amdgpu_smu_debug;
>>>> +
>>>>    /**
>>>>     * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>>>>     *
>>>> @@ -1152,6 +1154,8 @@ static ssize_t
>>>> amdgpu_debugfs_gfxoff_read(struct
>>> file *f, char __user *buf,
>>>>    	return result;
>>>>    }
>>>>
>>>> +
>>>> +
>>>>    static const struct file_operations amdgpu_debugfs_regs2_fops = {
>>>>    	.owner = THIS_MODULE,
>>>>    	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6
>>>> +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>    DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>    			amdgpu_debugfs_sclk_set, "%llu\n");
>>>>
>>>> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
>>>> +	*val = amdgpu_smu_debug;
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
>>>> +	if (val != 0 && val != 1)
>>>> +		return -EINVAL;
>>>> +
>>>> +	amdgpu_smu_debug = val;
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
>>>> +			 amdgpu_debugfs_smu_debug_get,
>>>> +			 amdgpu_debugfs_smu_debug_set,
>>>> +			 "%llu\n");
>>>> +
>>>>    int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>    {
>>>>    	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device
>>> *adev)
>>>>    		return PTR_ERR(ent);
>>>>    	}
>>>>
>>>> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
>>>> +				  &fops_smu_debug);
>>>> +	if (IS_ERR(ent)) {
>>>> +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs
>>> file\n");
>>>> +		return PTR_ERR(ent);
>>>> +	}
>>>> +
>>>> +
>>>>    	/* Register debugfs entries for amdgpu_ttm */
>>>>    	amdgpu_ttm_debugfs_init(adev);
>>>>    	amdgpu_debugfs_pm_init(adev);
>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>> index 048ca1673863..b3969d7933d3 100644
>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>> @@ -55,6 +55,14 @@
>>>>
>>>>    #undef __SMU_DUMMY_MAP
>>>>    #define __SMU_DUMMY_MAP(type)	#type
>>>> +
>>>> +/*
>>>> + * Used to enable SMU debug option. When enabled, it makes SMU
>>>> +errors
>>> fatal.
>>>> + * This will aid in debugging SMU firmware issues.
>>>> + * (0 = disabled (default), 1 = enabled)  */ int amdgpu_smu_debug;
>>>> +
>>>>    static const char * const __smu_message_names[] = {
>>>>    	SMU_MESSAGE_TYPES
>>>>    };
>>>> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
>>> smu_context *smu,
>>>>    	__smu_cmn_send_msg(smu, msg_index, param);
>>>>    	res = 0;
>>>>    Out:
>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>> +		mutex_unlock(&smu->message_lock);
>>>> +		BUG();
>>>> +	}
>>>> +
>>>>    	return res;
>>>>    }
>>>>
>>>> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
>>> smu_context *smu,
>>>>    int smu_cmn_wait_for_response(struct smu_context *smu)
>>>>    {
>>>>    	u32 reg;
>>>> +	int res;
>>>>
>>>>    	reg = __smu_cmn_poll_stat(smu);
>>>> -	return __smu_cmn_reg2errno(smu, reg);
>>>> +	res = __smu_cmn_reg2errno(smu, reg);
>>>> +
>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>> +		mutex_unlock(&smu->message_lock);
>>>> +		BUG();
>>>> +	}
>>>> +
>>>> +	return res;
>>>>    }
>>>>
>>>>    /**
>>>> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
>>> smu_context *smu,
>>>>    				    uint32_t param,
>>>>    				    uint32_t *read_arg)
>>>>    {
>>>> +	int retry_count = 0;
>>>>    	int res, index;
>>>>    	u32 reg;
>>>>
>>>> @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct
>>> smu_context *smu,
>>>>    		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>>>    		goto Out;
>>>>    	}
>>>> +retry:
>>>>    	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>>>>    	reg = __smu_cmn_poll_stat(smu);
>>>>    	res = __smu_cmn_reg2errno(smu, reg);
>>>> -	if (res != 0)
>>>> +	if (res != 0) {
>>>>    		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>>> +		if ((res == -ETIME) && (retry_count++ < 1)) {
>>>> +			usleep_range(500, 1000);
>>>> +			dev_err(smu->adev->dev,
>>>> +				"SMU: resend command: index:%d
>>> param:0x%08X message:%s",
>>>> +				index, param, smu_get_message_name(smu,
>>> msg));
>>>> +			goto retry;
>>>> +		}
>>>> +		goto Out;
>>>> +	}
>>>
>>> Sorry, what I meant is to have an extended wait time in debug mode.
>>> Something like below, not a 'full retry' as in sending the message again.
>>>
>>>
>>> +#define MAX_DBG_WAIT_CNT 3
>>> +
>>>   /**
>>>    * __smu_cmn_poll_stat -- poll for a status from the SMU
>>>    * smu: a pointer to SMU context
>>> @@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context
>>> *smu,
>>>   static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>>   {
>>>          struct amdgpu_device *adev = smu->adev;
>>> -       int timeout = adev->usec_timeout * 20;
>>> +       int timeout;
>>>          u32 reg;
>>> +       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;
>>>
>>> -       for ( ; timeout > 0; timeout--) {
>>> -               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>> -               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>> -                       break;
>>> +       do {
>>> +               timeout = adev->usec_timeout * 20;
>>> +               for (; timeout > 0; timeout--) {
>>> +                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>> +                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>> +                               break;
>>>
>>> -               udelay(1);
>>> -       }
>>> +                       udelay(1);
>>> +               }
>>> +       } while (extended_wait-- >= 0);
>>>
>>> +       if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
>>> +               dev_err(adev->dev,
>>> +                       "SMU: Unexpected extended wait for
>>> + response");
>>>          return reg;
>>>   }
>>>
>>> Thanks,
>>> Lijo
>>>
>>>>    	if (read_arg)
>>>>    		smu_cmn_read_arg(smu, read_arg);
>>>>    Out:
>>>>    	mutex_unlock(&smu->message_lock);
>>>> +
>>>> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
>>>> +
>>>>    	return res;
>>>>    }
>>>>
>>>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/amdgpu: add SMU debug option support
  2021-12-01  7:07         ` Yu, Lang
@ 2021-12-01  7:28           ` Lazar, Lijo
  2021-12-01  7:57             ` Yu, Lang
  0 siblings, 1 reply; 12+ messages in thread
From: Lazar, Lijo @ 2021-12-01  7:28 UTC (permalink / raw)
  To: Yu, Lang, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray, Koenig, Christian



On 12/1/2021 12:37 PM, Yu, Lang wrote:
> [AMD Official Use Only]
> 
> 
> 
>> -----Original Message-----
>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>> Sent: Wednesday, December 1, 2021 2:56 PM
>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>
>>
>>
>> On 12/1/2021 11:57 AM, Yu, Lang wrote:
>>> [AMD Official Use Only]
>>>
>>> Hi Lijo,
>>>
>>> Thanks for your comments.
>>>
>>>   From my understanding, that just increases the timeout threshold and
>>> could hide some potential issues which should be exposed and solved.
>>>
>>> If current timeout threshold is not enough for some corner cases,
>>> (1) Do we consider to increase the threshold to cover these cases?
>>> (2) Or do we just expose them and request SMU FW to optimize them?
>>>
>>> I think it doesn't make much sense to increase the threshold in debug mode.
>>> How do you think? Thanks!
>>
>> In normal cases, 2secs would be more than enough. If we hang immediately, then
>> check the FW registers later, the response would have come. I thought we just
>> need to note those cases and not to fail everytime. Just to mark as a red flag in
>> the log to tell us that FW is unexpectedly busy processing something else when
>> the message is sent.
>>
>> There are some issues related to S0ix where we see the FW comes back with a
>> response with an increased timeout under certain conditions.
> 
> If these issues still exists, could we just blacklist the tests that triggered them
> before solve them? Or we just increase the threshold to cover all the cases?
> 

Actually, the timeout is message specific - like i2c transfer from 
EEPROM could take longer time.

I am not sure if we should have more than 2s as timeout. Whenever this 
kind of issue happens, FW team check registers (then it will have a 
proper value) and say they don't see anything abnormal :) Usually, those 
are just signs of crack and it eventually breaks.

Option is just fail immediately (then again not sure useful it will be 
if the issue is this sort of thing) or wait to see how far it goes with 
an added timeout before it fails eventually.

Thanks,
Lijo

> Regards,
> Lang
> 
>>
>> Thanks,
>> Lijo
>>
>>>
>>> Regards,
>>> Lang
>>>
>>>> -----Original Message-----
>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>> Sent: Wednesday, December 1, 2021 1:44 PM
>>>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Yu, Lang <Lang.Yu@amd.com>;
>>>> amd- gfx@lists.freedesktop.org
>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>>>
>>>> Just realized that the patch I pasted won't work. Outer loop exit
>>>> needs to be like this.
>>>> 	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>>>> 0
>>>>
>>>> Anyway, that patch is only there to communicate what I really meant
>>>> in the earlier comment.
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>> -----Original Message-----
>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>> Lazar, Lijo
>>>> Sent: Wednesday, December 1, 2021 10:44 AM
>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>>
>>>>
>>>>
>>>> On 11/30/2021 10:47 AM, Lang Yu wrote:
>>>>> To maintain system error state when SMU errors occurred, which will
>>>>> aid in debugging SMU firmware issues, add SMU debug option support.
>>>>>
>>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>>> When enabled, it makes SMU errors fatal.
>>>>> It is disabled by default.
>>>>>
>>>>> == Command Guide ==
>>>>>
>>>>> 1, enable SMU debug option
>>>>>
>>>>>     # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>
>>>>> 2, disable SMU debug option
>>>>>
>>>>>     # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>
>>>>> v2:
>>>>>     - Resend command when timeout.(Lijo)
>>>>>     - Use debugfs file instead of module parameter.
>>>>>
>>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32
>> +++++++++++++++++
>>>>>     drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39
>> +++++++++++++++++++-
>>>> -
>>>>>     2 files changed, 69 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> index 164d6a9e9fbb..f9412de86599 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> @@ -39,6 +39,8 @@
>>>>>
>>>>>     #if defined(CONFIG_DEBUG_FS)
>>>>>
>>>>> +extern int amdgpu_smu_debug;
>>>>> +
>>>>>     /**
>>>>>      * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>>>>>      *
>>>>> @@ -1152,6 +1154,8 @@ static ssize_t
>>>>> amdgpu_debugfs_gfxoff_read(struct
>>>> file *f, char __user *buf,
>>>>>     	return result;
>>>>>     }
>>>>>
>>>>> +
>>>>> +
>>>>>     static const struct file_operations amdgpu_debugfs_regs2_fops = {
>>>>>     	.owner = THIS_MODULE,
>>>>>     	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6
>>>>> +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>>     DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>     			amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>
>>>>> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
>>>>> +	*val = amdgpu_smu_debug;
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
>>>>> +	if (val != 0 && val != 1)
>>>>> +		return -EINVAL;
>>>>> +
>>>>> +	amdgpu_smu_debug = val;
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
>>>>> +			 amdgpu_debugfs_smu_debug_get,
>>>>> +			 amdgpu_debugfs_smu_debug_set,
>>>>> +			 "%llu\n");
>>>>> +
>>>>>     int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>     {
>>>>>     	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>>> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device
>>>> *adev)
>>>>>     		return PTR_ERR(ent);
>>>>>     	}
>>>>>
>>>>> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
>>>>> +				  &fops_smu_debug);
>>>>> +	if (IS_ERR(ent)) {
>>>>> +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs
>>>> file\n");
>>>>> +		return PTR_ERR(ent);
>>>>> +	}
>>>>> +
>>>>> +
>>>>>     	/* Register debugfs entries for amdgpu_ttm */
>>>>>     	amdgpu_ttm_debugfs_init(adev);
>>>>>     	amdgpu_debugfs_pm_init(adev);
>>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> index 048ca1673863..b3969d7933d3 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> @@ -55,6 +55,14 @@
>>>>>
>>>>>     #undef __SMU_DUMMY_MAP
>>>>>     #define __SMU_DUMMY_MAP(type)	#type
>>>>> +
>>>>> +/*
>>>>> + * Used to enable SMU debug option. When enabled, it makes SMU
>>>>> +errors
>>>> fatal.
>>>>> + * This will aid in debugging SMU firmware issues.
>>>>> + * (0 = disabled (default), 1 = enabled)  */ int amdgpu_smu_debug;
>>>>> +
>>>>>     static const char * const __smu_message_names[] = {
>>>>>     	SMU_MESSAGE_TYPES
>>>>>     };
>>>>> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
>>>> smu_context *smu,
>>>>>     	__smu_cmn_send_msg(smu, msg_index, param);
>>>>>     	res = 0;
>>>>>     Out:
>>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>>> +		mutex_unlock(&smu->message_lock);
>>>>> +		BUG();
>>>>> +	}
>>>>> +
>>>>>     	return res;
>>>>>     }
>>>>>
>>>>> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
>>>> smu_context *smu,
>>>>>     int smu_cmn_wait_for_response(struct smu_context *smu)
>>>>>     {
>>>>>     	u32 reg;
>>>>> +	int res;
>>>>>
>>>>>     	reg = __smu_cmn_poll_stat(smu);
>>>>> -	return __smu_cmn_reg2errno(smu, reg);
>>>>> +	res = __smu_cmn_reg2errno(smu, reg);
>>>>> +
>>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>>> +		mutex_unlock(&smu->message_lock);
>>>>> +		BUG();
>>>>> +	}
>>>>> +
>>>>> +	return res;
>>>>>     }
>>>>>
>>>>>     /**
>>>>> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
>>>> smu_context *smu,
>>>>>     				    uint32_t param,
>>>>>     				    uint32_t *read_arg)
>>>>>     {
>>>>> +	int retry_count = 0;
>>>>>     	int res, index;
>>>>>     	u32 reg;
>>>>>
>>>>> @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct
>>>> smu_context *smu,
>>>>>     		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>>>>     		goto Out;
>>>>>     	}
>>>>> +retry:
>>>>>     	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>>>>>     	reg = __smu_cmn_poll_stat(smu);
>>>>>     	res = __smu_cmn_reg2errno(smu, reg);
>>>>> -	if (res != 0)
>>>>> +	if (res != 0) {
>>>>>     		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>>>> +		if ((res == -ETIME) && (retry_count++ < 1)) {
>>>>> +			usleep_range(500, 1000);
>>>>> +			dev_err(smu->adev->dev,
>>>>> +				"SMU: resend command: index:%d
>>>> param:0x%08X message:%s",
>>>>> +				index, param, smu_get_message_name(smu,
>>>> msg));
>>>>> +			goto retry;
>>>>> +		}
>>>>> +		goto Out;
>>>>> +	}
>>>>
>>>> Sorry, what I meant is to have an extended wait time in debug mode.
>>>> Something like below, not a 'full retry' as in sending the message again.
>>>>
>>>>
>>>> +#define MAX_DBG_WAIT_CNT 3
>>>> +
>>>>    /**
>>>>     * __smu_cmn_poll_stat -- poll for a status from the SMU
>>>>     * smu: a pointer to SMU context
>>>> @@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context
>>>> *smu,
>>>>    static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>>>    {
>>>>           struct amdgpu_device *adev = smu->adev;
>>>> -       int timeout = adev->usec_timeout * 20;
>>>> +       int timeout;
>>>>           u32 reg;
>>>> +       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;
>>>>
>>>> -       for ( ; timeout > 0; timeout--) {
>>>> -               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>> -               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>> -                       break;
>>>> +       do {
>>>> +               timeout = adev->usec_timeout * 20;
>>>> +               for (; timeout > 0; timeout--) {
>>>> +                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>> +                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>> +                               break;
>>>>
>>>> -               udelay(1);
>>>> -       }
>>>> +                       udelay(1);
>>>> +               }
>>>> +       } while (extended_wait-- >= 0);
>>>>
>>>> +       if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
>>>> +               dev_err(adev->dev,
>>>> +                       "SMU: Unexpected extended wait for
>>>> + response");
>>>>           return reg;
>>>>    }
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>     	if (read_arg)
>>>>>     		smu_cmn_read_arg(smu, read_arg);
>>>>>     Out:
>>>>>     	mutex_unlock(&smu->message_lock);
>>>>> +
>>>>> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
>>>>> +
>>>>>     	return res;
>>>>>     }
>>>>>
>>>>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] drm/amdgpu: add SMU debug option support
  2021-12-01  7:28           ` Lazar, Lijo
@ 2021-12-01  7:57             ` Yu, Lang
  2021-12-01  8:18               ` Yu, Lang
  0 siblings, 1 reply; 12+ messages in thread
From: Yu, Lang @ 2021-12-01  7:57 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray, Koenig, Christian

[AMD Official Use Only]



>-----Original Message-----
>From: Lazar, Lijo <Lijo.Lazar@amd.com>
>Sent: Wednesday, December 1, 2021 3:28 PM
>To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>
>
>
>On 12/1/2021 12:37 PM, Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>>
>>
>>> -----Original Message-----
>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>> Sent: Wednesday, December 1, 2021 2:56 PM
>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>
>>>
>>>
>>> On 12/1/2021 11:57 AM, Yu, Lang wrote:
>>>> [AMD Official Use Only]
>>>>
>>>> Hi Lijo,
>>>>
>>>> Thanks for your comments.
>>>>
>>>>   From my understanding, that just increases the timeout threshold
>>>> and could hide some potential issues which should be exposed and solved.
>>>>
>>>> If current timeout threshold is not enough for some corner cases,
>>>> (1) Do we consider to increase the threshold to cover these cases?
>>>> (2) Or do we just expose them and request SMU FW to optimize them?
>>>>
>>>> I think it doesn't make much sense to increase the threshold in debug mode.
>>>> How do you think? Thanks!
>>>
>>> In normal cases, 2secs would be more than enough. If we hang
>>> immediately, then check the FW registers later, the response would
>>> have come. I thought we just need to note those cases and not to fail
>>> everytime. Just to mark as a red flag in the log to tell us that FW
>>> is unexpectedly busy processing something else when the message is sent.
>>>
>>> There are some issues related to S0ix where we see the FW comes back
>>> with a response with an increased timeout under certain conditions.
>>
>> If these issues still exists, could we just blacklist the tests that
>> triggered them before solve them? Or we just increase the threshold to cover
>all the cases?
>>
>
>Actually, the timeout is message specific - like i2c transfer from EEPROM could
>take longer time.
>
>I am not sure if we should have more than 2s as timeout. Whenever this kind of
>issue happens, FW team check registers (then it will have a proper value) and say
>they don't see anything abnormal :) Usually, those are just signs of crack and it
>eventually breaks.
>
>Option is just fail immediately (then again not sure useful it will be if the issue is
>this sort of thing) or wait to see how far it goes with an added timeout before it
>fails eventually.

Does smu_cmn_wait_for_response()/smu_cmn_send_msg_without_waiting() are 
designed for long timeout cases? Is it fine that we don't fail here in the event of timeout?

Thanks,
Lang 

>
>Thanks,
>Lijo
>
>> Regards,
>> Lang
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>>
>>>> Regards,
>>>> Lang
>>>>
>>>>> -----Original Message-----
>>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>>> Sent: Wednesday, December 1, 2021 1:44 PM
>>>>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Yu, Lang <Lang.Yu@amd.com>;
>>>>> amd- gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>
>>>>> Just realized that the patch I pasted won't work. Outer loop exit
>>>>> needs to be like this.
>>>>> 	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>>>>> 0
>>>>>
>>>>> Anyway, that patch is only there to communicate what I really meant
>>>>> in the earlier comment.
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>> -----Original Message-----
>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>>> Lazar, Lijo
>>>>> Sent: Wednesday, December 1, 2021 10:44 AM
>>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>
>>>>>
>>>>>
>>>>> On 11/30/2021 10:47 AM, Lang Yu wrote:
>>>>>> To maintain system error state when SMU errors occurred, which
>>>>>> will aid in debugging SMU firmware issues, add SMU debug option support.
>>>>>>
>>>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>>>> When enabled, it makes SMU errors fatal.
>>>>>> It is disabled by default.
>>>>>>
>>>>>> == Command Guide ==
>>>>>>
>>>>>> 1, enable SMU debug option
>>>>>>
>>>>>>     # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>>
>>>>>> 2, disable SMU debug option
>>>>>>
>>>>>>     # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>>
>>>>>> v2:
>>>>>>     - Resend command when timeout.(Lijo)
>>>>>>     - Use debugfs file instead of module parameter.
>>>>>>
>>>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32
>>> +++++++++++++++++
>>>>>>     drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39
>>> +++++++++++++++++++-
>>>>> -
>>>>>>     2 files changed, 69 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> index 164d6a9e9fbb..f9412de86599 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> @@ -39,6 +39,8 @@
>>>>>>
>>>>>>     #if defined(CONFIG_DEBUG_FS)
>>>>>>
>>>>>> +extern int amdgpu_smu_debug;
>>>>>> +
>>>>>>     /**
>>>>>>      * amdgpu_debugfs_process_reg_op - Handle MMIO register
>reads/writes
>>>>>>      *
>>>>>> @@ -1152,6 +1154,8 @@ static ssize_t
>>>>>> amdgpu_debugfs_gfxoff_read(struct
>>>>> file *f, char __user *buf,
>>>>>>     	return result;
>>>>>>     }
>>>>>>
>>>>>> +
>>>>>> +
>>>>>>     static const struct file_operations amdgpu_debugfs_regs2_fops = {
>>>>>>     	.owner = THIS_MODULE,
>>>>>>     	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6
>>>>>> +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>>>     DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>     			amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>
>>>>>> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
>>>>>> +	*val = amdgpu_smu_debug;
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
>>>>>> +	if (val != 0 && val != 1)
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	amdgpu_smu_debug = val;
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
>>>>>> +			 amdgpu_debugfs_smu_debug_get,
>>>>>> +			 amdgpu_debugfs_smu_debug_set,
>>>>>> +			 "%llu\n");
>>>>>> +
>>>>>>     int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>     {
>>>>>>     	struct dentry *root =
>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct
>>>>>> amdgpu_device
>>>>> *adev)
>>>>>>     		return PTR_ERR(ent);
>>>>>>     	}
>>>>>>
>>>>>> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
>>>>>> +				  &fops_smu_debug);
>>>>>> +	if (IS_ERR(ent)) {
>>>>>> +		DRM_ERROR("unable to create amdgpu_smu_debug debugsfs
>>>>> file\n");
>>>>>> +		return PTR_ERR(ent);
>>>>>> +	}
>>>>>> +
>>>>>> +
>>>>>>     	/* Register debugfs entries for amdgpu_ttm */
>>>>>>     	amdgpu_ttm_debugfs_init(adev);
>>>>>>     	amdgpu_debugfs_pm_init(adev); diff --git
>>>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>> index 048ca1673863..b3969d7933d3 100644
>>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>> @@ -55,6 +55,14 @@
>>>>>>
>>>>>>     #undef __SMU_DUMMY_MAP
>>>>>>     #define __SMU_DUMMY_MAP(type)	#type
>>>>>> +
>>>>>> +/*
>>>>>> + * Used to enable SMU debug option. When enabled, it makes SMU
>>>>>> +errors
>>>>> fatal.
>>>>>> + * This will aid in debugging SMU firmware issues.
>>>>>> + * (0 = disabled (default), 1 = enabled)  */ int
>>>>>> + amdgpu_smu_debug;
>>>>>> +
>>>>>>     static const char * const __smu_message_names[] = {
>>>>>>     	SMU_MESSAGE_TYPES
>>>>>>     };
>>>>>> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
>>>>> smu_context *smu,
>>>>>>     	__smu_cmn_send_msg(smu, msg_index, param);
>>>>>>     	res = 0;
>>>>>>     Out:
>>>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>>>> +		mutex_unlock(&smu->message_lock);
>>>>>> +		BUG();
>>>>>> +	}
>>>>>> +
>>>>>>     	return res;
>>>>>>     }
>>>>>>
>>>>>> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
>>>>> smu_context *smu,
>>>>>>     int smu_cmn_wait_for_response(struct smu_context *smu)
>>>>>>     {
>>>>>>     	u32 reg;
>>>>>> +	int res;
>>>>>>
>>>>>>     	reg = __smu_cmn_poll_stat(smu);
>>>>>> -	return __smu_cmn_reg2errno(smu, reg);
>>>>>> +	res = __smu_cmn_reg2errno(smu, reg);
>>>>>> +
>>>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>>>> +		mutex_unlock(&smu->message_lock);
>>>>>> +		BUG();
>>>>>> +	}
>>>>>> +
>>>>>> +	return res;
>>>>>>     }
>>>>>>
>>>>>>     /**
>>>>>> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
>>>>> smu_context *smu,
>>>>>>     				    uint32_t param,
>>>>>>     				    uint32_t *read_arg)
>>>>>>     {
>>>>>> +	int retry_count = 0;
>>>>>>     	int res, index;
>>>>>>     	u32 reg;
>>>>>>
>>>>>> @@ -349,15 +371,28 @@ int
>smu_cmn_send_smc_msg_with_param(struct
>>>>> smu_context *smu,
>>>>>>     		__smu_cmn_reg_print_error(smu, reg, index, param,
>msg);
>>>>>>     		goto Out;
>>>>>>     	}
>>>>>> +retry:
>>>>>>     	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>>>>>>     	reg = __smu_cmn_poll_stat(smu);
>>>>>>     	res = __smu_cmn_reg2errno(smu, reg);
>>>>>> -	if (res != 0)
>>>>>> +	if (res != 0) {
>>>>>>     		__smu_cmn_reg_print_error(smu, reg, index, param,
>msg);
>>>>>> +		if ((res == -ETIME) && (retry_count++ < 1)) {
>>>>>> +			usleep_range(500, 1000);
>>>>>> +			dev_err(smu->adev->dev,
>>>>>> +				"SMU: resend command: index:%d
>>>>> param:0x%08X message:%s",
>>>>>> +				index, param, smu_get_message_name(smu,
>>>>> msg));
>>>>>> +			goto retry;
>>>>>> +		}
>>>>>> +		goto Out;
>>>>>> +	}
>>>>>
>>>>> Sorry, what I meant is to have an extended wait time in debug mode.
>>>>> Something like below, not a 'full retry' as in sending the message again.
>>>>>
>>>>>
>>>>> +#define MAX_DBG_WAIT_CNT 3
>>>>> +
>>>>>    /**
>>>>>     * __smu_cmn_poll_stat -- poll for a status from the SMU
>>>>>     * smu: a pointer to SMU context @@ -115,17 +117,24 @@ static
>>>>> void smu_cmn_read_arg(struct smu_context *smu,
>>>>>    static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>>>>    {
>>>>>           struct amdgpu_device *adev = smu->adev;
>>>>> -       int timeout = adev->usec_timeout * 20;
>>>>> +       int timeout;
>>>>>           u32 reg;
>>>>> +       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;
>>>>>
>>>>> -       for ( ; timeout > 0; timeout--) {
>>>>> -               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>>> -               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>>> -                       break;
>>>>> +       do {
>>>>> +               timeout = adev->usec_timeout * 20;
>>>>> +               for (; timeout > 0; timeout--) {
>>>>> +                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>>> +                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>>> +                               break;
>>>>>
>>>>> -               udelay(1);
>>>>> -       }
>>>>> +                       udelay(1);
>>>>> +               }
>>>>> +       } while (extended_wait-- >= 0);
>>>>>
>>>>> +       if (extended_wait != MAX_DBG_WAIT_CNT && reg !=
>SMU_RESP_NONE)
>>>>> +               dev_err(adev->dev,
>>>>> +                       "SMU: Unexpected extended wait for
>>>>> + response");
>>>>>           return reg;
>>>>>    }
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>>     	if (read_arg)
>>>>>>     		smu_cmn_read_arg(smu, read_arg);
>>>>>>     Out:
>>>>>>     	mutex_unlock(&smu->message_lock);
>>>>>> +
>>>>>> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
>>>>>> +
>>>>>>     	return res;
>>>>>>     }
>>>>>>
>>>>>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] drm/amdgpu: add SMU debug option support
  2021-12-01  7:57             ` Yu, Lang
@ 2021-12-01  8:18               ` Yu, Lang
  2021-12-01  8:46                 ` Lazar, Lijo
  0 siblings, 1 reply; 12+ messages in thread
From: Yu, Lang @ 2021-12-01  8:18 UTC (permalink / raw)
  To: Yu, Lang, Lazar, Lijo, amd-gfx
  Cc: Deucher, Alexander, Huang, Ray, Koenig, Christian

[AMD Official Use Only]



>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Yu, Lang
>Sent: Wednesday, December 1, 2021 3:58 PM
>To: Lazar, Lijo <Lijo.Lazar@amd.com>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>
>[AMD Official Use Only]
>
>
>
>>-----Original Message-----
>>From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>Sent: Wednesday, December 1, 2021 3:28 PM
>>To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>><Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>
>>
>>
>>On 12/1/2021 12:37 PM, Yu, Lang wrote:
>>> [AMD Official Use Only]
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>> Sent: Wednesday, December 1, 2021 2:56 PM
>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>>
>>>>
>>>>
>>>> On 12/1/2021 11:57 AM, Yu, Lang wrote:
>>>>> [AMD Official Use Only]
>>>>>
>>>>> Hi Lijo,
>>>>>
>>>>> Thanks for your comments.
>>>>>
>>>>>   From my understanding, that just increases the timeout threshold
>>>>> and could hide some potential issues which should be exposed and solved.
>>>>>
>>>>> If current timeout threshold is not enough for some corner cases,
>>>>> (1) Do we consider to increase the threshold to cover these cases?
>>>>> (2) Or do we just expose them and request SMU FW to optimize them?
>>>>>
>>>>> I think it doesn't make much sense to increase the threshold in debug mode.
>>>>> How do you think? Thanks!
>>>>
>>>> In normal cases, 2secs would be more than enough. If we hang
>>>> immediately, then check the FW registers later, the response would
>>>> have come. I thought we just need to note those cases and not to
>>>> fail everytime. Just to mark as a red flag in the log to tell us
>>>> that FW is unexpectedly busy processing something else when the message is
>sent.
>>>>
>>>> There are some issues related to S0ix where we see the FW comes back
>>>> with a response with an increased timeout under certain conditions.
>>>
>>> If these issues still exists, could we just blacklist the tests that
>>> triggered them before solve them? Or we just increase the threshold
>>> to cover
>>all the cases?
>>>
>>
>>Actually, the timeout is message specific - like i2c transfer from
>>EEPROM could take longer time.
>>
>>I am not sure if we should have more than 2s as timeout. Whenever this
>>kind of issue happens, FW team check registers (then it will have a
>>proper value) and say they don't see anything abnormal :) Usually,
>>those are just signs of crack and it eventually breaks.
>>
>>Option is just fail immediately (then again not sure useful it will be
>>if the issue is this sort of thing) or wait to see how far it goes with
>>an added timeout before it fails eventually.
>
>Are smu_cmn_wait_for_response()/smu_cmn_send_msg_without_waiting()
>designed for long timeout cases? Is it fine that we don't fail here in the event of
>timeout?

Or we can add a timeout parameter into smu_cmn_send_smc_msg_with_param() 
to specify the timeout you want for specific message.
I think this may be another story. Thanks!
 
Thanks,
Lang
>
>>
>>Thanks,
>>Lijo
>>
>>> Regards,
>>> Lang
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>
>>>>> Regards,
>>>>> Lang
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>>>> Sent: Wednesday, December 1, 2021 1:44 PM
>>>>>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Yu, Lang <Lang.Yu@amd.com>;
>>>>>> amd- gfx@lists.freedesktop.org
>>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>>
>>>>>> Just realized that the patch I pasted won't work. Outer loop exit
>>>>>> needs to be like this.
>>>>>> 	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>>>>>> 0
>>>>>>
>>>>>> Anyway, that patch is only there to communicate what I really
>>>>>> meant in the earlier comment.
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>> -----Original Message-----
>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>>>> Lazar, Lijo
>>>>>> Sent: Wednesday, December 1, 2021 10:44 AM
>>>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 11/30/2021 10:47 AM, Lang Yu wrote:
>>>>>>> To maintain system error state when SMU errors occurred, which
>>>>>>> will aid in debugging SMU firmware issues, add SMU debug option
>support.
>>>>>>>
>>>>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>>>>> When enabled, it makes SMU errors fatal.
>>>>>>> It is disabled by default.
>>>>>>>
>>>>>>> == Command Guide ==
>>>>>>>
>>>>>>> 1, enable SMU debug option
>>>>>>>
>>>>>>>     # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>>>
>>>>>>> 2, disable SMU debug option
>>>>>>>
>>>>>>>     # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>>>
>>>>>>> v2:
>>>>>>>     - Resend command when timeout.(Lijo)
>>>>>>>     - Use debugfs file instead of module parameter.
>>>>>>>
>>>>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32
>>>> +++++++++++++++++
>>>>>>>     drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39
>>>> +++++++++++++++++++-
>>>>>> -
>>>>>>>     2 files changed, 69 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> index 164d6a9e9fbb..f9412de86599 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> @@ -39,6 +39,8 @@
>>>>>>>
>>>>>>>     #if defined(CONFIG_DEBUG_FS)
>>>>>>>
>>>>>>> +extern int amdgpu_smu_debug;
>>>>>>> +
>>>>>>>     /**
>>>>>>>      * amdgpu_debugfs_process_reg_op - Handle MMIO register
>>reads/writes
>>>>>>>      *
>>>>>>> @@ -1152,6 +1154,8 @@ static ssize_t
>>>>>>> amdgpu_debugfs_gfxoff_read(struct
>>>>>> file *f, char __user *buf,
>>>>>>>     	return result;
>>>>>>>     }
>>>>>>>
>>>>>>> +
>>>>>>> +
>>>>>>>     static const struct file_operations amdgpu_debugfs_regs2_fops = {
>>>>>>>     	.owner = THIS_MODULE,
>>>>>>>     	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6
>>>>>>> +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>>>>     DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>>     			amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>>
>>>>>>> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
>>>>>>> +	*val = amdgpu_smu_debug;
>>>>>>> +	return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
>>>>>>> +	if (val != 0 && val != 1)
>>>>>>> +		return -EINVAL;
>>>>>>> +
>>>>>>> +	amdgpu_smu_debug = val;
>>>>>>> +	return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
>>>>>>> +			 amdgpu_debugfs_smu_debug_get,
>>>>>>> +			 amdgpu_debugfs_smu_debug_set,
>>>>>>> +			 "%llu\n");
>>>>>>> +
>>>>>>>     int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>>     {
>>>>>>>     	struct dentry *root =
>>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>>> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct
>>>>>>> amdgpu_device
>>>>>> *adev)
>>>>>>>     		return PTR_ERR(ent);
>>>>>>>     	}
>>>>>>>
>>>>>>> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root,
>adev,
>>>>>>> +				  &fops_smu_debug);
>>>>>>> +	if (IS_ERR(ent)) {
>>>>>>> +		DRM_ERROR("unable to create amdgpu_smu_debug
>debugsfs
>>>>>> file\n");
>>>>>>> +		return PTR_ERR(ent);
>>>>>>> +	}
>>>>>>> +
>>>>>>> +
>>>>>>>     	/* Register debugfs entries for amdgpu_ttm */
>>>>>>>     	amdgpu_ttm_debugfs_init(adev);
>>>>>>>     	amdgpu_debugfs_pm_init(adev); diff --git
>>>>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>>> index 048ca1673863..b3969d7933d3 100644
>>>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>>> @@ -55,6 +55,14 @@
>>>>>>>
>>>>>>>     #undef __SMU_DUMMY_MAP
>>>>>>>     #define __SMU_DUMMY_MAP(type)	#type
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Used to enable SMU debug option. When enabled, it makes SMU
>>>>>>> +errors
>>>>>> fatal.
>>>>>>> + * This will aid in debugging SMU firmware issues.
>>>>>>> + * (0 = disabled (default), 1 = enabled)  */ int
>>>>>>> + amdgpu_smu_debug;
>>>>>>> +
>>>>>>>     static const char * const __smu_message_names[] = {
>>>>>>>     	SMU_MESSAGE_TYPES
>>>>>>>     };
>>>>>>> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
>>>>>> smu_context *smu,
>>>>>>>     	__smu_cmn_send_msg(smu, msg_index, param);
>>>>>>>     	res = 0;
>>>>>>>     Out:
>>>>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>>>>> +		mutex_unlock(&smu->message_lock);
>>>>>>> +		BUG();
>>>>>>> +	}
>>>>>>> +
>>>>>>>     	return res;
>>>>>>>     }
>>>>>>>
>>>>>>> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
>>>>>> smu_context *smu,
>>>>>>>     int smu_cmn_wait_for_response(struct smu_context *smu)
>>>>>>>     {
>>>>>>>     	u32 reg;
>>>>>>> +	int res;
>>>>>>>
>>>>>>>     	reg = __smu_cmn_poll_stat(smu);
>>>>>>> -	return __smu_cmn_reg2errno(smu, reg);
>>>>>>> +	res = __smu_cmn_reg2errno(smu, reg);
>>>>>>> +
>>>>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>>>>> +		mutex_unlock(&smu->message_lock);
>>>>>>> +		BUG();
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	return res;
>>>>>>>     }
>>>>>>>
>>>>>>>     /**
>>>>>>> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
>>>>>> smu_context *smu,
>>>>>>>     				    uint32_t param,
>>>>>>>     				    uint32_t *read_arg)
>>>>>>>     {
>>>>>>> +	int retry_count = 0;
>>>>>>>     	int res, index;
>>>>>>>     	u32 reg;
>>>>>>>
>>>>>>> @@ -349,15 +371,28 @@ int
>>smu_cmn_send_smc_msg_with_param(struct
>>>>>> smu_context *smu,
>>>>>>>     		__smu_cmn_reg_print_error(smu, reg, index, param,
>>msg);
>>>>>>>     		goto Out;
>>>>>>>     	}
>>>>>>> +retry:
>>>>>>>     	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>>>>>>>     	reg = __smu_cmn_poll_stat(smu);
>>>>>>>     	res = __smu_cmn_reg2errno(smu, reg);
>>>>>>> -	if (res != 0)
>>>>>>> +	if (res != 0) {
>>>>>>>     		__smu_cmn_reg_print_error(smu, reg, index, param,
>>msg);
>>>>>>> +		if ((res == -ETIME) && (retry_count++ < 1)) {
>>>>>>> +			usleep_range(500, 1000);
>>>>>>> +			dev_err(smu->adev->dev,
>>>>>>> +				"SMU: resend command: index:%d
>>>>>> param:0x%08X message:%s",
>>>>>>> +				index, param,
>smu_get_message_name(smu,
>>>>>> msg));
>>>>>>> +			goto retry;
>>>>>>> +		}
>>>>>>> +		goto Out;
>>>>>>> +	}
>>>>>>
>>>>>> Sorry, what I meant is to have an extended wait time in debug mode.
>>>>>> Something like below, not a 'full retry' as in sending the message again.
>>>>>>
>>>>>>
>>>>>> +#define MAX_DBG_WAIT_CNT 3
>>>>>> +
>>>>>>    /**
>>>>>>     * __smu_cmn_poll_stat -- poll for a status from the SMU
>>>>>>     * smu: a pointer to SMU context @@ -115,17 +117,24 @@ static
>>>>>> void smu_cmn_read_arg(struct smu_context *smu,
>>>>>>    static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>>>>>    {
>>>>>>           struct amdgpu_device *adev = smu->adev;
>>>>>> -       int timeout = adev->usec_timeout * 20;
>>>>>> +       int timeout;
>>>>>>           u32 reg;
>>>>>> +       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;
>>>>>>
>>>>>> -       for ( ; timeout > 0; timeout--) {
>>>>>> -               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>>>> -               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>>>> -                       break;
>>>>>> +       do {
>>>>>> +               timeout = adev->usec_timeout * 20;
>>>>>> +               for (; timeout > 0; timeout--) {
>>>>>> +                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>>>> +                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>>>> +                               break;
>>>>>>
>>>>>> -               udelay(1);
>>>>>> -       }
>>>>>> +                       udelay(1);
>>>>>> +               }
>>>>>> +       } while (extended_wait-- >= 0);
>>>>>>
>>>>>> +       if (extended_wait != MAX_DBG_WAIT_CNT && reg !=
>>SMU_RESP_NONE)
>>>>>> +               dev_err(adev->dev,
>>>>>> +                       "SMU: Unexpected extended wait for
>>>>>> + response");
>>>>>>           return reg;
>>>>>>    }
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>>>     	if (read_arg)
>>>>>>>     		smu_cmn_read_arg(smu, read_arg);
>>>>>>>     Out:
>>>>>>>     	mutex_unlock(&smu->message_lock);
>>>>>>> +
>>>>>>> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
>>>>>>> +
>>>>>>>     	return res;
>>>>>>>     }
>>>>>>>
>>>>>>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/amdgpu: add SMU debug option support
  2021-12-01  8:18               ` Yu, Lang
@ 2021-12-01  8:46                 ` Lazar, Lijo
  0 siblings, 0 replies; 12+ messages in thread
From: Lazar, Lijo @ 2021-12-01  8:46 UTC (permalink / raw)
  To: Yu, Lang, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray, Koenig, Christian



On 12/1/2021 1:48 PM, Yu, Lang wrote:
> [AMD Official Use Only]
> 
> 
> 
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Yu, Lang
>> Sent: Wednesday, December 1, 2021 3:58 PM
>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>
>> [AMD Official Use Only]
>>
>>
>>
>>> -----Original Message-----
>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>> Sent: Wednesday, December 1, 2021 3:28 PM
>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>
>>>
>>>
>>> On 12/1/2021 12:37 PM, Yu, Lang wrote:
>>>> [AMD Official Use Only]
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>>> Sent: Wednesday, December 1, 2021 2:56 PM
>>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>
>>>>>
>>>>>
>>>>> On 12/1/2021 11:57 AM, Yu, Lang wrote:
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>> Hi Lijo,
>>>>>>
>>>>>> Thanks for your comments.
>>>>>>
>>>>>>    From my understanding, that just increases the timeout threshold
>>>>>> and could hide some potential issues which should be exposed and solved.
>>>>>>
>>>>>> If current timeout threshold is not enough for some corner cases,
>>>>>> (1) Do we consider to increase the threshold to cover these cases?
>>>>>> (2) Or do we just expose them and request SMU FW to optimize them?
>>>>>>
>>>>>> I think it doesn't make much sense to increase the threshold in debug mode.
>>>>>> How do you think? Thanks!
>>>>>
>>>>> In normal cases, 2secs would be more than enough. If we hang
>>>>> immediately, then check the FW registers later, the response would
>>>>> have come. I thought we just need to note those cases and not to
>>>>> fail everytime. Just to mark as a red flag in the log to tell us
>>>>> that FW is unexpectedly busy processing something else when the message is
>> sent.
>>>>>
>>>>> There are some issues related to S0ix where we see the FW comes back
>>>>> with a response with an increased timeout under certain conditions.
>>>>
>>>> If these issues still exists, could we just blacklist the tests that
>>>> triggered them before solve them? Or we just increase the threshold
>>>> to cover
>>> all the cases?
>>>>
>>>
>>> Actually, the timeout is message specific - like i2c transfer from
>>> EEPROM could take longer time.
>>>
>>> I am not sure if we should have more than 2s as timeout. Whenever this
>>> kind of issue happens, FW team check registers (then it will have a
>>> proper value) and say they don't see anything abnormal :) Usually,
>>> those are just signs of crack and it eventually breaks.
>>>
>>> Option is just fail immediately (then again not sure useful it will be
>>> if the issue is this sort of thing) or wait to see how far it goes with
>>> an added timeout before it fails eventually.
>>
>> Are smu_cmn_wait_for_response()/smu_cmn_send_msg_without_waiting()
>> designed for long timeout cases? Is it fine that we don't fail here in the event of
>> timeout?
> 
> Or we can add a timeout parameter into smu_cmn_send_smc_msg_with_param()
> to specify the timeout you want for specific message.
> I think this may be another story. Thanks!
>

Yes, that will be a different patch. For now, skip the extended timeout. 
Every timeout will trigger a debug alarm and let it be that way for 
debug mode. I think you can skip the retry also (originally I meant this 
by that comment - retry again for response reg check).

Thanks,
Lijo

> Thanks,
> Lang
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>> Regards,
>>>> Lang
>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>> Lang
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>>>>> Sent: Wednesday, December 1, 2021 1:44 PM
>>>>>>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Yu, Lang <Lang.Yu@amd.com>;
>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Subject: RE: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>>>
>>>>>>> Just realized that the patch I pasted won't work. Outer loop exit
>>>>>>> needs to be like this.
>>>>>>> 	(reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >=
>>>>>>> 0
>>>>>>>
>>>>>>> Anyway, that patch is only there to communicate what I really
>>>>>>> meant in the earlier comment.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>>>>> Lazar, Lijo
>>>>>>> Sent: Wednesday, December 1, 2021 10:44 AM
>>>>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>>>> <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 11/30/2021 10:47 AM, Lang Yu wrote:
>>>>>>>> To maintain system error state when SMU errors occurred, which
>>>>>>>> will aid in debugging SMU firmware issues, add SMU debug option
>> support.
>>>>>>>>
>>>>>>>> It can be enabled or disabled via amdgpu_smu_debug debugfs file.
>>>>>>>> When enabled, it makes SMU errors fatal.
>>>>>>>> It is disabled by default.
>>>>>>>>
>>>>>>>> == Command Guide ==
>>>>>>>>
>>>>>>>> 1, enable SMU debug option
>>>>>>>>
>>>>>>>>      # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>>>>
>>>>>>>> 2, disable SMU debug option
>>>>>>>>
>>>>>>>>      # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
>>>>>>>>
>>>>>>>> v2:
>>>>>>>>      - Resend command when timeout.(Lijo)
>>>>>>>>      - Use debugfs file instead of module parameter.
>>>>>>>>
>>>>>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32
>>>>> +++++++++++++++++
>>>>>>>>      drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39
>>>>> +++++++++++++++++++-
>>>>>>> -
>>>>>>>>      2 files changed, 69 insertions(+), 2 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>> index 164d6a9e9fbb..f9412de86599 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>> @@ -39,6 +39,8 @@
>>>>>>>>
>>>>>>>>      #if defined(CONFIG_DEBUG_FS)
>>>>>>>>
>>>>>>>> +extern int amdgpu_smu_debug;
>>>>>>>> +
>>>>>>>>      /**
>>>>>>>>       * amdgpu_debugfs_process_reg_op - Handle MMIO register
>>> reads/writes
>>>>>>>>       *
>>>>>>>> @@ -1152,6 +1154,8 @@ static ssize_t
>>>>>>>> amdgpu_debugfs_gfxoff_read(struct
>>>>>>> file *f, char __user *buf,
>>>>>>>>      	return result;
>>>>>>>>      }
>>>>>>>>
>>>>>>>> +
>>>>>>>> +
>>>>>>>>      static const struct file_operations amdgpu_debugfs_regs2_fops = {
>>>>>>>>      	.owner = THIS_MODULE,
>>>>>>>>      	.unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6
>>>>>>>> +1613,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>>>>>      DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>>>      			amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>>>
>>>>>>>> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
>>>>>>>> +	*val = amdgpu_smu_debug;
>>>>>>>> +	return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
>>>>>>>> +	if (val != 0 && val != 1)
>>>>>>>> +		return -EINVAL;
>>>>>>>> +
>>>>>>>> +	amdgpu_smu_debug = val;
>>>>>>>> +	return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
>>>>>>>> +			 amdgpu_debugfs_smu_debug_get,
>>>>>>>> +			 amdgpu_debugfs_smu_debug_set,
>>>>>>>> +			 "%llu\n");
>>>>>>>> +
>>>>>>>>      int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>>>      {
>>>>>>>>      	struct dentry *root =
>>>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>>>> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct
>>>>>>>> amdgpu_device
>>>>>>> *adev)
>>>>>>>>      		return PTR_ERR(ent);
>>>>>>>>      	}
>>>>>>>>
>>>>>>>> +	ent = debugfs_create_file("amdgpu_smu_debug", 0600, root,
>> adev,
>>>>>>>> +				  &fops_smu_debug);
>>>>>>>> +	if (IS_ERR(ent)) {
>>>>>>>> +		DRM_ERROR("unable to create amdgpu_smu_debug
>> debugsfs
>>>>>>> file\n");
>>>>>>>> +		return PTR_ERR(ent);
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +
>>>>>>>>      	/* Register debugfs entries for amdgpu_ttm */
>>>>>>>>      	amdgpu_ttm_debugfs_init(adev);
>>>>>>>>      	amdgpu_debugfs_pm_init(adev); diff --git
>>>>>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>>>> index 048ca1673863..b3969d7933d3 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>>>>> @@ -55,6 +55,14 @@
>>>>>>>>
>>>>>>>>      #undef __SMU_DUMMY_MAP
>>>>>>>>      #define __SMU_DUMMY_MAP(type)	#type
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * Used to enable SMU debug option. When enabled, it makes SMU
>>>>>>>> +errors
>>>>>>> fatal.
>>>>>>>> + * This will aid in debugging SMU firmware issues.
>>>>>>>> + * (0 = disabled (default), 1 = enabled)  */ int
>>>>>>>> + amdgpu_smu_debug;
>>>>>>>> +
>>>>>>>>      static const char * const __smu_message_names[] = {
>>>>>>>>      	SMU_MESSAGE_TYPES
>>>>>>>>      };
>>>>>>>> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct
>>>>>>> smu_context *smu,
>>>>>>>>      	__smu_cmn_send_msg(smu, msg_index, param);
>>>>>>>>      	res = 0;
>>>>>>>>      Out:
>>>>>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>>>>>> +		mutex_unlock(&smu->message_lock);
>>>>>>>> +		BUG();
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>>      	return res;
>>>>>>>>      }
>>>>>>>>
>>>>>>>> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct
>>>>>>> smu_context *smu,
>>>>>>>>      int smu_cmn_wait_for_response(struct smu_context *smu)
>>>>>>>>      {
>>>>>>>>      	u32 reg;
>>>>>>>> +	int res;
>>>>>>>>
>>>>>>>>      	reg = __smu_cmn_poll_stat(smu);
>>>>>>>> -	return __smu_cmn_reg2errno(smu, reg);
>>>>>>>> +	res = __smu_cmn_reg2errno(smu, reg);
>>>>>>>> +
>>>>>>>> +	if (unlikely(amdgpu_smu_debug == 1) && res) {
>>>>>>>> +		mutex_unlock(&smu->message_lock);
>>>>>>>> +		BUG();
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	return res;
>>>>>>>>      }
>>>>>>>>
>>>>>>>>      /**
>>>>>>>> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct
>>>>>>> smu_context *smu,
>>>>>>>>      				    uint32_t param,
>>>>>>>>      				    uint32_t *read_arg)
>>>>>>>>      {
>>>>>>>> +	int retry_count = 0;
>>>>>>>>      	int res, index;
>>>>>>>>      	u32 reg;
>>>>>>>>
>>>>>>>> @@ -349,15 +371,28 @@ int
>>> smu_cmn_send_smc_msg_with_param(struct
>>>>>>> smu_context *smu,
>>>>>>>>      		__smu_cmn_reg_print_error(smu, reg, index, param,
>>> msg);
>>>>>>>>      		goto Out;
>>>>>>>>      	}
>>>>>>>> +retry:
>>>>>>>>      	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>>>>>>>>      	reg = __smu_cmn_poll_stat(smu);
>>>>>>>>      	res = __smu_cmn_reg2errno(smu, reg);
>>>>>>>> -	if (res != 0)
>>>>>>>> +	if (res != 0) {
>>>>>>>>      		__smu_cmn_reg_print_error(smu, reg, index, param,
>>> msg);
>>>>>>>> +		if ((res == -ETIME) && (retry_count++ < 1)) {
>>>>>>>> +			usleep_range(500, 1000);
>>>>>>>> +			dev_err(smu->adev->dev,
>>>>>>>> +				"SMU: resend command: index:%d
>>>>>>> param:0x%08X message:%s",
>>>>>>>> +				index, param,
>> smu_get_message_name(smu,
>>>>>>> msg));
>>>>>>>> +			goto retry;
>>>>>>>> +		}
>>>>>>>> +		goto Out;
>>>>>>>> +	}
>>>>>>>
>>>>>>> Sorry, what I meant is to have an extended wait time in debug mode.
>>>>>>> Something like below, not a 'full retry' as in sending the message again.
>>>>>>>
>>>>>>>
>>>>>>> +#define MAX_DBG_WAIT_CNT 3
>>>>>>> +
>>>>>>>     /**
>>>>>>>      * __smu_cmn_poll_stat -- poll for a status from the SMU
>>>>>>>      * smu: a pointer to SMU context @@ -115,17 +117,24 @@ static
>>>>>>> void smu_cmn_read_arg(struct smu_context *smu,
>>>>>>>     static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>>>>>>     {
>>>>>>>            struct amdgpu_device *adev = smu->adev;
>>>>>>> -       int timeout = adev->usec_timeout * 20;
>>>>>>> +       int timeout;
>>>>>>>            u32 reg;
>>>>>>> +       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;
>>>>>>>
>>>>>>> -       for ( ; timeout > 0; timeout--) {
>>>>>>> -               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>>>>> -               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>>>>> -                       break;
>>>>>>> +       do {
>>>>>>> +               timeout = adev->usec_timeout * 20;
>>>>>>> +               for (; timeout > 0; timeout--) {
>>>>>>> +                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>>>>> +                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>>>>> +                               break;
>>>>>>>
>>>>>>> -               udelay(1);
>>>>>>> -       }
>>>>>>> +                       udelay(1);
>>>>>>> +               }
>>>>>>> +       } while (extended_wait-- >= 0);
>>>>>>>
>>>>>>> +       if (extended_wait != MAX_DBG_WAIT_CNT && reg !=
>>> SMU_RESP_NONE)
>>>>>>> +               dev_err(adev->dev,
>>>>>>> +                       "SMU: Unexpected extended wait for
>>>>>>> + response");
>>>>>>>            return reg;
>>>>>>>     }
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>>
>>>>>>>>      	if (read_arg)
>>>>>>>>      		smu_cmn_read_arg(smu, read_arg);
>>>>>>>>      Out:
>>>>>>>>      	mutex_unlock(&smu->message_lock);
>>>>>>>> +
>>>>>>>> +	BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
>>>>>>>> +
>>>>>>>>      	return res;
>>>>>>>>      }
>>>>>>>>
>>>>>>>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-12-01  8:46 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-30  5:17 [PATCH] drm/amdgpu: add SMU debug option support Lang Yu
2021-11-30 11:18 ` Christian König
2021-12-01  2:28   ` Lang Yu
2021-12-01  5:14 ` Lazar, Lijo
2021-12-01  5:44   ` Lazar, Lijo
2021-12-01  6:27     ` Yu, Lang
2021-12-01  6:56       ` Lazar, Lijo
2021-12-01  7:07         ` Yu, Lang
2021-12-01  7:28           ` Lazar, Lijo
2021-12-01  7:57             ` Yu, Lang
2021-12-01  8:18               ` Yu, Lang
2021-12-01  8:46                 ` Lazar, Lijo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.