All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list
@ 2022-02-15 10:12 Somalapuram Amaranath
  2022-02-15 10:12 ` [PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset Somalapuram Amaranath
  2022-02-15 10:16 ` [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
  0 siblings, 2 replies; 7+ messages in thread
From: Somalapuram Amaranath @ 2022-02-15 10:12 UTC (permalink / raw)
  To: amd-gfx
  Cc: alexander.deucher, Somalapuram Amaranath, christian.koenig,
	shashank.sharma

List of register populated for dump collection during the GPU reset.

Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 89 +++++++++++++++++++++
 2 files changed, 93 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..2e8c2318276d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,10 @@ struct amdgpu_device {
 
 	struct amdgpu_reset_control     *reset_cntl;
 	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+	/* reset dump register */
+	uint32_t			*reset_dump_reg_list;
+	int                             n_regs;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..edcb032bc1f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,93 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
 DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
 			amdgpu_debugfs_sclk_set, "%llu\n");
 
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+				char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	char reg_offset[11];
+	int i, r, len = 0;
+
+	if (adev->n_regs == 0)
+		return 0;
+
+	for (i = 0; i < adev->n_regs; i++) {
+		memset(reg_offset,  0, 11);
+		sprintf(reg_offset + strlen(reg_offset),
+				"0x%x ", adev->reset_dump_reg_list[i]);
+		r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
+		len += strlen(reg_offset);
+	}
+
+	r = copy_to_user(buf + len, "\n", 1);
+	len++;
+
+	if (*pos >= len)
+		return 0;
+
+	*pos += len - r;
+
+	return len;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
+			const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	char *reg_offset, *reg, reg_temp[11];
+	int ret, i = 0, len = 0;
+
+	reg_offset = reg_temp;
+	memset(reg_offset,  0, 11);
+	ret = copy_from_user(reg_offset, buf, 11);
+
+	if (ret)
+		return -EFAULT;
+
+	if (adev->n_regs > 0) {
+		adev->n_regs = 0;
+		kfree(adev->reset_dump_reg_list);
+		adev->reset_dump_reg_list = NULL;
+	}
+
+	while (((reg = strsep(&reg_offset, " ")) != NULL) && len < size) {
+		adev->reset_dump_reg_list =  krealloc_array(
+						adev->reset_dump_reg_list, 1,
+						sizeof(uint32_t), GFP_KERNEL);
+		ret  = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
+
+		if (ret) {
+			kfree(adev->reset_dump_reg_list);
+			adev->reset_dump_reg_list = NULL;
+			return -EINVAL;
+		}
+
+		len += strlen(reg) + 1;
+		reg_offset = reg_temp;
+		memset(reg_offset,  0, 11);
+		ret = copy_from_user(reg_offset, buf + len, 11);
+
+		if (ret) {
+			kfree(adev->reset_dump_reg_list);
+			adev->reset_dump_reg_list = NULL;
+			return -EFAULT;
+		}
+
+		i++;
+	}
+
+	adev->n_regs = i;
+
+	return size;
+}
+
+static const struct file_operations amdgpu_reset_dump_register_list = {
+	.owner = THIS_MODULE,
+	.read = amdgpu_reset_dump_register_list_read,
+	.write = amdgpu_reset_dump_register_list_write,
+	.llseek = default_llseek
+};
+
 int amdgpu_debugfs_init(struct amdgpu_device *adev)
 {
 	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1672,6 +1759,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 			    &amdgpu_debugfs_test_ib_fops);
 	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
 			    &amdgpu_debugfs_vm_info_fops);
+	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
+			    &amdgpu_reset_dump_register_list);
 
 	adev->debugfs_vbios_blob.data = adev->bios;
 	adev->debugfs_vbios_blob.size = adev->bios_size;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset
  2022-02-15 10:12 [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
@ 2022-02-15 10:12 ` Somalapuram Amaranath
  2022-02-15 16:39   ` Andrey Grodzovsky
  2022-02-15 10:16 ` [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
  1 sibling, 1 reply; 7+ messages in thread
From: Somalapuram Amaranath @ 2022-02-15 10:12 UTC (permalink / raw)
  To: amd-gfx
  Cc: alexander.deucher, Somalapuram Amaranath, christian.koenig,
	shashank.sharma

Dump the list of register values to trace event on GPU reset.

Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 16 ++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e651b959141..ff21262c6fea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4534,6 +4534,19 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
+static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
+{
+	uint32_t reg_value;
+	int i;
+
+	for (i = 0; i < adev->n_regs; i++) {
+		reg_value = RREG32(adev->reset_dump_reg_list[i]);
+		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
+	}
+
+	return 0;
+}
+
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 			 struct amdgpu_reset_context *reset_context)
 {
@@ -4567,8 +4580,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
-			} else
+			} else {
+				amdgpu_reset_reg_dumps(tmp_adev);
 				r = amdgpu_asic_reset(tmp_adev);
+			}
 
 			if (r) {
 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index d855cb53c7e0..b9637925e85c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -537,6 +537,22 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
 		      __entry->seqno)
 );
 
+TRACE_EVENT(amdgpu_reset_reg_dumps,
+	    TP_PROTO(uint32_t address, uint32_t value),
+	    TP_ARGS(address, value),
+	    TP_STRUCT__entry(
+			     __field(uint32_t, address)
+			     __field(uint32_t, value)
+			     ),
+	    TP_fast_assign(
+			   __entry->address = address;
+			   __entry->value = value;
+			   ),
+	    TP_printk("amdgpu register dump 0x%x: 0x%x",
+		      __entry->address,
+		      __entry->value)
+);
+
 #undef AMDGPU_JOB_GET_TIMELINE_NAME
 #endif
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-15 10:12 [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
  2022-02-15 10:12 ` [PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset Somalapuram Amaranath
@ 2022-02-15 10:16 ` Christian König
  2022-02-15 11:15   ` Somalapuram, Amaranath
  1 sibling, 1 reply; 7+ messages in thread
From: Christian König @ 2022-02-15 10:16 UTC (permalink / raw)
  To: Somalapuram Amaranath, amd-gfx; +Cc: alexander.deucher, shashank.sharma



Am 15.02.22 um 11:12 schrieb Somalapuram Amaranath:
> List of register populated for dump collection during the GPU reset.
>
> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 89 +++++++++++++++++++++
>   2 files changed, 93 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b85b67a88a3d..2e8c2318276d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1097,6 +1097,10 @@ struct amdgpu_device {
>   
>   	struct amdgpu_reset_control     *reset_cntl;
>   	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
> +
> +	/* reset dump register */
> +	uint32_t			*reset_dump_reg_list;
> +	int                             n_regs;
>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..edcb032bc1f3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1609,6 +1609,93 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
> +				char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char reg_offset[11];
> +	int i, r, len = 0;
> +
> +	if (adev->n_regs == 0)
> +		return 0;
> +
> +	for (i = 0; i < adev->n_regs; i++) {
> +		memset(reg_offset,  0, 11);
> +		sprintf(reg_offset + strlen(reg_offset),

That here looks odd, probably just a leftover from the older code.

Without it you can also drop the memset().

> +				"0x%x ", adev->reset_dump_reg_list[i]);
> +		r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));

Whenever copy_to_user() returns a nonzero value the best practice is to 
return -EFAULT.

> +		len += strlen(reg_offset);
> +	}
> +
> +	r = copy_to_user(buf + len, "\n", 1);

Same here.

> +	len++;
> +
> +	if (*pos >= len)
> +		return 0;

What is that good for?

Regards,
Christian.

> +
> +	*pos += len - r;
> +
> +	return len;
> +}
> +
> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
> +			const char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char *reg_offset, *reg, reg_temp[11];
> +	int ret, i = 0, len = 0;
> +
> +	reg_offset = reg_temp;
> +	memset(reg_offset,  0, 11);
> +	ret = copy_from_user(reg_offset, buf, 11);
> +
> +	if (ret)
> +		return -EFAULT;
> +
> +	if (adev->n_regs > 0) {
> +		adev->n_regs = 0;
> +		kfree(adev->reset_dump_reg_list);
> +		adev->reset_dump_reg_list = NULL;
> +	}
> +
> +	while (((reg = strsep(&reg_offset, " ")) != NULL) && len < size) {
> +		adev->reset_dump_reg_list =  krealloc_array(
> +						adev->reset_dump_reg_list, 1,
> +						sizeof(uint32_t), GFP_KERNEL);
> +		ret  = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
> +
> +		if (ret) {
> +			kfree(adev->reset_dump_reg_list);
> +			adev->reset_dump_reg_list = NULL;
> +			return -EINVAL;
> +		}
> +
> +		len += strlen(reg) + 1;
> +		reg_offset = reg_temp;
> +		memset(reg_offset,  0, 11);
> +		ret = copy_from_user(reg_offset, buf + len, 11);
> +
> +		if (ret) {
> +			kfree(adev->reset_dump_reg_list);
> +			adev->reset_dump_reg_list = NULL;
> +			return -EFAULT;
> +		}
> +
> +		i++;
> +	}
> +
> +	adev->n_regs = i;
> +
> +	return size;
> +}
> +
> +static const struct file_operations amdgpu_reset_dump_register_list = {
> +	.owner = THIS_MODULE,
> +	.read = amdgpu_reset_dump_register_list_read,
> +	.write = amdgpu_reset_dump_register_list_write,
> +	.llseek = default_llseek
> +};
> +
>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1672,6 +1759,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   			    &amdgpu_debugfs_test_ib_fops);
>   	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>   			    &amdgpu_debugfs_vm_info_fops);
> +	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
> +			    &amdgpu_reset_dump_register_list);
>   
>   	adev->debugfs_vbios_blob.data = adev->bios;
>   	adev->debugfs_vbios_blob.size = adev->bios_size;


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-15 10:16 ` [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
@ 2022-02-15 11:15   ` Somalapuram, Amaranath
  0 siblings, 0 replies; 7+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-15 11:15 UTC (permalink / raw)
  To: Christian König, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma


On 2/15/2022 3:46 PM, Christian König wrote:
>
>
> Am 15.02.22 um 11:12 schrieb Somalapuram Amaranath:
>> List of register populated for dump collection during the GPU reset.
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 89 +++++++++++++++++++++
>>   2 files changed, 93 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b85b67a88a3d..2e8c2318276d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1097,6 +1097,10 @@ struct amdgpu_device {
>>         struct amdgpu_reset_control     *reset_cntl;
>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>> +
>> +    /* reset dump register */
>> +    uint32_t            *reset_dump_reg_list;
>> +    int                             n_regs;
>>   };
>>     static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>> *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 164d6a9e9fbb..edcb032bc1f3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1609,6 +1609,93 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>> +                char __user *buf, size_t size, loff_t *pos)
>> +{
>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>> *)file_inode(f)->i_private;
>> +    char reg_offset[11];
>> +    int i, r, len = 0;
>> +
>> +    if (adev->n_regs == 0)
>> +        return 0;
>> +
>> +    for (i = 0; i < adev->n_regs; i++) {
>> +        memset(reg_offset,  0, 11);
>> +        sprintf(reg_offset + strlen(reg_offset),
>
> That here looks odd, probably just a leftover from the older code.
>
My mistake leftover code.

let me try to remove memset() also.

> Without it you can also drop the memset().
>
>> +                "0x%x ", adev->reset_dump_reg_list[i]);
>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>
> Whenever copy_to_user() returns a nonzero value the best practice is 
> to return -EFAULT.
>
agreed.
>> +        len += strlen(reg_offset);
>> +    }
>> +
>> +    r = copy_to_user(buf + len, "\n", 1);
>
> Same here.
>
agreed.
>> +    len++;
>> +
>> +    if (*pos >= len)
>> +        return 0;
>
> What is that good for?
>
If i don't have this condition read operation will be infinite loop and 
data keeps repeating.

I am not sure why !. maybe i need to set something while write operation.

> Regards,
> Christian.
>
>> +
>> +    *pos += len - r;
>> +
>> +    return len;
>> +}
>> +
>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>> +            const char __user *buf, size_t size, loff_t *pos)
>> +{
>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>> *)file_inode(f)->i_private;
>> +    char *reg_offset, *reg, reg_temp[11];
>> +    int ret, i = 0, len = 0;
>> +
>> +    reg_offset = reg_temp;
>> +    memset(reg_offset,  0, 11);
>> +    ret = copy_from_user(reg_offset, buf, 11);
>> +
>> +    if (ret)
>> +        return -EFAULT;
>> +
>> +    if (adev->n_regs > 0) {
>> +        adev->n_regs = 0;
>> +        kfree(adev->reset_dump_reg_list);
>> +        adev->reset_dump_reg_list = NULL;
>> +    }
>> +
>> +    while (((reg = strsep(&reg_offset, " ")) != NULL) && len < size) {
>> +        adev->reset_dump_reg_list =  krealloc_array(
>> +                        adev->reset_dump_reg_list, 1,
>> +                        sizeof(uint32_t), GFP_KERNEL);
>> +        ret  = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>> +
>> +        if (ret) {
>> +            kfree(adev->reset_dump_reg_list);
>> +            adev->reset_dump_reg_list = NULL;
>> +            return -EINVAL;
>> +        }
>> +
>> +        len += strlen(reg) + 1;
>> +        reg_offset = reg_temp;
>> +        memset(reg_offset,  0, 11);
>> +        ret = copy_from_user(reg_offset, buf + len, 11);
>> +
>> +        if (ret) {
>> +            kfree(adev->reset_dump_reg_list);
>> +            adev->reset_dump_reg_list = NULL;
>> +            return -EFAULT;
>> +        }
>> +
>> +        i++;
>> +    }
>> +
>> +    adev->n_regs = i;
>> +
>> +    return size;
>> +}
>> +
>> +static const struct file_operations amdgpu_reset_dump_register_list = {
>> +    .owner = THIS_MODULE,
>> +    .read = amdgpu_reset_dump_register_list_read,
>> +    .write = amdgpu_reset_dump_register_list_write,
>> +    .llseek = default_llseek
>> +};
>> +
>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>   {
>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>> @@ -1672,6 +1759,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>> *adev)
>>                   &amdgpu_debugfs_test_ib_fops);
>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>                   &amdgpu_debugfs_vm_info_fops);
>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>> root, adev,
>> +                &amdgpu_reset_dump_register_list);
>>         adev->debugfs_vbios_blob.data = adev->bios;
>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset
  2022-02-15 10:12 ` [PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset Somalapuram Amaranath
@ 2022-02-15 16:39   ` Andrey Grodzovsky
  2022-02-16 10:46     ` Somalapuram, Amaranath
  0 siblings, 1 reply; 7+ messages in thread
From: Andrey Grodzovsky @ 2022-02-15 16:39 UTC (permalink / raw)
  To: Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma


On 2022-02-15 05:12, Somalapuram Amaranath wrote:
> Dump the list of register values to trace event on GPU reset.
>
> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 16 ++++++++++++++++
>   2 files changed, 32 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 1e651b959141..ff21262c6fea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4534,6 +4534,19 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> +{
> +	uint32_t reg_value;
> +	int i;
> +
> +	for (i = 0; i < adev->n_regs; i++) {
> +		reg_value = RREG32(adev->reset_dump_reg_list[i]);
> +		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
> +	}
> +
> +	return 0;
> +}
> +
>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>   			 struct amdgpu_reset_context *reset_context)
>   {
> @@ -4567,8 +4580,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>   				tmp_adev->gmc.xgmi.pending_reset = false;
>   				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>   					r = -EALREADY;
> -			} else
> +			} else {
> +				amdgpu_reset_reg_dumps(tmp_adev);
>   				r = amdgpu_asic_reset(tmp_adev);
> +			}


Is there any particular reason you only dump registers in single ASIC 
case and not for XGMI ?

Andrey


>   
>   			if (r) {
>   				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> index d855cb53c7e0..b9637925e85c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> @@ -537,6 +537,22 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>   		      __entry->seqno)
>   );
>   
> +TRACE_EVENT(amdgpu_reset_reg_dumps,
> +	    TP_PROTO(uint32_t address, uint32_t value),
> +	    TP_ARGS(address, value),
> +	    TP_STRUCT__entry(
> +			     __field(uint32_t, address)
> +			     __field(uint32_t, value)
> +			     ),
> +	    TP_fast_assign(
> +			   __entry->address = address;
> +			   __entry->value = value;
> +			   ),
> +	    TP_printk("amdgpu register dump 0x%x: 0x%x",
> +		      __entry->address,
> +		      __entry->value)
> +);
> +
>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>   #endif
>   

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset
  2022-02-15 16:39   ` Andrey Grodzovsky
@ 2022-02-16 10:46     ` Somalapuram, Amaranath
  2022-02-16 15:40       ` Andrey Grodzovsky
  0 siblings, 1 reply; 7+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-16 10:46 UTC (permalink / raw)
  To: Andrey Grodzovsky, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma


On 2/15/2022 10:09 PM, Andrey Grodzovsky wrote:
>
> On 2022-02-15 05:12, Somalapuram Amaranath wrote:
>> Dump the list of register values to trace event on GPU reset.
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 16 ++++++++++++++++
>>   2 files changed, 32 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 1e651b959141..ff21262c6fea 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4534,6 +4534,19 @@ int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>       return r;
>>   }
>>   +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>> +{
>> +    uint32_t reg_value;
>> +    int i;
>> +
>> +    for (i = 0; i < adev->n_regs; i++) {
>> +        reg_value = RREG32(adev->reset_dump_reg_list[i]);
>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>                struct amdgpu_reset_context *reset_context)
>>   {
>> @@ -4567,8 +4580,10 @@ int amdgpu_do_asic_reset(struct list_head 
>> *device_list_handle,
>>                   tmp_adev->gmc.xgmi.pending_reset = false;
>>                   if (!queue_work(system_unbound_wq, 
>> &tmp_adev->xgmi_reset_work))
>>                       r = -EALREADY;
>> -            } else
>> +            } else {
>> +                amdgpu_reset_reg_dumps(tmp_adev);
>>                   r = amdgpu_asic_reset(tmp_adev);
>> +            }
>
>
> Is there any particular reason you only dump registers in single ASIC 
> case and not for XGMI ?
>
> Andrey
>
Not really, should I move it to the top of function?

Regards,

S.Amarnath

>
>>                 if (r) {
>>                   dev_err(tmp_adev->dev, "ASIC reset failed with 
>> error, %d for drm dev, %s",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> index d855cb53c7e0..b9637925e85c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> @@ -537,6 +537,22 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>                 __entry->seqno)
>>   );
>>   +TRACE_EVENT(amdgpu_reset_reg_dumps,
>> +        TP_PROTO(uint32_t address, uint32_t value),
>> +        TP_ARGS(address, value),
>> +        TP_STRUCT__entry(
>> +                 __field(uint32_t, address)
>> +                 __field(uint32_t, value)
>> +                 ),
>> +        TP_fast_assign(
>> +               __entry->address = address;
>> +               __entry->value = value;
>> +               ),
>> +        TP_printk("amdgpu register dump 0x%x: 0x%x",
>> +              __entry->address,
>> +              __entry->value)
>> +);
>> +
>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>   #endif

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset
  2022-02-16 10:46     ` Somalapuram, Amaranath
@ 2022-02-16 15:40       ` Andrey Grodzovsky
  0 siblings, 0 replies; 7+ messages in thread
From: Andrey Grodzovsky @ 2022-02-16 15:40 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma


On 2022-02-16 05:46, Somalapuram, Amaranath wrote:
>
> On 2/15/2022 10:09 PM, Andrey Grodzovsky wrote:
>>
>> On 2022-02-15 05:12, Somalapuram Amaranath wrote:
>>> Dump the list of register values to trace event on GPU reset.
>>>
>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 16 ++++++++++++++++
>>>   2 files changed, 32 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 1e651b959141..ff21262c6fea 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -4534,6 +4534,19 @@ int amdgpu_device_pre_asic_reset(struct 
>>> amdgpu_device *adev,
>>>       return r;
>>>   }
>>>   +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>> +{
>>> +    uint32_t reg_value;
>>> +    int i;
>>> +
>>> +    for (i = 0; i < adev->n_regs; i++) {
>>> +        reg_value = RREG32(adev->reset_dump_reg_list[i]);
>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 
>>> reg_value);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>                struct amdgpu_reset_context *reset_context)
>>>   {
>>> @@ -4567,8 +4580,10 @@ int amdgpu_do_asic_reset(struct list_head 
>>> *device_list_handle,
>>>                   tmp_adev->gmc.xgmi.pending_reset = false;
>>>                   if (!queue_work(system_unbound_wq, 
>>> &tmp_adev->xgmi_reset_work))
>>>                       r = -EALREADY;
>>> -            } else
>>> +            } else {
>>> +                amdgpu_reset_reg_dumps(tmp_adev);
>>>                   r = amdgpu_asic_reset(tmp_adev);
>>> +            }
>>
>>
>> Is there any particular reason you only dump registers in single ASIC 
>> case and not for XGMI ?
>>
>> Andrey
>>
> Not really, should I move it to the top of function?
>
> Regards,
>
> S.Amarnath


Yes, no reason to avoid dumping this info for XGMI case.

Consider also that maybe you don't want to print this when
the reset is not due to any error state - like manual GPU reset
from debugfs - but to know this you will have to wire a flag from
high above in the call stack to here and so probably this not worth
all this changes.

Andrey


>
>>
>>>                 if (r) {
>>>                   dev_err(tmp_adev->dev, "ASIC reset failed with 
>>> error, %d for drm dev, %s",
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> index d855cb53c7e0..b9637925e85c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> @@ -537,6 +537,22 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>                 __entry->seqno)
>>>   );
>>>   +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>> +        TP_PROTO(uint32_t address, uint32_t value),
>>> +        TP_ARGS(address, value),
>>> +        TP_STRUCT__entry(
>>> +                 __field(uint32_t, address)
>>> +                 __field(uint32_t, value)
>>> +                 ),
>>> +        TP_fast_assign(
>>> +               __entry->address = address;
>>> +               __entry->value = value;
>>> +               ),
>>> +        TP_printk("amdgpu register dump 0x%x: 0x%x",
>>> +              __entry->address,
>>> +              __entry->value)
>>> +);
>>> +
>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>   #endif

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-02-16 15:40 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-15 10:12 [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
2022-02-15 10:12 ` [PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset Somalapuram Amaranath
2022-02-15 16:39   ` Andrey Grodzovsky
2022-02-16 10:46     ` Somalapuram, Amaranath
2022-02-16 15:40       ` Andrey Grodzovsky
2022-02-15 10:16 ` [PATCH v4 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
2022-02-15 11:15   ` Somalapuram, Amaranath

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.