[PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list
@ 2022-02-08  8:16 Somalapuram Amaranath
  2022-02-08  8:16 ` [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset Somalapuram Amaranath
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Somalapuram Amaranath @ 2022-02-08  8:16 UTC (permalink / raw)
  To: amd-gfx
  Cc: alexander.deucher, Somalapuram Amaranath, christian.koenig,
	shashank.sharma

List of register to be populated for dump collection during the GPU reset.

Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 +++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
 
 	struct amdgpu_reset_control     *reset_cntl;
 	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+	/* reset dump register */
+	long			reset_dump_reg_list[128];
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
 DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
 			amdgpu_debugfs_sclk_set, "%llu\n");
 
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+				char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	char *reg_offset;
+	int i, r, len;
+
+	reg_offset = kmalloc(2048, GFP_KERNEL);
+	memset(reg_offset,  0, 2048);
+	for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
+		sprintf(reg_offset + strlen(reg_offset), "0x%lx ", adev->reset_dump_reg_list[i]);
+
+	sprintf(reg_offset + strlen(reg_offset), "\n");
+	len = strlen(reg_offset);
+
+	if (*pos >=  len)
+		return 0;
+
+	r = copy_to_user(buf, reg_offset, len);
+	*pos += len - r;
+	kfree(reg_offset);
+
+	return len - r;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, const char __user *buf,
+		size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	char *reg_offset, *reg;
+	int ret, i = 0;
+
+	reg_offset = kmalloc(size, GFP_KERNEL);
+	memset(reg_offset,  0, size);
+	ret = copy_from_user(reg_offset, buf, size);
+
+	if (ret)
+		return -EFAULT;
+
+	while ((reg = strsep(&reg_offset, " ")) != NULL) {
+		ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
+		if (ret)
+			return -EINVAL;
+		i++;
+	}
+
+	kfree(reg_offset);
+
+	return size;
+}
+
+static const struct file_operations amdgpu_reset_dump_register_list = {
+	.owner = THIS_MODULE,
+	.read = amdgpu_reset_dump_register_list_read,
+	.write = amdgpu_reset_dump_register_list_write,
+	.llseek = default_llseek
+};
+
 int amdgpu_debugfs_init(struct amdgpu_device *adev)
 {
 	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 			    &amdgpu_debugfs_test_ib_fops);
 	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
 			    &amdgpu_debugfs_vm_info_fops);
+	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
+			    &amdgpu_reset_dump_register_list);
 
 	adev->debugfs_vbios_blob.data = adev->bios;
 	adev->debugfs_vbios_blob.size = adev->bios_size;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-08  8:16 [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
@ 2022-02-08  8:16 ` Somalapuram Amaranath
  2022-02-08 15:28   ` Alex Deucher
  2022-02-08  8:18 ` [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
  2022-02-08 10:54 ` Sharma, Shashank
  2 siblings, 1 reply; 21+ messages in thread
From: Somalapuram Amaranath @ 2022-02-08  8:16 UTC (permalink / raw)
  To: amd-gfx
  Cc: alexander.deucher, Somalapuram Amaranath, christian.koenig,
	shashank.sharma

Dump the list of register values to trace event on GPU reset.

Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e651b959141..057922fb7e37 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
+static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
+{
+	int i;
+	uint32_t reg_value[128];
+
+	for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
+		if (adev->asic_type >= CHIP_NAVI10)
+			reg_value[i] = RREG32_SOC15_IP(GC, adev->reset_dump_reg_list[i]);
+		else
+			reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
+	}
+
+	trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, i);
+
+	return 0;
+}
+
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 			 struct amdgpu_reset_context *reset_context)
 {
@@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
-			} else
+			} else {
+				amdgpu_reset_reg_dumps(tmp_adev);
 				r = amdgpu_asic_reset(tmp_adev);
+			}
 
 			if (r) {
 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index d855cb53c7e0..3fe33de3564a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
 		      __entry->seqno)
 );
 
+TRACE_EVENT(amdgpu_reset_reg_dumps,
+	    TP_PROTO(long *address, uint32_t *value, int length),
+	    TP_ARGS(address, value, length),
+	    TP_STRUCT__entry(
+			     __array(long, address, 128)
+			     __array(uint32_t, value, 128)
+			     __field(int, len)
+			     ),
+	    TP_fast_assign(
+			   memcpy(__entry->address, address, 128);
+			   memcpy(__entry->value,  value, 128);
+			   __entry->len = length;
+			   ),
+	    TP_printk("amdgpu register dump offset: %s value: %s ",
+		      __print_array(__entry->address, __entry->len, 8),
+		      __print_array(__entry->value, __entry->len, 8)
+		     )
+);
+
 #undef AMDGPU_JOB_GET_TIMELINE_NAME
 #endif
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-08  8:16 ` [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset Somalapuram Amaranath
@ 2022-02-08 15:28   ` Alex Deucher
  2022-02-09  7:47     ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Alex Deucher @ 2022-02-08 15:28 UTC (permalink / raw)
  To: Somalapuram Amaranath
  Cc: Deucher, Alexander, Christian Koenig, amd-gfx list, Shashank Sharma

On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
<Amaranath.Somalapuram@amd.com> wrote:
>
> Dump the list of register values to trace event on GPU reset.
>
> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
>  2 files changed, 39 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 1e651b959141..057922fb7e37 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>         return r;
>  }
>
> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> +{
> +       int i;
> +       uint32_t reg_value[128];
> +
> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
> +               if (adev->asic_type >= CHIP_NAVI10)

This check should be against CHIP_VEGA10.  Also, this only allows for
GC registers.  If we wanted to dump other registers, we'd need a
different macro.  Might be better to just use RREG32 here for
everything and then encode the full offset using
SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
about how to handle gfxoff in this case.  gfxoff needs to be disabled
or we'll hang the chip if we try and read GC or SDMA registers via
MMIO which will adversely affect the hang signature.

Alex

> +                       reg_value[i] = RREG32_SOC15_IP(GC, adev->reset_dump_reg_list[i]);
> +               else
> +                       reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
> +       }
> +
> +       trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, i);
> +
> +       return 0;
> +}
> +
>  int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>                          struct amdgpu_reset_context *reset_context)
>  {
> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>                                 tmp_adev->gmc.xgmi.pending_reset = false;
>                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>                                         r = -EALREADY;
> -                       } else
> +                       } else {
> +                               amdgpu_reset_reg_dumps(tmp_adev);
>                                 r = amdgpu_asic_reset(tmp_adev);
> +                       }
>
>                         if (r) {
>                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> index d855cb53c7e0..3fe33de3564a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>                       __entry->seqno)
>  );
>
> +TRACE_EVENT(amdgpu_reset_reg_dumps,
> +           TP_PROTO(long *address, uint32_t *value, int length),
> +           TP_ARGS(address, value, length),
> +           TP_STRUCT__entry(
> +                            __array(long, address, 128)
> +                            __array(uint32_t, value, 128)
> +                            __field(int, len)
> +                            ),
> +           TP_fast_assign(
> +                          memcpy(__entry->address, address, 128);
> +                          memcpy(__entry->value,  value, 128);
> +                          __entry->len = length;
> +                          ),
> +           TP_printk("amdgpu register dump offset: %s value: %s ",
> +                     __print_array(__entry->address, __entry->len, 8),
> +                     __print_array(__entry->value, __entry->len, 8)
> +                    )
> +);
> +
>  #undef AMDGPU_JOB_GET_TIMELINE_NAME
>  #endif
>
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-08 15:28   ` Alex Deucher
@ 2022-02-09  7:47     ` Christian König
  2022-02-10  5:29       ` Somalapuram, Amaranath
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-09  7:47 UTC (permalink / raw)
  To: Alex Deucher, Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list, Shashank Sharma

Am 08.02.22 um 16:28 schrieb Alex Deucher:
> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
> <Amaranath.Somalapuram@amd.com> wrote:
>> Dump the list of register values to trace event on GPU reset.
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 1e651b959141..057922fb7e37 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>          return r;
>>   }
>>
>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>> +{
>> +       int i;
>> +       uint32_t reg_value[128];
>> +
>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>> +               if (adev->asic_type >= CHIP_NAVI10)
> This check should be against CHIP_VEGA10.  Also, this only allows for
> GC registers.  If we wanted to dump other registers, we'd need a
> different macro.  Might be better to just use RREG32 here for
> everything and then encode the full offset using
> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
> about how to handle gfxoff in this case.  gfxoff needs to be disabled
> or we'll hang the chip if we try and read GC or SDMA registers via
> MMIO which will adversely affect the hang signature.

Well this should execute right before a GPU reset, so I think it 
shouldn't matter if we hang the chip or not as long as the read comes 
back correctly (I remember a very long UVD debug session because of this).

But in general I agree, we should just use RREG32() here and always 
encode the full register offset.

Regards,
Christian.


>
> Alex
>
>> +                       reg_value[i] = RREG32_SOC15_IP(GC, adev->reset_dump_reg_list[i]);
>> +               else
>> +                       reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
>> +       }
>> +
>> +       trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, i);
>> +
>> +       return 0;
>> +}
>> +
>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>                           struct amdgpu_reset_context *reset_context)
>>   {
>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>                                  tmp_adev->gmc.xgmi.pending_reset = false;
>>                                  if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>                                          r = -EALREADY;
>> -                       } else
>> +                       } else {
>> +                               amdgpu_reset_reg_dumps(tmp_adev);
>>                                  r = amdgpu_asic_reset(tmp_adev);
>> +                       }
>>
>>                          if (r) {
>>                                  dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> index d855cb53c7e0..3fe33de3564a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>                        __entry->seqno)
>>   );
>>
>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>> +           TP_PROTO(long *address, uint32_t *value, int length),
>> +           TP_ARGS(address, value, length),
>> +           TP_STRUCT__entry(
>> +                            __array(long, address, 128)
>> +                            __array(uint32_t, value, 128)
>> +                            __field(int, len)
>> +                            ),
>> +           TP_fast_assign(
>> +                          memcpy(__entry->address, address, 128);
>> +                          memcpy(__entry->value,  value, 128);
>> +                          __entry->len = length;
>> +                          ),
>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>> +                     __print_array(__entry->address, __entry->len, 8),
>> +                     __print_array(__entry->value, __entry->len, 8)
>> +                    )
>> +);
>> +
>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>   #endif
>>
>> --
>> 2.25.1
>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-09  7:47     ` Christian König
@ 2022-02-10  5:29       ` Somalapuram, Amaranath
  2022-02-10  7:09         ` Christian König
  2022-02-10 11:59         ` Sharma, Shashank
  0 siblings, 2 replies; 21+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-10  5:29 UTC (permalink / raw)
  To: Christian König, Alex Deucher, Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list, Shashank Sharma


On 2/9/2022 1:17 PM, Christian König wrote:
> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>> <Amaranath.Somalapuram@amd.com> wrote:
>>> Dump the list of register values to trace event on GPU reset.
>>>
>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 1e651b959141..057922fb7e37 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>> amdgpu_device *adev,
>>>          return r;
>>>   }
>>>
>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>> +{
>>> +       int i;
>>> +       uint32_t reg_value[128];
>>> +
>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>> +               if (adev->asic_type >= CHIP_NAVI10)
>> This check should be against CHIP_VEGA10.  Also, this only allows for
>> GC registers.  If we wanted to dump other registers, we'd need a
>> different macro.  Might be better to just use RREG32 here for
>> everything and then encode the full offset using
>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
>> about how to handle gfxoff in this case.  gfxoff needs to be disabled
>> or we'll hang the chip if we try and read GC or SDMA registers via
>> MMIO which will adversely affect the hang signature.
>
> Well this should execute right before a GPU reset, so I think it 
> shouldn't matter if we hang the chip or not as long as the read comes 
> back correctly (I remember a very long UVD debug session because of 
> this).
>
> But in general I agree, we should just use RREG32() here and always 
> encode the full register offset.
>
> Regards,
> Christian.
>
Can I use something like this:

+                       reg_value[i] = 
RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
+ [adev->reset_dump_reg_list[i][1]]
+ [adev->reset_dump_reg_list[i][2]])
+                                 + adev->reset_dump_reg_list[i][3]);

ip --> adev->reset_dump_reg_list[i][0]

inst --> adev->reset_dump_reg_list[i][1]

BASE_IDX--> adev->reset_dump_reg_list[i][2]

reg --> adev->reset_dump_reg_list[i][3]

which requires 4 values in user space for each register.

using any existing macro like RREG32_SOC15** will not be able to pass 
proper argument from user space (like ip##_HWIP or reg##_BASE_IDX)

>
>>
>> Alex
>>
>>> +                       reg_value[i] = RREG32_SOC15_IP(GC, 
>>> adev->reset_dump_reg_list[i]);
>>> +               else
>>> +                       reg_value[i] = 
>>> RREG32(adev->reset_dump_reg_list[i]);
>>> +       }
>>> +
>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, 
>>> i);
>>> +
>>> +       return 0;
>>> +}
>>> +
>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>                           struct amdgpu_reset_context *reset_context)
>>>   {
>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
>>> *device_list_handle,
>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>                                  if (!queue_work(system_unbound_wq, 
>>> &tmp_adev->xgmi_reset_work))
>>>                                          r = -EALREADY;
>>> -                       } else
>>> +                       } else {
>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>> +                       }
>>>
>>>                          if (r) {
>>>                                  dev_err(tmp_adev->dev, "ASIC reset 
>>> failed with error, %d for drm dev, %s",
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> index d855cb53c7e0..3fe33de3564a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>                        __entry->seqno)
>>>   );
>>>
>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>> +           TP_ARGS(address, value, length),
>>> +           TP_STRUCT__entry(
>>> +                            __array(long, address, 128)
>>> +                            __array(uint32_t, value, 128)
>>> +                            __field(int, len)
>>> +                            ),
>>> +           TP_fast_assign(
>>> +                          memcpy(__entry->address, address, 128);
>>> +                          memcpy(__entry->value,  value, 128);
>>> +                          __entry->len = length;
>>> +                          ),
>>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>>> +                     __print_array(__entry->address, __entry->len, 8),
>>> +                     __print_array(__entry->value, __entry->len, 8)
>>> +                    )
>>> +);
>>> +
>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>   #endif
>>>
>>> -- 
>>> 2.25.1
>>>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10  5:29       ` Somalapuram, Amaranath
@ 2022-02-10  7:09         ` Christian König
  2022-02-10  7:34           ` Somalapuram, Amaranath
  2022-02-10 11:59         ` Sharma, Shashank
  1 sibling, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-10  7:09 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Alex Deucher, Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list, Shashank Sharma

Am 10.02.22 um 06:29 schrieb Somalapuram, Amaranath:
>
> On 2/9/2022 1:17 PM, Christian König wrote:
>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>>> <Amaranath.Somalapuram@amd.com> wrote:
>>>> Dump the list of register values to trace event on GPU reset.
>>>>
>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 
>>>> ++++++++++++++++++++-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 1e651b959141..057922fb7e37 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>>> amdgpu_device *adev,
>>>>          return r;
>>>>   }
>>>>
>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>> +{
>>>> +       int i;
>>>> +       uint32_t reg_value[128];
>>>> +
>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>>> +               if (adev->asic_type >= CHIP_NAVI10)
>>> This check should be against CHIP_VEGA10.  Also, this only allows for
>>> GC registers.  If we wanted to dump other registers, we'd need a
>>> different macro.  Might be better to just use RREG32 here for
>>> everything and then encode the full offset using
>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
>>> about how to handle gfxoff in this case.  gfxoff needs to be disabled
>>> or we'll hang the chip if we try and read GC or SDMA registers via
>>> MMIO which will adversely affect the hang signature.
>>
>> Well this should execute right before a GPU reset, so I think it 
>> shouldn't matter if we hang the chip or not as long as the read comes 
>> back correctly (I remember a very long UVD debug session because of 
>> this).
>>
>> But in general I agree, we should just use RREG32() here and always 
>> encode the full register offset.
>>
>> Regards,
>> Christian.
>>
> Can I use something like this:
>
> +                       reg_value[i] = 
> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
> + [adev->reset_dump_reg_list[i][1]]
> + [adev->reset_dump_reg_list[i][2]])
> +                                 + adev->reset_dump_reg_list[i][3]);
>
> ip --> adev->reset_dump_reg_list[i][0]
>
> inst --> adev->reset_dump_reg_list[i][1]
>
> BASE_IDX--> adev->reset_dump_reg_list[i][2]
>
> reg --> adev->reset_dump_reg_list[i][3]

No, that won't work.

What you need to do is to use the full 32bit address of the register. 
Userspace can worry about figuring out which ip, instance, base and reg 
to resolve into that address.

Regards,
Christian.

>
> which requires 4 values in user space for each register.
>
> using any existing macro like RREG32_SOC15** will not be able to pass 
> proper argument from user space (like ip##_HWIP or reg##_BASE_IDX)
>
>>
>>>
>>> Alex
>>>
>>>> +                       reg_value[i] = RREG32_SOC15_IP(GC, 
>>>> adev->reset_dump_reg_list[i]);
>>>> +               else
>>>> +                       reg_value[i] = 
>>>> RREG32(adev->reset_dump_reg_list[i]);
>>>> +       }
>>>> +
>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, 
>>>> reg_value, i);
>>>> +
>>>> +       return 0;
>>>> +}
>>>> +
>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>                           struct amdgpu_reset_context *reset_context)
>>>>   {
>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
>>>> *device_list_handle,
>>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>>                                  if (!queue_work(system_unbound_wq, 
>>>> &tmp_adev->xgmi_reset_work))
>>>>                                          r = -EALREADY;
>>>> -                       } else
>>>> +                       } else {
>>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>>> +                       }
>>>>
>>>>                          if (r) {
>>>>                                  dev_err(tmp_adev->dev, "ASIC reset 
>>>> failed with error, %d for drm dev, %s",
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>> index d855cb53c7e0..3fe33de3564a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>>                        __entry->seqno)
>>>>   );
>>>>
>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>>> +           TP_ARGS(address, value, length),
>>>> +           TP_STRUCT__entry(
>>>> +                            __array(long, address, 128)
>>>> +                            __array(uint32_t, value, 128)
>>>> +                            __field(int, len)
>>>> +                            ),
>>>> +           TP_fast_assign(
>>>> +                          memcpy(__entry->address, address, 128);
>>>> +                          memcpy(__entry->value,  value, 128);
>>>> +                          __entry->len = length;
>>>> +                          ),
>>>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>>>> +                     __print_array(__entry->address, __entry->len, 
>>>> 8),
>>>> +                     __print_array(__entry->value, __entry->len, 8)
>>>> +                    )
>>>> +);
>>>> +
>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>>   #endif
>>>>
>>>> -- 
>>>> 2.25.1
>>>>
>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10  7:09         ` Christian König
@ 2022-02-10  7:34           ` Somalapuram, Amaranath
  2022-02-10  7:38             ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-10  7:34 UTC (permalink / raw)
  To: Christian König, Alex Deucher, Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list, Shashank Sharma


On 2/10/2022 12:39 PM, Christian König wrote:
> Am 10.02.22 um 06:29 schrieb Somalapuram, Amaranath:
>>
>> On 2/9/2022 1:17 PM, Christian König wrote:
>>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>>>> <Amaranath.Somalapuram@amd.com> wrote:
>>>>> Dump the list of register values to trace event on GPU reset.
>>>>>
>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 
>>>>> ++++++++++++++++++++-
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
>>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 1e651b959141..057922fb7e37 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>>>> amdgpu_device *adev,
>>>>>          return r;
>>>>>   }
>>>>>
>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>> +{
>>>>> +       int i;
>>>>> +       uint32_t reg_value[128];
>>>>> +
>>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>>>> +               if (adev->asic_type >= CHIP_NAVI10)
>>>> This check should be against CHIP_VEGA10.  Also, this only allows for
>>>> GC registers.  If we wanted to dump other registers, we'd need a
>>>> different macro.  Might be better to just use RREG32 here for
>>>> everything and then encode the full offset using
>>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
>>>> about how to handle gfxoff in this case.  gfxoff needs to be disabled
>>>> or we'll hang the chip if we try and read GC or SDMA registers via
>>>> MMIO which will adversely affect the hang signature.
>>>
>>> Well this should execute right before a GPU reset, so I think it 
>>> shouldn't matter if we hang the chip or not as long as the read 
>>> comes back correctly (I remember a very long UVD debug session 
>>> because of this).
>>>
>>> But in general I agree, we should just use RREG32() here and always 
>>> encode the full register offset.
>>>
>>> Regards,
>>> Christian.
>>>
>> Can I use something like this:
>>
>> +                       reg_value[i] = 
>> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
>> + [adev->reset_dump_reg_list[i][1]]
>> + [adev->reset_dump_reg_list[i][2]])
>> +                                 + adev->reset_dump_reg_list[i][3]);
>>
>> ip --> adev->reset_dump_reg_list[i][0]
>>
>> inst --> adev->reset_dump_reg_list[i][1]
>>
>> BASE_IDX--> adev->reset_dump_reg_list[i][2]
>>
>> reg --> adev->reset_dump_reg_list[i][3]
>
> No, that won't work.
>
> What you need to do is to use the full 32bit address of the register. 
> Userspace can worry about figuring out which ip, instance, base and 
> reg to resolve into that address.
>
> Regards,
> Christian.
>
Thanks Christian.

should I consider using gfxoff like below code or not required:
amdgpu_gfx_off_ctrl(adev, false);
amdgpu_gfx_off_ctrl(adev, true);


Regards,

S.Amarnath
>>
>> which requires 4 values in user space for each register.
>>
>> using any existing macro like RREG32_SOC15** will not be able to pass 
>> proper argument from user space (like ip##_HWIP or reg##_BASE_IDX)
>>
>>>
>>>>
>>>> Alex
>>>>
>>>>> +                       reg_value[i] = RREG32_SOC15_IP(GC, 
>>>>> adev->reset_dump_reg_list[i]);
>>>>> +               else
>>>>> +                       reg_value[i] = 
>>>>> RREG32(adev->reset_dump_reg_list[i]);
>>>>> +       }
>>>>> +
>>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, 
>>>>> reg_value, i);
>>>>> +
>>>>> +       return 0;
>>>>> +}
>>>>> +
>>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>                           struct amdgpu_reset_context *reset_context)
>>>>>   {
>>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
>>>>> *device_list_handle,
>>>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>>>                                  if 
>>>>> (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>>                                          r = -EALREADY;
>>>>> -                       } else
>>>>> +                       } else {
>>>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>>>> +                       }
>>>>>
>>>>>                          if (r) {
>>>>>                                  dev_err(tmp_adev->dev, "ASIC 
>>>>> reset failed with error, %d for drm dev, %s",
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>> index d855cb53c7e0..3fe33de3564a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>>>                        __entry->seqno)
>>>>>   );
>>>>>
>>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>>>> +           TP_ARGS(address, value, length),
>>>>> +           TP_STRUCT__entry(
>>>>> +                            __array(long, address, 128)
>>>>> +                            __array(uint32_t, value, 128)
>>>>> +                            __field(int, len)
>>>>> +                            ),
>>>>> +           TP_fast_assign(
>>>>> +                          memcpy(__entry->address, address, 128);
>>>>> +                          memcpy(__entry->value, value, 128);
>>>>> +                          __entry->len = length;
>>>>> +                          ),
>>>>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>>>>> +                     __print_array(__entry->address, 
>>>>> __entry->len, 8),
>>>>> +                     __print_array(__entry->value, __entry->len, 8)
>>>>> +                    )
>>>>> +);
>>>>> +
>>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>>>   #endif
>>>>>
>>>>> -- 
>>>>> 2.25.1
>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10  7:34           ` Somalapuram, Amaranath
@ 2022-02-10  7:38             ` Christian König
  2022-02-10 13:18               ` Sharma, Shashank
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-10  7:38 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Alex Deucher, Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list, Shashank Sharma

Am 10.02.22 um 08:34 schrieb Somalapuram, Amaranath:
>
> On 2/10/2022 12:39 PM, Christian König wrote:
>> Am 10.02.22 um 06:29 schrieb Somalapuram, Amaranath:
>>>
>>> On 2/9/2022 1:17 PM, Christian König wrote:
>>>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>>>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>>>>> <Amaranath.Somalapuram@amd.com> wrote:
>>>>>> Dump the list of register values to trace event on GPU reset.
>>>>>>
>>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 
>>>>>> ++++++++++++++++++++-
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 
>>>>>> +++++++++++++++++++
>>>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index 1e651b959141..057922fb7e37 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>>>>> amdgpu_device *adev,
>>>>>>          return r;
>>>>>>   }
>>>>>>
>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>> +{
>>>>>> +       int i;
>>>>>> +       uint32_t reg_value[128];
>>>>>> +
>>>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>>>>> +               if (adev->asic_type >= CHIP_NAVI10)
>>>>> This check should be against CHIP_VEGA10.  Also, this only allows for
>>>>> GC registers.  If we wanted to dump other registers, we'd need a
>>>>> different macro.  Might be better to just use RREG32 here for
>>>>> everything and then encode the full offset using
>>>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
>>>>> about how to handle gfxoff in this case.  gfxoff needs to be disabled
>>>>> or we'll hang the chip if we try and read GC or SDMA registers via
>>>>> MMIO which will adversely affect the hang signature.
>>>>
>>>> Well this should execute right before a GPU reset, so I think it 
>>>> shouldn't matter if we hang the chip or not as long as the read 
>>>> comes back correctly (I remember a very long UVD debug session 
>>>> because of this).
>>>>
>>>> But in general I agree, we should just use RREG32() here and always 
>>>> encode the full register offset.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>> Can I use something like this:
>>>
>>> +                       reg_value[i] = 
>>> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
>>> + [adev->reset_dump_reg_list[i][1]]
>>> + [adev->reset_dump_reg_list[i][2]])
>>> +                                 + adev->reset_dump_reg_list[i][3]);
>>>
>>> ip --> adev->reset_dump_reg_list[i][0]
>>>
>>> inst --> adev->reset_dump_reg_list[i][1]
>>>
>>> BASE_IDX--> adev->reset_dump_reg_list[i][2]
>>>
>>> reg --> adev->reset_dump_reg_list[i][3]
>>
>> No, that won't work.
>>
>> What you need to do is to use the full 32bit address of the register. 
>> Userspace can worry about figuring out which ip, instance, base and 
>> reg to resolve into that address.
>>
>> Regards,
>> Christian.
>>
> Thanks Christian.
>
> should I consider using gfxoff like below code or not required:
> amdgpu_gfx_off_ctrl(adev, false);
> amdgpu_gfx_off_ctrl(adev, true);

That's a really good question I can't fully answer.

I think we don't want that because the GPU is stuck when the dump is 
made, but better let Alex comment as well.

Regards,
Christian.

>
>
> Regards,
>
> S.Amarnath
>>>
>>> which requires 4 values in user space for each register.
>>>
>>> using any existing macro like RREG32_SOC15** will not be able to 
>>> pass proper argument from user space (like ip##_HWIP or reg##_BASE_IDX)
>>>
>>>>
>>>>>
>>>>> Alex
>>>>>
>>>>>> + reg_value[i] = RREG32_SOC15_IP(GC, adev->reset_dump_reg_list[i]);
>>>>>> +               else
>>>>>> +                       reg_value[i] = 
>>>>>> RREG32(adev->reset_dump_reg_list[i]);
>>>>>> +       }
>>>>>> +
>>>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, 
>>>>>> reg_value, i);
>>>>>> +
>>>>>> +       return 0;
>>>>>> +}
>>>>>> +
>>>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>                           struct amdgpu_reset_context 
>>>>>> *reset_context)
>>>>>>   {
>>>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
>>>>>> *device_list_handle,
>>>>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>>>>                                  if 
>>>>>> (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>>>                                          r = -EALREADY;
>>>>>> -                       } else
>>>>>> +                       } else {
>>>>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>>>>> +                       }
>>>>>>
>>>>>>                          if (r) {
>>>>>> dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm 
>>>>>> dev, %s",
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>> index d855cb53c7e0..3fe33de3564a 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>>>>                        __entry->seqno)
>>>>>>   );
>>>>>>
>>>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>>>>> +           TP_ARGS(address, value, length),
>>>>>> +           TP_STRUCT__entry(
>>>>>> +                            __array(long, address, 128)
>>>>>> +                            __array(uint32_t, value, 128)
>>>>>> +                            __field(int, len)
>>>>>> +                            ),
>>>>>> +           TP_fast_assign(
>>>>>> +                          memcpy(__entry->address, address, 128);
>>>>>> +                          memcpy(__entry->value, value, 128);
>>>>>> +                          __entry->len = length;
>>>>>> +                          ),
>>>>>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>>>>>> +                     __print_array(__entry->address, 
>>>>>> __entry->len, 8),
>>>>>> +                     __print_array(__entry->value, __entry->len, 8)
>>>>>> +                    )
>>>>>> +);
>>>>>> +
>>>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>>>>   #endif
>>>>>>
>>>>>> -- 
>>>>>> 2.25.1
>>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10  7:38             ` Christian König
@ 2022-02-10 13:18               ` Sharma, Shashank
  2022-02-10 14:05                 ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Sharma, Shashank @ 2022-02-10 13:18 UTC (permalink / raw)
  To: Christian König, Somalapuram, Amaranath, Alex Deucher,
	Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list



On 2/10/2022 8:38 AM, Christian König wrote:
> Am 10.02.22 um 08:34 schrieb Somalapuram, Amaranath:
>>
>> On 2/10/2022 12:39 PM, Christian König wrote:
>>> Am 10.02.22 um 06:29 schrieb Somalapuram, Amaranath:
>>>>
>>>> On 2/9/2022 1:17 PM, Christian König wrote:
>>>>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>>>>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>>>>>> <Amaranath.Somalapuram@amd.com> wrote:
>>>>>>> Dump the list of register values to trace event on GPU reset.
>>>>>>>
>>>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 
>>>>>>> ++++++++++++++++++++-
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 
>>>>>>> +++++++++++++++++++
>>>>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> index 1e651b959141..057922fb7e37 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>>>>>> amdgpu_device *adev,
>>>>>>>          return r;
>>>>>>>   }
>>>>>>>
>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>> +{
>>>>>>> +       int i;
>>>>>>> +       uint32_t reg_value[128];
>>>>>>> +
>>>>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>>>>>> +               if (adev->asic_type >= CHIP_NAVI10)
>>>>>> This check should be against CHIP_VEGA10.  Also, this only allows for
>>>>>> GC registers.  If we wanted to dump other registers, we'd need a
>>>>>> different macro.  Might be better to just use RREG32 here for
>>>>>> everything and then encode the full offset using
>>>>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
>>>>>> about how to handle gfxoff in this case.  gfxoff needs to be disabled
>>>>>> or we'll hang the chip if we try and read GC or SDMA registers via
>>>>>> MMIO which will adversely affect the hang signature.
>>>>>
>>>>> Well this should execute right before a GPU reset, so I think it 
>>>>> shouldn't matter if we hang the chip or not as long as the read 
>>>>> comes back correctly (I remember a very long UVD debug session 
>>>>> because of this).
>>>>>
>>>>> But in general I agree, we should just use RREG32() here and always 
>>>>> encode the full register offset.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>> Can I use something like this:
>>>>
>>>> +                       reg_value[i] = 
>>>> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
>>>> + [adev->reset_dump_reg_list[i][1]]
>>>> + [adev->reset_dump_reg_list[i][2]])
>>>> +                                 + adev->reset_dump_reg_list[i][3]);
>>>>
>>>> ip --> adev->reset_dump_reg_list[i][0]
>>>>
>>>> inst --> adev->reset_dump_reg_list[i][1]
>>>>
>>>> BASE_IDX--> adev->reset_dump_reg_list[i][2]
>>>>
>>>> reg --> adev->reset_dump_reg_list[i][3]
>>>
>>> No, that won't work.
>>>
>>> What you need to do is to use the full 32bit address of the register. 
>>> Userspace can worry about figuring out which ip, instance, base and 
>>> reg to resolve into that address.
>>>
>>> Regards,
>>> Christian.
>>>
>> Thanks Christian.
>>
>> should I consider using gfxoff like below code or not required:
>> amdgpu_gfx_off_ctrl(adev, false);
>> amdgpu_gfx_off_ctrl(adev, true);
> 
> That's a really good question I can't fully answer.
> 
> I think we don't want that because the GPU is stuck when the dump is 
> made, but better let Alex comment as well.
> 
> Regards,
> Christian.


I had a quick look at the function amdgpu_gfx_off_ctrl, and it locks 
this mutex internally:
mutex_lock(&adev->gfx.gfx_off_mutex);

and the reference state is tracked in:
adev->gfx.gfx_off_state

We can do something like this maybe:
- If (adev->gfx_off_state == 0) {
   trylock(gfx_off_mutex)
   read_regs_now;
   unlock_mutex();
}

How does it sounds ?

- Shashank
> 
>>
>>
>> Regards,
>>
>> S.Amarnath
>>>>
>>>> which requires 4 values in user space for each register.
>>>>
>>>> using any existing macro like RREG32_SOC15** will not be able to 
>>>> pass proper argument from user space (like ip##_HWIP or reg##_BASE_IDX)
>>>>
>>>>>
>>>>>>
>>>>>> Alex
>>>>>>
>>>>>>> + reg_value[i] = RREG32_SOC15_IP(GC, adev->reset_dump_reg_list[i]);
>>>>>>> +               else
>>>>>>> +                       reg_value[i] = 
>>>>>>> RREG32(adev->reset_dump_reg_list[i]);
>>>>>>> +       }
>>>>>>> +
>>>>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, 
>>>>>>> reg_value, i);
>>>>>>> +
>>>>>>> +       return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>>                           struct amdgpu_reset_context 
>>>>>>> *reset_context)
>>>>>>>   {
>>>>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
>>>>>>> *device_list_handle,
>>>>>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>>>>>                                  if 
>>>>>>> (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>>>>                                          r = -EALREADY;
>>>>>>> -                       } else
>>>>>>> +                       } else {
>>>>>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>>>>>> +                       }
>>>>>>>
>>>>>>>                          if (r) {
>>>>>>> dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm 
>>>>>>> dev, %s",
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>> index d855cb53c7e0..3fe33de3564a 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>>>>>                        __entry->seqno)
>>>>>>>   );
>>>>>>>
>>>>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>>>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>>>>>> +           TP_ARGS(address, value, length),
>>>>>>> +           TP_STRUCT__entry(
>>>>>>> +                            __array(long, address, 128)
>>>>>>> +                            __array(uint32_t, value, 128)
>>>>>>> +                            __field(int, len)
>>>>>>> +                            ),
>>>>>>> +           TP_fast_assign(
>>>>>>> +                          memcpy(__entry->address, address, 128);
>>>>>>> +                          memcpy(__entry->value, value, 128);
>>>>>>> +                          __entry->len = length;
>>>>>>> +                          ),
>>>>>>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>>>>>>> +                     __print_array(__entry->address, 
>>>>>>> __entry->len, 8),
>>>>>>> +                     __print_array(__entry->value, __entry->len, 8)
>>>>>>> +                    )
>>>>>>> +);
>>>>>>> +
>>>>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>>>>>   #endif
>>>>>>>
>>>>>>> -- 
>>>>>>> 2.25.1
>>>>>>>
>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10 13:18               ` Sharma, Shashank
@ 2022-02-10 14:05                 ` Christian König
  2022-02-10 14:11                   ` Sharma, Shashank
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-10 14:05 UTC (permalink / raw)
  To: Sharma, Shashank, Christian König, Somalapuram, Amaranath,
	Alex Deucher, Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list

Am 10.02.22 um 14:18 schrieb Sharma, Shashank:
>
>
> On 2/10/2022 8:38 AM, Christian König wrote:
>> Am 10.02.22 um 08:34 schrieb Somalapuram, Amaranath:
>>>
>>> On 2/10/2022 12:39 PM, Christian König wrote:
>>>> Am 10.02.22 um 06:29 schrieb Somalapuram, Amaranath:
>>>>>
>>>>> On 2/9/2022 1:17 PM, Christian König wrote:
>>>>>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>>>>>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>>>>>>> <Amaranath.Somalapuram@amd.com> wrote:
>>>>>>>> Dump the list of register values to trace event on GPU reset.
>>>>>>>>
>>>>>>>> Signed-off-by: Somalapuram Amaranath 
>>>>>>>> <Amaranath.Somalapuram@amd.com>
>>>>>>>> ---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 
>>>>>>>> ++++++++++++++++++++-
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 
>>>>>>>> +++++++++++++++++++
>>>>>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> index 1e651b959141..057922fb7e37 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>>>>>>> amdgpu_device *adev,
>>>>>>>>          return r;
>>>>>>>>   }
>>>>>>>>
>>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>> +{
>>>>>>>> +       int i;
>>>>>>>> +       uint32_t reg_value[128];
>>>>>>>> +
>>>>>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>>>>>>> +               if (adev->asic_type >= CHIP_NAVI10)
>>>>>>> This check should be against CHIP_VEGA10.  Also, this only 
>>>>>>> allows for
>>>>>>> GC registers.  If we wanted to dump other registers, we'd need a
>>>>>>> different macro.  Might be better to just use RREG32 here for
>>>>>>> everything and then encode the full offset using
>>>>>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to 
>>>>>>> think
>>>>>>> about how to handle gfxoff in this case.  gfxoff needs to be 
>>>>>>> disabled
>>>>>>> or we'll hang the chip if we try and read GC or SDMA registers via
>>>>>>> MMIO which will adversely affect the hang signature.
>>>>>>
>>>>>> Well this should execute right before a GPU reset, so I think it 
>>>>>> shouldn't matter if we hang the chip or not as long as the read 
>>>>>> comes back correctly (I remember a very long UVD debug session 
>>>>>> because of this).
>>>>>>
>>>>>> But in general I agree, we should just use RREG32() here and 
>>>>>> always encode the full register offset.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>> Can I use something like this:
>>>>>
>>>>> +                       reg_value[i] = 
>>>>> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
>>>>> + [adev->reset_dump_reg_list[i][1]]
>>>>> + [adev->reset_dump_reg_list[i][2]])
>>>>> +                                 + adev->reset_dump_reg_list[i][3]);
>>>>>
>>>>> ip --> adev->reset_dump_reg_list[i][0]
>>>>>
>>>>> inst --> adev->reset_dump_reg_list[i][1]
>>>>>
>>>>> BASE_IDX--> adev->reset_dump_reg_list[i][2]
>>>>>
>>>>> reg --> adev->reset_dump_reg_list[i][3]
>>>>
>>>> No, that won't work.
>>>>
>>>> What you need to do is to use the full 32bit address of the 
>>>> register. Userspace can worry about figuring out which ip, 
>>>> instance, base and reg to resolve into that address.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>> Thanks Christian.
>>>
>>> should I consider using gfxoff like below code or not required:
>>> amdgpu_gfx_off_ctrl(adev, false);
>>> amdgpu_gfx_off_ctrl(adev, true);
>>
>> That's a really good question I can't fully answer.
>>
>> I think we don't want that because the GPU is stuck when the dump is 
>> made, but better let Alex comment as well.
>>
>> Regards,
>> Christian.
>
>
> I had a quick look at the function amdgpu_gfx_off_ctrl, and it locks 
> this mutex internally:
> mutex_lock(&adev->gfx.gfx_off_mutex);
>
> and the reference state is tracked in:
> adev->gfx.gfx_off_state
>
> We can do something like this maybe:
> - If (adev->gfx_off_state == 0) {
>   trylock(gfx_off_mutex)
>   read_regs_now;
>   unlock_mutex();
> }
>
> How does it sounds ?

As far as I know that won't work. GFX_off is only disabled intentionally 
in very few places.

So we will probably never get a register trace with that.

Regards,
Christian.

>
> - Shashank
>>
>>>
>>>
>>> Regards,
>>>
>>> S.Amarnath
>>>>>
>>>>> which requires 4 values in user space for each register.
>>>>>
>>>>> using any existing macro like RREG32_SOC15** will not be able to 
>>>>> pass proper argument from user space (like ip##_HWIP or 
>>>>> reg##_BASE_IDX)
>>>>>
>>>>>>
>>>>>>>
>>>>>>> Alex
>>>>>>>
>>>>>>>> + reg_value[i] = RREG32_SOC15_IP(GC, 
>>>>>>>> adev->reset_dump_reg_list[i]);
>>>>>>>> +               else
>>>>>>>> +                       reg_value[i] = 
>>>>>>>> RREG32(adev->reset_dump_reg_list[i]);
>>>>>>>> +       }
>>>>>>>> +
>>>>>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, 
>>>>>>>> reg_value, i);
>>>>>>>> +
>>>>>>>> +       return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>>>                           struct amdgpu_reset_context 
>>>>>>>> *reset_context)
>>>>>>>>   {
>>>>>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct 
>>>>>>>> list_head *device_list_handle,
>>>>>>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>>>>>>                                  if 
>>>>>>>> (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>>>>>                                          r = -EALREADY;
>>>>>>>> -                       } else
>>>>>>>> +                       } else {
>>>>>>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>>>>>>> +                       }
>>>>>>>>
>>>>>>>>                          if (r) {
>>>>>>>> dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for 
>>>>>>>> drm dev, %s",
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>>> index d855cb53c7e0..3fe33de3564a 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>>>>>>                        __entry->seqno)
>>>>>>>>   );
>>>>>>>>
>>>>>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>>>>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>>>>>>> +           TP_ARGS(address, value, length),
>>>>>>>> +           TP_STRUCT__entry(
>>>>>>>> +                            __array(long, address, 128)
>>>>>>>> +                            __array(uint32_t, value, 128)
>>>>>>>> +                            __field(int, len)
>>>>>>>> +                            ),
>>>>>>>> +           TP_fast_assign(
>>>>>>>> + memcpy(__entry->address, address, 128);
>>>>>>>> +                          memcpy(__entry->value, value, 128);
>>>>>>>> +                          __entry->len = length;
>>>>>>>> +                          ),
>>>>>>>> +           TP_printk("amdgpu register dump offset: %s value: 
>>>>>>>> %s ",
>>>>>>>> + __print_array(__entry->address, __entry->len, 8),
>>>>>>>> + __print_array(__entry->value, __entry->len, 8)
>>>>>>>> +                    )
>>>>>>>> +);
>>>>>>>> +
>>>>>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>>>>>>   #endif
>>>>>>>>
>>>>>>>> -- 
>>>>>>>> 2.25.1
>>>>>>>>
>>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10 14:05                 ` Christian König
@ 2022-02-10 14:11                   ` Sharma, Shashank
  2022-02-10 15:59                     ` Alex Deucher
  0 siblings, 1 reply; 21+ messages in thread
From: Sharma, Shashank @ 2022-02-10 14:11 UTC (permalink / raw)
  To: Christian König, Christian König, Somalapuram,
	Amaranath, Alex Deucher, Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list



On 2/10/2022 3:05 PM, Christian König wrote:
> Am 10.02.22 um 14:18 schrieb Sharma, Shashank:
>>
>>
>> On 2/10/2022 8:38 AM, Christian König wrote:
>>> Am 10.02.22 um 08:34 schrieb Somalapuram, Amaranath:
>>>>
>>>> On 2/10/2022 12:39 PM, Christian König wrote:
>>>>> Am 10.02.22 um 06:29 schrieb Somalapuram, Amaranath:
>>>>>>
>>>>>> On 2/9/2022 1:17 PM, Christian König wrote:
>>>>>>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>>>>>>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>>>>>>>> <Amaranath.Somalapuram@amd.com> wrote:
>>>>>>>>> Dump the list of register values to trace event on GPU reset.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Somalapuram Amaranath 
>>>>>>>>> <Amaranath.Somalapuram@amd.com>
>>>>>>>>> ---
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 
>>>>>>>>> ++++++++++++++++++++-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 
>>>>>>>>> +++++++++++++++++++
>>>>>>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> index 1e651b959141..057922fb7e37 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>          return r;
>>>>>>>>>   }
>>>>>>>>>
>>>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>> +{
>>>>>>>>> +       int i;
>>>>>>>>> +       uint32_t reg_value[128];
>>>>>>>>> +
>>>>>>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>>>>>>>> +               if (adev->asic_type >= CHIP_NAVI10)
>>>>>>>> This check should be against CHIP_VEGA10.  Also, this only 
>>>>>>>> allows for
>>>>>>>> GC registers.  If we wanted to dump other registers, we'd need a
>>>>>>>> different macro.  Might be better to just use RREG32 here for
>>>>>>>> everything and then encode the full offset using
>>>>>>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to 
>>>>>>>> think
>>>>>>>> about how to handle gfxoff in this case.  gfxoff needs to be 
>>>>>>>> disabled
>>>>>>>> or we'll hang the chip if we try and read GC or SDMA registers via
>>>>>>>> MMIO which will adversely affect the hang signature.
>>>>>>>
>>>>>>> Well this should execute right before a GPU reset, so I think it 
>>>>>>> shouldn't matter if we hang the chip or not as long as the read 
>>>>>>> comes back correctly (I remember a very long UVD debug session 
>>>>>>> because of this).
>>>>>>>
>>>>>>> But in general I agree, we should just use RREG32() here and 
>>>>>>> always encode the full register offset.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>> Can I use something like this:
>>>>>>
>>>>>> +                       reg_value[i] = 
>>>>>> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
>>>>>> + [adev->reset_dump_reg_list[i][1]]
>>>>>> + [adev->reset_dump_reg_list[i][2]])
>>>>>> +                                 + adev->reset_dump_reg_list[i][3]);
>>>>>>
>>>>>> ip --> adev->reset_dump_reg_list[i][0]
>>>>>>
>>>>>> inst --> adev->reset_dump_reg_list[i][1]
>>>>>>
>>>>>> BASE_IDX--> adev->reset_dump_reg_list[i][2]
>>>>>>
>>>>>> reg --> adev->reset_dump_reg_list[i][3]
>>>>>
>>>>> No, that won't work.
>>>>>
>>>>> What you need to do is to use the full 32bit address of the 
>>>>> register. Userspace can worry about figuring out which ip, 
>>>>> instance, base and reg to resolve into that address.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>> Thanks Christian.
>>>>
>>>> should I consider using gfxoff like below code or not required:
>>>> amdgpu_gfx_off_ctrl(adev, false);
>>>> amdgpu_gfx_off_ctrl(adev, true);
>>>
>>> That's a really good question I can't fully answer.
>>>
>>> I think we don't want that because the GPU is stuck when the dump is 
>>> made, but better let Alex comment as well.
>>>
>>> Regards,
>>> Christian.
>>
>>
>> I had a quick look at the function amdgpu_gfx_off_ctrl, and it locks 
>> this mutex internally:
>> mutex_lock(&adev->gfx.gfx_off_mutex);
>>
>> and the reference state is tracked in:
>> adev->gfx.gfx_off_state
>>
>> We can do something like this maybe:
>> - If (adev->gfx_off_state == 0) {
>>   trylock(gfx_off_mutex)
>>   read_regs_now;
>>   unlock_mutex();
>> }
>>
>> How does it sounds ?
> 
> As far as I know that won't work. GFX_off is only disabled intentionally 
> in very few places.
> 
> So we will probably never get a register trace with that.
> 

Ok, I don't know much about this feature, but due to the name I was 
udner the impression that gfx_off will be mostly disabled. But if it is 
hardly ever disabled, we need more infomrmation about it first, like 
when is this disabled and why ?

Alex ?

- Shashank

> Regards,
> Christian.
> 
>>
>> - Shashank
>>>
>>>>
>>>>
>>>> Regards,
>>>>
>>>> S.Amarnath
>>>>>>
>>>>>> which requires 4 values in user space for each register.
>>>>>>
>>>>>> using any existing macro like RREG32_SOC15** will not be able to 
>>>>>> pass proper argument from user space (like ip##_HWIP or 
>>>>>> reg##_BASE_IDX)
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Alex
>>>>>>>>
>>>>>>>>> + reg_value[i] = RREG32_SOC15_IP(GC, 
>>>>>>>>> adev->reset_dump_reg_list[i]);
>>>>>>>>> +               else
>>>>>>>>> +                       reg_value[i] = 
>>>>>>>>> RREG32(adev->reset_dump_reg_list[i]);
>>>>>>>>> +       }
>>>>>>>>> +
>>>>>>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, 
>>>>>>>>> reg_value, i);
>>>>>>>>> +
>>>>>>>>> +       return 0;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>>>>                           struct amdgpu_reset_context 
>>>>>>>>> *reset_context)
>>>>>>>>>   {
>>>>>>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct 
>>>>>>>>> list_head *device_list_handle,
>>>>>>>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>>>>>>>                                  if 
>>>>>>>>> (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>>>>>>                                          r = -EALREADY;
>>>>>>>>> -                       } else
>>>>>>>>> +                       } else {
>>>>>>>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>>>>>>>> +                       }
>>>>>>>>>
>>>>>>>>>                          if (r) {
>>>>>>>>> dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for 
>>>>>>>>> drm dev, %s",
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>>>> index d855cb53c7e0..3fe33de3564a 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>>>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>>>>>>>                        __entry->seqno)
>>>>>>>>>   );
>>>>>>>>>
>>>>>>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>>>>>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>>>>>>>> +           TP_ARGS(address, value, length),
>>>>>>>>> +           TP_STRUCT__entry(
>>>>>>>>> +                            __array(long, address, 128)
>>>>>>>>> +                            __array(uint32_t, value, 128)
>>>>>>>>> +                            __field(int, len)
>>>>>>>>> +                            ),
>>>>>>>>> +           TP_fast_assign(
>>>>>>>>> + memcpy(__entry->address, address, 128);
>>>>>>>>> +                          memcpy(__entry->value, value, 128);
>>>>>>>>> +                          __entry->len = length;
>>>>>>>>> +                          ),
>>>>>>>>> +           TP_printk("amdgpu register dump offset: %s value: 
>>>>>>>>> %s ",
>>>>>>>>> + __print_array(__entry->address, __entry->len, 8),
>>>>>>>>> + __print_array(__entry->value, __entry->len, 8)
>>>>>>>>> +                    )
>>>>>>>>> +);
>>>>>>>>> +
>>>>>>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>>>>>>>   #endif
>>>>>>>>>
>>>>>>>>> -- 
>>>>>>>>> 2.25.1
>>>>>>>>>
>>>>>>>
>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10 14:11                   ` Sharma, Shashank
@ 2022-02-10 15:59                     ` Alex Deucher
  0 siblings, 0 replies; 21+ messages in thread
From: Alex Deucher @ 2022-02-10 15:59 UTC (permalink / raw)
  To: Sharma, Shashank
  Cc: Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx list, Deucher, Alexander,
	Christian König

On Thu, Feb 10, 2022 at 9:11 AM Sharma, Shashank
<shashank.sharma@amd.com> wrote:
>
>
>
> On 2/10/2022 3:05 PM, Christian König wrote:
> > Am 10.02.22 um 14:18 schrieb Sharma, Shashank:
> >>
> >>
> >> On 2/10/2022 8:38 AM, Christian König wrote:
> >>> Am 10.02.22 um 08:34 schrieb Somalapuram, Amaranath:
> >>>>
> >>>> On 2/10/2022 12:39 PM, Christian König wrote:
> >>>>> Am 10.02.22 um 06:29 schrieb Somalapuram, Amaranath:
> >>>>>>
> >>>>>> On 2/9/2022 1:17 PM, Christian König wrote:
> >>>>>>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
> >>>>>>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
> >>>>>>>> <Amaranath.Somalapuram@amd.com> wrote:
> >>>>>>>>> Dump the list of register values to trace event on GPU reset.
> >>>>>>>>>
> >>>>>>>>> Signed-off-by: Somalapuram Amaranath
> >>>>>>>>> <Amaranath.Somalapuram@amd.com>
> >>>>>>>>> ---
> >>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21
> >>>>>>>>> ++++++++++++++++++++-
> >>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19
> >>>>>>>>> +++++++++++++++++++
> >>>>>>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
> >>>>>>>>>
> >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>>>>> index 1e651b959141..057922fb7e37 100644
> >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct
> >>>>>>>>> amdgpu_device *adev,
> >>>>>>>>>          return r;
> >>>>>>>>>   }
> >>>>>>>>>
> >>>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> >>>>>>>>> +{
> >>>>>>>>> +       int i;
> >>>>>>>>> +       uint32_t reg_value[128];
> >>>>>>>>> +
> >>>>>>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
> >>>>>>>>> +               if (adev->asic_type >= CHIP_NAVI10)
> >>>>>>>> This check should be against CHIP_VEGA10.  Also, this only
> >>>>>>>> allows for
> >>>>>>>> GC registers.  If we wanted to dump other registers, we'd need a
> >>>>>>>> different macro.  Might be better to just use RREG32 here for
> >>>>>>>> everything and then encode the full offset using
> >>>>>>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to
> >>>>>>>> think
> >>>>>>>> about how to handle gfxoff in this case.  gfxoff needs to be
> >>>>>>>> disabled
> >>>>>>>> or we'll hang the chip if we try and read GC or SDMA registers via
> >>>>>>>> MMIO which will adversely affect the hang signature.
> >>>>>>>
> >>>>>>> Well this should execute right before a GPU reset, so I think it
> >>>>>>> shouldn't matter if we hang the chip or not as long as the read
> >>>>>>> comes back correctly (I remember a very long UVD debug session
> >>>>>>> because of this).
> >>>>>>>
> >>>>>>> But in general I agree, we should just use RREG32() here and
> >>>>>>> always encode the full register offset.
> >>>>>>>
> >>>>>>> Regards,
> >>>>>>> Christian.
> >>>>>>>
> >>>>>> Can I use something like this:
> >>>>>>
> >>>>>> +                       reg_value[i] =
> >>>>>> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
> >>>>>> + [adev->reset_dump_reg_list[i][1]]
> >>>>>> + [adev->reset_dump_reg_list[i][2]])
> >>>>>> +                                 + adev->reset_dump_reg_list[i][3]);
> >>>>>>
> >>>>>> ip --> adev->reset_dump_reg_list[i][0]
> >>>>>>
> >>>>>> inst --> adev->reset_dump_reg_list[i][1]
> >>>>>>
> >>>>>> BASE_IDX--> adev->reset_dump_reg_list[i][2]
> >>>>>>
> >>>>>> reg --> adev->reset_dump_reg_list[i][3]
> >>>>>
> >>>>> No, that won't work.
> >>>>>
> >>>>> What you need to do is to use the full 32bit address of the
> >>>>> register. Userspace can worry about figuring out which ip,
> >>>>> instance, base and reg to resolve into that address.
> >>>>>
> >>>>> Regards,
> >>>>> Christian.
> >>>>>
> >>>> Thanks Christian.
> >>>>
> >>>> should I consider using gfxoff like below code or not required:
> >>>> amdgpu_gfx_off_ctrl(adev, false);
> >>>> amdgpu_gfx_off_ctrl(adev, true);
> >>>
> >>> That's a really good question I can't fully answer.
> >>>
> >>> I think we don't want that because the GPU is stuck when the dump is
> >>> made, but better let Alex comment as well.
> >>>
> >>> Regards,
> >>> Christian.
> >>
> >>
> >> I had a quick look at the function amdgpu_gfx_off_ctrl, and it locks
> >> this mutex internally:
> >> mutex_lock(&adev->gfx.gfx_off_mutex);
> >>
> >> and the reference state is tracked in:
> >> adev->gfx.gfx_off_state
> >>
> >> We can do something like this maybe:
> >> - If (adev->gfx_off_state == 0) {
> >>   trylock(gfx_off_mutex)
> >>   read_regs_now;
> >>   unlock_mutex();
> >> }
> >>
> >> How does it sounds ?
> >
> > As far as I know that won't work. GFX_off is only disabled intentionally
> > in very few places.
> >
> > So we will probably never get a register trace with that.
> >
>
> Ok, I don't know much about this feature, but due to the name I was
> udner the impression that gfx_off will be mostly disabled. But if it is
> hardly ever disabled, we need more infomrmation about it first, like
> when is this disabled and why ?
>
> Alex ?

gfxoff is enabled most of the time.  The only times we disable it are
for certain features (reading gfx or sdma MMIO registers, using SQTT,
etc.).  The two main cases where it would get disabled are calling the
READ_MMR query to the INFO ioctl and if you change the power state to
a profiling mode (either via sysfs or the context ioctl).  If you
don't disable gfx off and you access a register via MMIO, the GFX
block will hang.  So depending on the the nature of the hang, gfxoff
may or may not be a problem.  E.g., if the hang is caused by VCN or
some power management thing gfxoff may still kick in and power down
the gfx block.  If the gfx block itself is hung, then most likely
gfxoff would not be active since the gfx block is active (and hung).
So if we didn't disable gfxoff and it is active, and then we try and
read the registers, we'll hang the gfx block.  That's probably fine
since we'll be reseting the GPU anyway, but it will mean any registers
in the dump will be worthless.

Alex

>
> - Shashank
>
> > Regards,
> > Christian.
> >
> >>
> >> - Shashank
> >>>
> >>>>
> >>>>
> >>>> Regards,
> >>>>
> >>>> S.Amarnath
> >>>>>>
> >>>>>> which requires 4 values in user space for each register.
> >>>>>>
> >>>>>> using any existing macro like RREG32_SOC15** will not be able to
> >>>>>> pass proper argument from user space (like ip##_HWIP or
> >>>>>> reg##_BASE_IDX)
> >>>>>>
> >>>>>>>
> >>>>>>>>
> >>>>>>>> Alex
> >>>>>>>>
> >>>>>>>>> + reg_value[i] = RREG32_SOC15_IP(GC,
> >>>>>>>>> adev->reset_dump_reg_list[i]);
> >>>>>>>>> +               else
> >>>>>>>>> +                       reg_value[i] =
> >>>>>>>>> RREG32(adev->reset_dump_reg_list[i]);
> >>>>>>>>> +       }
> >>>>>>>>> +
> >>>>>>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list,
> >>>>>>>>> reg_value, i);
> >>>>>>>>> +
> >>>>>>>>> +       return 0;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> >>>>>>>>>                           struct amdgpu_reset_context
> >>>>>>>>> *reset_context)
> >>>>>>>>>   {
> >>>>>>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct
> >>>>>>>>> list_head *device_list_handle,
> >>>>>>>>> tmp_adev->gmc.xgmi.pending_reset = false;
> >>>>>>>>>                                  if
> >>>>>>>>> (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
> >>>>>>>>>                                          r = -EALREADY;
> >>>>>>>>> -                       } else
> >>>>>>>>> +                       } else {
> >>>>>>>>> + amdgpu_reset_reg_dumps(tmp_adev);
> >>>>>>>>>                                  r = amdgpu_asic_reset(tmp_adev);
> >>>>>>>>> +                       }
> >>>>>>>>>
> >>>>>>>>>                          if (r) {
> >>>>>>>>> dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for
> >>>>>>>>> drm dev, %s",
> >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> >>>>>>>>> index d855cb53c7e0..3fe33de3564a 100644
> >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> >>>>>>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
> >>>>>>>>>                        __entry->seqno)
> >>>>>>>>>   );
> >>>>>>>>>
> >>>>>>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
> >>>>>>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
> >>>>>>>>> +           TP_ARGS(address, value, length),
> >>>>>>>>> +           TP_STRUCT__entry(
> >>>>>>>>> +                            __array(long, address, 128)
> >>>>>>>>> +                            __array(uint32_t, value, 128)
> >>>>>>>>> +                            __field(int, len)
> >>>>>>>>> +                            ),
> >>>>>>>>> +           TP_fast_assign(
> >>>>>>>>> + memcpy(__entry->address, address, 128);
> >>>>>>>>> +                          memcpy(__entry->value, value, 128);
> >>>>>>>>> +                          __entry->len = length;
> >>>>>>>>> +                          ),
> >>>>>>>>> +           TP_printk("amdgpu register dump offset: %s value:
> >>>>>>>>> %s ",
> >>>>>>>>> + __print_array(__entry->address, __entry->len, 8),
> >>>>>>>>> + __print_array(__entry->value, __entry->len, 8)
> >>>>>>>>> +                    )
> >>>>>>>>> +);
> >>>>>>>>> +
> >>>>>>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
> >>>>>>>>>   #endif
> >>>>>>>>>
> >>>>>>>>> --
> >>>>>>>>> 2.25.1
> >>>>>>>>>
> >>>>>>>
> >>>>>
> >>>
> >

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10  5:29       ` Somalapuram, Amaranath
  2022-02-10  7:09         ` Christian König
@ 2022-02-10 11:59         ` Sharma, Shashank
  2022-02-10 12:27           ` Christian König
  1 sibling, 1 reply; 21+ messages in thread
From: Sharma, Shashank @ 2022-02-10 11:59 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Christian König, Alex Deucher,
	Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list



On 2/10/2022 6:29 AM, Somalapuram, Amaranath wrote:
> 
> On 2/9/2022 1:17 PM, Christian König wrote:
>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>>> <Amaranath.Somalapuram@amd.com> wrote:
>>>> Dump the list of register values to trace event on GPU reset.
>>>>
>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 1e651b959141..057922fb7e37 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>>> amdgpu_device *adev,
>>>>          return r;
>>>>   }
>>>>
>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>> +{
>>>> +       int i;
>>>> +       uint32_t reg_value[128];
>>>> +
>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>>> +               if (adev->asic_type >= CHIP_NAVI10)
>>> This check should be against CHIP_VEGA10.  Also, this only allows for
>>> GC registers.  If we wanted to dump other registers, we'd need a
>>> different macro.  Might be better to just use RREG32 here for
>>> everything and then encode the full offset using
>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
>>> about how to handle gfxoff in this case.  gfxoff needs to be disabled
>>> or we'll hang the chip if we try and read GC or SDMA registers via
>>> MMIO which will adversely affect the hang signature.
>>
>> Well this should execute right before a GPU reset, so I think it 
>> shouldn't matter if we hang the chip or not as long as the read comes 
>> back correctly (I remember a very long UVD debug session because of 
>> this).
>>
>> But in general I agree, we should just use RREG32() here and always 
>> encode the full register offset.
>>
>> Regards,
>> Christian.
>>
> Can I use something like this:
> 
> +                       reg_value[i] = 
> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
> + [adev->reset_dump_reg_list[i][1]]
> + [adev->reset_dump_reg_list[i][2]])
> +                                 + adev->reset_dump_reg_list[i][3]);
> 
> ip --> adev->reset_dump_reg_list[i][0]
> 
> inst --> adev->reset_dump_reg_list[i][1]
> 
> BASE_IDX--> adev->reset_dump_reg_list[i][2]
> 
> reg --> adev->reset_dump_reg_list[i][3]
> 
> which requires 4 values in user space for each register.
> 
> using any existing macro like RREG32_SOC15** will not be able to pass 
> proper argument from user space (like ip##_HWIP or reg##_BASE_IDX)
> 


Why cant we use just a simple array
adev->reset_dump_reg_list[10] for both ip and reg offsets ?

Userspace can provide the IP engine enum in first entry of the array,
reset_dump_reg_list[0], and register offsets in other entries starting 
from 1. We can convert that into desirable engine substring using an 
array of char *, something like:

const char *ip_engine_name_substing[] = {
	/* Same order as enum amd_hw_ip_block_type */
	"GC", "HDP", ......
}

engine enum;
u32 ip = adev->reset_dump_reg_list[0];
const char *ip_name = ip_engine_name_subs[ip];

for (i = 0; i < 9; i++) {
	reg_val = RREG_SOC15_IP(ip_name, reset_dump_reg_list[i+1]);
}

- Shashank

>>
>>>
>>> Alex
>>>
>>>> +                       reg_value[i] = RREG32_SOC15_IP(GC, 
>>>> adev->reset_dump_reg_list[i]);
>>>> +               else
>>>> +                       reg_value[i] = 
>>>> RREG32(adev->reset_dump_reg_list[i]);
>>>> +       }
>>>> +
>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, 
>>>> i);
>>>> +
>>>> +       return 0;
>>>> +}
>>>> +
>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>                           struct amdgpu_reset_context *reset_context)
>>>>   {
>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
>>>> *device_list_handle,
>>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>>                                  if (!queue_work(system_unbound_wq, 
>>>> &tmp_adev->xgmi_reset_work))
>>>>                                          r = -EALREADY;
>>>> -                       } else
>>>> +                       } else {
>>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>>> +                       }
>>>>
>>>>                          if (r) {
>>>>                                  dev_err(tmp_adev->dev, "ASIC reset 
>>>> failed with error, %d for drm dev, %s",
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>> index d855cb53c7e0..3fe33de3564a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>>                        __entry->seqno)
>>>>   );
>>>>
>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>>> +           TP_ARGS(address, value, length),
>>>> +           TP_STRUCT__entry(
>>>> +                            __array(long, address, 128)
>>>> +                            __array(uint32_t, value, 128)
>>>> +                            __field(int, len)
>>>> +                            ),
>>>> +           TP_fast_assign(
>>>> +                          memcpy(__entry->address, address, 128);
>>>> +                          memcpy(__entry->value,  value, 128);
>>>> +                          __entry->len = length;
>>>> +                          ),
>>>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>>>> +                     __print_array(__entry->address, __entry->len, 8),
>>>> +                     __print_array(__entry->value, __entry->len, 8)
>>>> +                    )
>>>> +);
>>>> +
>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>>   #endif
>>>>
>>>> -- 
>>>> 2.25.1
>>>>
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset
  2022-02-10 11:59         ` Sharma, Shashank
@ 2022-02-10 12:27           ` Christian König
  0 siblings, 0 replies; 21+ messages in thread
From: Christian König @ 2022-02-10 12:27 UTC (permalink / raw)
  To: Sharma, Shashank, Somalapuram, Amaranath, Christian König,
	Alex Deucher, Somalapuram Amaranath
  Cc: Deucher, Alexander, amd-gfx list

Am 10.02.22 um 12:59 schrieb Sharma, Shashank:
>
>
> On 2/10/2022 6:29 AM, Somalapuram, Amaranath wrote:
>>
>> On 2/9/2022 1:17 PM, Christian König wrote:
>>> Am 08.02.22 um 16:28 schrieb Alex Deucher:
>>>> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
>>>> <Amaranath.Somalapuram@amd.com> wrote:
>>>>> Dump the list of register values to trace event on GPU reset.
>>>>>
>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 
>>>>> ++++++++++++++++++++-
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
>>>>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 1e651b959141..057922fb7e37 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct 
>>>>> amdgpu_device *adev,
>>>>>          return r;
>>>>>   }
>>>>>
>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>> +{
>>>>> +       int i;
>>>>> +       uint32_t reg_value[128];
>>>>> +
>>>>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>>>>> +               if (adev->asic_type >= CHIP_NAVI10)
>>>> This check should be against CHIP_VEGA10.  Also, this only allows for
>>>> GC registers.  If we wanted to dump other registers, we'd need a
>>>> different macro.  Might be better to just use RREG32 here for
>>>> everything and then encode the full offset using
>>>> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
>>>> about how to handle gfxoff in this case.  gfxoff needs to be disabled
>>>> or we'll hang the chip if we try and read GC or SDMA registers via
>>>> MMIO which will adversely affect the hang signature.
>>>
>>> Well this should execute right before a GPU reset, so I think it 
>>> shouldn't matter if we hang the chip or not as long as the read 
>>> comes back correctly (I remember a very long UVD debug session 
>>> because of this).
>>>
>>> But in general I agree, we should just use RREG32() here and always 
>>> encode the full register offset.
>>>
>>> Regards,
>>> Christian.
>>>
>> Can I use something like this:
>>
>> +                       reg_value[i] = 
>> RREG32((adev->reg_offset[adev->reset_dump_reg_list[i][0]]
>> + [adev->reset_dump_reg_list[i][1]]
>> + [adev->reset_dump_reg_list[i][2]])
>> +                                 + adev->reset_dump_reg_list[i][3]);
>>
>> ip --> adev->reset_dump_reg_list[i][0]
>>
>> inst --> adev->reset_dump_reg_list[i][1]
>>
>> BASE_IDX--> adev->reset_dump_reg_list[i][2]
>>
>> reg --> adev->reset_dump_reg_list[i][3]
>>
>> which requires 4 values in user space for each register.
>>
>> using any existing macro like RREG32_SOC15** will not be able to pass 
>> proper argument from user space (like ip##_HWIP or reg##_BASE_IDX)
>>
>
>
> Why cant we use just a simple array
> adev->reset_dump_reg_list[10] for both ip and reg offsets ?

That won't work. The IPs are separated into several base registers, see 
how the SOC15 functions work.

But that's also not necessary. Userspace should have the same 
information as the kernel about which IP is mapped where.

So all we need here is the already resolved 32bit value of which 
register to read and are basically done.

Regards,
Christian.

>
> Userspace can provide the IP engine enum in first entry of the array,
> reset_dump_reg_list[0], and register offsets in other entries starting 
> from 1. We can convert that into desirable engine substring using an 
> array of char *, something like:
>
> const char *ip_engine_name_substing[] = {
>     /* Same order as enum amd_hw_ip_block_type */
>     "GC", "HDP", ......
> }
>
> engine enum;
> u32 ip = adev->reset_dump_reg_list[0];
> const char *ip_name = ip_engine_name_subs[ip];
>
> for (i = 0; i < 9; i++) {
>     reg_val = RREG_SOC15_IP(ip_name, reset_dump_reg_list[i+1]);
> }
>
> - Shashank
>
>>>
>>>>
>>>> Alex
>>>>
>>>>> +                       reg_value[i] = RREG32_SOC15_IP(GC, 
>>>>> adev->reset_dump_reg_list[i]);
>>>>> +               else
>>>>> +                       reg_value[i] = 
>>>>> RREG32(adev->reset_dump_reg_list[i]);
>>>>> +       }
>>>>> +
>>>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, 
>>>>> reg_value, i);
>>>>> +
>>>>> +       return 0;
>>>>> +}
>>>>> +
>>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>                           struct amdgpu_reset_context *reset_context)
>>>>>   {
>>>>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
>>>>> *device_list_handle,
>>>>> tmp_adev->gmc.xgmi.pending_reset = false;
>>>>>                                  if 
>>>>> (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>>                                          r = -EALREADY;
>>>>> -                       } else
>>>>> +                       } else {
>>>>> + amdgpu_reset_reg_dumps(tmp_adev);
>>>>>                                  r = amdgpu_asic_reset(tmp_adev);
>>>>> +                       }
>>>>>
>>>>>                          if (r) {
>>>>>                                  dev_err(tmp_adev->dev, "ASIC 
>>>>> reset failed with error, %d for drm dev, %s",
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>> index d855cb53c7e0..3fe33de3564a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>>>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>>>                        __entry->seqno)
>>>>>   );
>>>>>
>>>>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>>>>> +           TP_PROTO(long *address, uint32_t *value, int length),
>>>>> +           TP_ARGS(address, value, length),
>>>>> +           TP_STRUCT__entry(
>>>>> +                            __array(long, address, 128)
>>>>> +                            __array(uint32_t, value, 128)
>>>>> +                            __field(int, len)
>>>>> +                            ),
>>>>> +           TP_fast_assign(
>>>>> +                          memcpy(__entry->address, address, 128);
>>>>> +                          memcpy(__entry->value, value, 128);
>>>>> +                          __entry->len = length;
>>>>> +                          ),
>>>>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>>>>> +                     __print_array(__entry->address, 
>>>>> __entry->len, 8),
>>>>> +                     __print_array(__entry->value, __entry->len, 8)
>>>>> +                    )
>>>>> +);
>>>>> +
>>>>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>>>   #endif
>>>>>
>>>>> -- 
>>>>> 2.25.1
>>>>>
>>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-08  8:16 [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
  2022-02-08  8:16 ` [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset Somalapuram Amaranath
@ 2022-02-08  8:18 ` Christian König
  2022-02-08 11:13   ` Sharma, Shashank
  2022-02-08 10:54 ` Sharma, Shashank
  2 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-08  8:18 UTC (permalink / raw)
  To: Somalapuram Amaranath, amd-gfx; +Cc: alexander.deucher, shashank.sharma

Am 08.02.22 um 09:16 schrieb Somalapuram Amaranath:
> List of register to be populated for dump collection during the GPU reset.
>
> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 +++++++++++++++++++++
>   2 files changed, 63 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b85b67a88a3d..78fa46f959c5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1097,6 +1097,9 @@ struct amdgpu_device {
>   
>   	struct amdgpu_reset_control     *reset_cntl;
>   	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
> +
> +	/* reset dump register */
> +	long			reset_dump_reg_list[128];

I don't have time for a full review, but using long here certainly makes 
no sense.

long is either 32bit or 64bit depending on the CPU architecture.

Regards,
Christian.

>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..dad268e8a81a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
> +				char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char *reg_offset;
> +	int i, r, len;
> +
> +	reg_offset = kmalloc(2048, GFP_KERNEL);
> +	memset(reg_offset,  0, 2048);
> +	for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
> +		sprintf(reg_offset + strlen(reg_offset), "0x%lx ", adev->reset_dump_reg_list[i]);
> +
> +	sprintf(reg_offset + strlen(reg_offset), "\n");
> +	len = strlen(reg_offset);
> +
> +	if (*pos >=  len)
> +		return 0;
> +
> +	r = copy_to_user(buf, reg_offset, len);
> +	*pos += len - r;
> +	kfree(reg_offset);
> +
> +	return len - r;
> +}
> +
> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, const char __user *buf,
> +		size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char *reg_offset, *reg;
> +	int ret, i = 0;
> +
> +	reg_offset = kmalloc(size, GFP_KERNEL);
> +	memset(reg_offset,  0, size);
> +	ret = copy_from_user(reg_offset, buf, size);
> +
> +	if (ret)
> +		return -EFAULT;
> +
> +	while ((reg = strsep(&reg_offset, " ")) != NULL) {
> +		ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
> +		if (ret)
> +			return -EINVAL;
> +		i++;
> +	}
> +
> +	kfree(reg_offset);
> +
> +	return size;
> +}
> +
> +static const struct file_operations amdgpu_reset_dump_register_list = {
> +	.owner = THIS_MODULE,
> +	.read = amdgpu_reset_dump_register_list_read,
> +	.write = amdgpu_reset_dump_register_list_write,
> +	.llseek = default_llseek
> +};
> +
>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   			    &amdgpu_debugfs_test_ib_fops);
>   	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>   			    &amdgpu_debugfs_vm_info_fops);
> +	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
> +			    &amdgpu_reset_dump_register_list);
>   
>   	adev->debugfs_vbios_blob.data = adev->bios;
>   	adev->debugfs_vbios_blob.size = adev->bios_size;


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-08  8:18 ` [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
@ 2022-02-08 11:13   ` Sharma, Shashank
  2022-02-08 13:39     ` Somalapuram, Amaranath
  0 siblings, 1 reply; 21+ messages in thread
From: Sharma, Shashank @ 2022-02-08 11:13 UTC (permalink / raw)
  To: Christian König, Somalapuram Amaranath, amd-gfx; +Cc: alexander.deucher


Amar,
Apart from the long comment,there are a few more bugs in the patch, 
which I have mentioned here inline. Please check them out.

- Shashank

On 2/8/2022 9:18 AM, Christian König wrote:
> Am 08.02.22 um 09:16 schrieb Somalapuram Amaranath:
>> List of register to be populated for dump collection during the GPU 
>> reset.
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 +++++++++++++++++++++
>>   2 files changed, 63 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b85b67a88a3d..78fa46f959c5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1097,6 +1097,9 @@ struct amdgpu_device {
>>       struct amdgpu_reset_control     *reset_cntl;
>>       uint32_t                        
>> ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>> +
>> +    /* reset dump register */
>> +    long            reset_dump_reg_list[128];
> 
> I don't have time for a full review, but using long here certainly makes 
> no sense.
> 
> long is either 32bit or 64bit depending on the CPU architecture.
> 
> Regards,
> Christian.
> 
>>   };
>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>> *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 164d6a9e9fbb..dad268e8a81a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>               amdgpu_debugfs_sclk_set, "%llu\n");
>> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>> +                char __user *buf, size_t size, loff_t *pos)
>> +{
>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>> *)file_inode(f)->i_private;
>> +    char *reg_offset;
>> +    int i, r, len;
>> +
>> +    reg_offset = kmalloc(2048, GFP_KERNEL);
>> +    memset(reg_offset,  0, 2048);
>> +    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)

This loop termination condition is incorrect, why are we running the 
loop until adev->reset_dump_reg_list[i] != 0 ?

What if I have 10 registers to dump, but my 4th register value is 0 ? It 
will break the loop at 4 and we will not get all values.

>> +        sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
>> adev->reset_dump_reg_list[i]);
>> +
>> +    sprintf(reg_offset + strlen(reg_offset), "\n");
>> +    len = strlen(reg_offset);
>> +
>> +    if (*pos >=  len)
>> +        return 0;
>> +
>> +    r = copy_to_user(buf, reg_offset, len);
>> +    *pos += len - r;
>> +    kfree(reg_offset);

Also, why are we doing a dynamic memory allocation for reg_offest ? We 
can simply use adev->reset_dump_reg_list[i] isnt't it ?

simply
for (i=0; i<num_of_regs;i++) {
	copy_to_user(buf, adev->reg_list[i], sizeof(uint64_t));
}

Or without even a loop, simply:
copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));

- Shashank

>> +
>> +    return len - r;
>> +}
>> +
>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, 
>> const char __user *buf,
>> +        size_t size, loff_t *pos)
>> +{
>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>> *)file_inode(f)->i_private;
>> +    char *reg_offset, *reg;
>> +    int ret, i = 0;
>> +
>> +    reg_offset = kmalloc(size, GFP_KERNEL);
>> +    memset(reg_offset,  0, size);
>> +    ret = copy_from_user(reg_offset, buf, size);
>> +

We are not allowing user to write into the list, so this whole function 
can just be a NOOP.

- Shashank

>> +    if (ret)
>> +        return -EFAULT;
>> +
>> +    while ((reg = strsep(&reg_offset, " ")) != NULL) {
>> +        ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
>> +        if (ret)
>> +            return -EINVAL;
>> +        i++;
>> +    }
>> +
>> +    kfree(reg_offset);
>> +
>> +    return size;
>> +}
>> +
>> +static const struct file_operations amdgpu_reset_dump_register_list = {
>> +    .owner = THIS_MODULE,
>> +    .read = amdgpu_reset_dump_register_list_read,
>> +    .write = amdgpu_reset_dump_register_list_write,
>> +    .llseek = default_llseek
>> +};
>> +
>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>   {
>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>> @@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>                   &amdgpu_debugfs_test_ib_fops);
>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>                   &amdgpu_debugfs_vm_info_fops);
>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>> root, adev,
>> +                &amdgpu_reset_dump_register_list);
>>       adev->debugfs_vbios_blob.data = adev->bios;
>>       adev->debugfs_vbios_blob.size = adev->bios_size;
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-08 11:13   ` Sharma, Shashank
@ 2022-02-08 13:39     ` Somalapuram, Amaranath
  2022-02-08 14:18       ` Sharma, Shashank
  0 siblings, 1 reply; 21+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-08 13:39 UTC (permalink / raw)
  To: Sharma, Shashank, Christian König, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher



On 2/8/2022 4:43 PM, Sharma, Shashank wrote:
> I thought we spoke and agreed about:
> - Not doing dynamic memory allocation during a reset call,
as there is a redesign debugfs call will happen during the application 
initialization and not during reset.
> - Not doing string operations, but just dumping register values by index.
I think your referring to the second patch which happens during reset 
and no string operation in second patch.
> NACK !
>
> - Shashank
>
> Amar,
> Apart from the long comment,there are a few more bugs in the patch, 
> which I have mentioned here inline. Please check them out.
>
> - Shashank
>
> On 2/8/2022 9:18 AM, Christian König wrote:
>> Am 08.02.22 um 09:16 schrieb Somalapuram Amaranath:
>>> List of register to be populated for dump collection during the GPU 
>>> reset.
>>>
>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 
>>> +++++++++++++++++++++
>>>   2 files changed, 63 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index b85b67a88a3d..78fa46f959c5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1097,6 +1097,9 @@ struct amdgpu_device {
>>>       struct amdgpu_reset_control     *reset_cntl;
>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>> +
>>> +    /* reset dump register */
>>> +    long            reset_dump_reg_list[128];
>>
>> I don't have time for a full review, but using long here certainly 
>> makes no sense.
>>
>> long is either 32bit or 64bit depending on the CPU architecture.
>>
>> Regards,
>> Christian.
>>
will change uint32_t.
>>>   };
>>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>>> *ddev)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 164d6a9e9fbb..dad268e8a81a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>> +                char __user *buf, size_t size, loff_t *pos)
>>> +{
>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>> *)file_inode(f)->i_private;
>>> +    char *reg_offset;
>>> +    int i, r, len;
>>> +
>>> +    reg_offset = kmalloc(2048, GFP_KERNEL);
>>> +    memset(reg_offset,  0, 2048);
>>> +    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
>
> This loop termination condition is incorrect, why are we running the 
> loop until adev->reset_dump_reg_list[i] != 0 ?
>
> What if I have 10 registers to dump, but my 4th register value is 0 ? 
> It will break the loop at 4 and we will not get all values.
>
agreed, i try to avoid one more variable in adev
>>> +        sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
>>> adev->reset_dump_reg_list[i]);
>>> +
>>> +    sprintf(reg_offset + strlen(reg_offset), "\n");
>>> +    len = strlen(reg_offset);
>>> +
>>> +    if (*pos >=  len)
>>> +        return 0;
>>> +
>>> +    r = copy_to_user(buf, reg_offset, len);
>>> +    *pos += len - r;
>>> +    kfree(reg_offset);
>
> Also, why are we doing a dynamic memory allocation for reg_offest ? We 
> can simply use adev->reset_dump_reg_list[i] isnt't it ?
>
> simply
> for (i=0; i<num_of_regs;i++) {
>     copy_to_user(buf, adev->reg_list[i], sizeof(uint64_t));
> }
>
> Or without even a loop, simply:
> copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));
>
> - Shashank

it will not be in user readable format for debugfs, (if non readable is 
acceptable I will change this)

> +
>>>
>>> +    return len - r;
>>> +}
>>> +
>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file 
>>> *f, const char __user *buf,
>>> +        size_t size, loff_t *pos)
>>> +{
>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>> *)file_inode(f)->i_private;
>>> +    char *reg_offset, *reg;
>>> +    int ret, i = 0;
>>> +
>>> +    reg_offset = kmalloc(size, GFP_KERNEL);
>>> +    memset(reg_offset,  0, size);
>>> +    ret = copy_from_user(reg_offset, buf, size);
>>> +
>
> We are not allowing user to write into the list, so this whole 
> function can just be a NOOP.
>
> - Shashank
User only update the list of reg offsets on init, there is no predefined 
reg offset from kernel code.
>
>>> +    if (ret)
>>> +        return -EFAULT;
>>> +
>>> +    while ((reg = strsep(&reg_offset, " ")) != NULL) {
>>> +        ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
>>> +        if (ret)
>>> +            return -EINVAL;
>>> +        i++;
>>> +    }
>>> +
>>> +    kfree(reg_offset);
>>> +
>>> +    return size;
>>> +}
>>> +
>>> +static const struct file_operations amdgpu_reset_dump_register_list 
>>> = {
>>> +    .owner = THIS_MODULE,
>>> +    .read = amdgpu_reset_dump_register_list_read,
>>> +    .write = amdgpu_reset_dump_register_list_write,
>>> +    .llseek = default_llseek
>>> +};
>>> +
>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>   {
>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>> @@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>> *adev)
>>>                   &amdgpu_debugfs_test_ib_fops);
>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>                   &amdgpu_debugfs_vm_info_fops);
>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>> root, adev,
>>> +                &amdgpu_reset_dump_register_list);
>>>       adev->debugfs_vbios_blob.data = adev->bios;
>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-08 13:39     ` Somalapuram, Amaranath
@ 2022-02-08 14:18       ` Sharma, Shashank
  2022-02-08 14:31         ` Sharma, Shashank
  0 siblings, 1 reply; 21+ messages in thread
From: Sharma, Shashank @ 2022-02-08 14:18 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher



On 2/8/2022 2:39 PM, Somalapuram, Amaranath wrote:
> 
> 
> On 2/8/2022 4:43 PM, Sharma, Shashank wrote:
>> I thought we spoke and agreed about:
>> - Not doing dynamic memory allocation during a reset call,
> as there is a redesign debugfs call will happen during the application 
> initialization and not during reset.
>> - Not doing string operations, but just dumping register values by index.
> I think your referring to the second patch which happens during reset 
> and no string operation in second patch.

Pls see my comment in the end.

>> NACK !
>>
>> - Shashank
>>
>> Amar,
>> Apart from the long comment,there are a few more bugs in the patch, 
>> which I have mentioned here inline. Please check them out.
>>
>> - Shashank
>>
>> On 2/8/2022 9:18 AM, Christian König wrote:
>>> Am 08.02.22 um 09:16 schrieb Somalapuram Amaranath:
>>>> List of register to be populated for dump collection during the GPU 
>>>> reset.
>>>>
>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 ++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 
>>>> +++++++++++++++++++++
>>>>   2 files changed, 63 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index b85b67a88a3d..78fa46f959c5 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1097,6 +1097,9 @@ struct amdgpu_device {
>>>>       struct amdgpu_reset_control     *reset_cntl;
>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>> +
>>>> +    /* reset dump register */
>>>> +    long            reset_dump_reg_list[128];
>>>
>>> I don't have time for a full review, but using long here certainly 
>>> makes no sense.
>>>
>>> long is either 32bit or 64bit depending on the CPU architecture.
>>>
>>> Regards,
>>> Christian.
>>>
> will change uint32_t.
>>>>   };
>>>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>>>> *ddev)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index 164d6a9e9fbb..dad268e8a81a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>> +{
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>> *)file_inode(f)->i_private;
>>>> +    char *reg_offset;
>>>> +    int i, r, len;
>>>> +
>>>> +    reg_offset = kmalloc(2048, GFP_KERNEL);

We also want to understand how does the value 2048 came into picture, 
probably a macro which calculates the size preprocessing time will work 
better.

#define #define N_REGS_DUMP_GPU_RESET 10
#define BUFFER_SZ(N_REGS_DUMP_GPU_RESET * (sizeof uint64_t) + 1)

This first macro can be used later for the loop count for registers as well.

>>>> +    memset(reg_offset,  0, 2048);
>>>> +    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
>>
>> This loop termination condition is incorrect, why are we running the 
>> loop until adev->reset_dump_reg_list[i] != 0 ?
>>
>> What if I have 10 registers to dump, but my 4th register value is 0 ? 
>> It will break the loop at 4 and we will not get all values.
>>
> agreed, i try to avoid one more variable in adev

Not by the cost of logic of course :).

Now you can run this loop here.

for (i = 0; i < N_REGS...; i++) {
	register_value_copy_here;
}

>>>> +        sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
>>>> adev->reset_dump_reg_list[i]);
>>>> +

>>>> +    sprintf(reg_offset + strlen(reg_offset), "\n");
>>>> +    len = strlen(reg_offset);
>>>> +
>>>> +    if (*pos >=  len)
>>>> +        return 0;
>>>> +
>>>> +    r = copy_to_user(buf, reg_offset, len);
>>>> +    *pos += len - r;
>>>> +    kfree(reg_offset);
>>
>> Also, why are we doing a dynamic memory allocation for reg_offest ? We 
>> can simply use adev->reset_dump_reg_list[i] isnt't it ?
>>
>> simply
>> for (i=0; i<num_of_regs;i++) {
>>     copy_to_user(buf, adev->reg_list[i], sizeof(uint64_t));
>> }
>>
>> Or without even a loop, simply:
>> copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));
>>
>> - Shashank
> 
> it will not be in user readable format for debugfs, (if non readable is 
> acceptable I will change this)
> 

We are just adding 0x in front of the reg value, so honestly I don't see 
a huge improvement in the user readability, but if you still want to do 
the dynamic allocation of memory, add the register offset or name as 
well, I mean then it should read like:

0x1234 = 0xABCD
0x1238 = 0xFFFF

- Shashank

>> +
>>>>
>>>> +    return len - r;
>>>> +}
>>>> +
>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file 
>>>> *f, const char __user *buf,
>>>> +        size_t size, loff_t *pos)
>>>> +{
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>> *)file_inode(f)->i_private;
>>>> +    char *reg_offset, *reg;
>>>> +    int ret, i = 0;
>>>> +
>>>> +    reg_offset = kmalloc(size, GFP_KERNEL);
>>>> +    memset(reg_offset,  0, size);
>>>> +    ret = copy_from_user(reg_offset, buf, size);
>>>> +
>>
>> We are not allowing user to write into the list, so this whole 
>> function can just be a NOOP.
>>
>> - Shashank
> User only update the list of reg offsets on init, there is no predefined 
> reg offset from kernel code.
>>
>>>> +    if (ret)
>>>> +        return -EFAULT;
>>>> +
>>>> +    while ((reg = strsep(&reg_offset, " ")) != NULL) {
>>>> +        ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
>>>> +        if (ret)
>>>> +            return -EINVAL;
>>>> +        i++;
>>>> +    }
>>>> +
>>>> +    kfree(reg_offset);
>>>> +
>>>> +    return size;
>>>> +}
>>>> +
>>>> +static const struct file_operations amdgpu_reset_dump_register_list 
>>>> = {
>>>> +    .owner = THIS_MODULE,
>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>> +    .llseek = default_llseek
>>>> +};
>>>> +
>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>   {
>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>> @@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>> *adev)
>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>> root, adev,
>>>> +                &amdgpu_reset_dump_register_list);
>>>>       adev->debugfs_vbios_blob.data = adev->bios;
>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-08 14:18       ` Sharma, Shashank
@ 2022-02-08 14:31         ` Sharma, Shashank
  2022-02-08 14:51           ` Sharma, Shashank
  0 siblings, 1 reply; 21+ messages in thread
From: Sharma, Shashank @ 2022-02-08 14:31 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher

 >> User only update the list of reg offsets on init, there is no
 >> predefined reg offset from kernel code.

I missed this comment in the last patch, and this makes me a bit 
confused. During the design phase, did we agree to have this whole list 
loaded from user ? which means that if user doesn't define the list at 
init, we will not send the trace_event().

Or was it kernel has a list, and user can modify if he wants to, and we 
will dump the values as per the register list.

@Christian ?

Regards
Shashank
On 2/8/2022 3:18 PM, Sharma, Shashank wrote:
> 
> 
> On 2/8/2022 2:39 PM, Somalapuram, Amaranath wrote:
>>
>>
>> On 2/8/2022 4:43 PM, Sharma, Shashank wrote:
>>> I thought we spoke and agreed about:
>>> - Not doing dynamic memory allocation during a reset call,
>> as there is a redesign debugfs call will happen during the application 
>> initialization and not during reset.
>>> - Not doing string operations, but just dumping register values by 
>>> index.
>> I think your referring to the second patch which happens during reset 
>> and no string operation in second patch.
> 
> Pls see my comment in the end.
> 
>>> NACK !
>>>
>>> - Shashank
>>>
>>> Amar,
>>> Apart from the long comment,there are a few more bugs in the patch, 
>>> which I have mentioned here inline. Please check them out.
>>>
>>> - Shashank
>>>
>>> On 2/8/2022 9:18 AM, Christian König wrote:
>>>> Am 08.02.22 um 09:16 schrieb Somalapuram Amaranath:
>>>>> List of register to be populated for dump collection during the GPU 
>>>>> reset.
>>>>>
>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 ++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 
>>>>> +++++++++++++++++++++
>>>>>   2 files changed, 63 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index b85b67a88a3d..78fa46f959c5 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -1097,6 +1097,9 @@ struct amdgpu_device {
>>>>>       struct amdgpu_reset_control     *reset_cntl;
>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>> +
>>>>> +    /* reset dump register */
>>>>> +    long            reset_dump_reg_list[128];
>>>>
>>>> I don't have time for a full review, but using long here certainly 
>>>> makes no sense.
>>>>
>>>> long is either 32bit or 64bit depending on the CPU architecture.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>> will change uint32_t.
>>>>>   };
>>>>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>>>>> *ddev)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> index 164d6a9e9fbb..dad268e8a81a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> @@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>> *)file_inode(f)->i_private;
>>>>> +    char *reg_offset;
>>>>> +    int i, r, len;
>>>>> +
>>>>> +    reg_offset = kmalloc(2048, GFP_KERNEL);
> 
> We also want to understand how does the value 2048 came into picture, 
> probably a macro which calculates the size preprocessing time will work 
> better.
> 
> #define #define N_REGS_DUMP_GPU_RESET 10
> #define BUFFER_SZ(N_REGS_DUMP_GPU_RESET * (sizeof uint64_t) + 1)
> 
> This first macro can be used later for the loop count for registers as 
> well.
> 
>>>>> +    memset(reg_offset,  0, 2048);
>>>>> +    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
>>>
>>> This loop termination condition is incorrect, why are we running the 
>>> loop until adev->reset_dump_reg_list[i] != 0 ?
>>>
>>> What if I have 10 registers to dump, but my 4th register value is 0 ? 
>>> It will break the loop at 4 and we will not get all values.
>>>
>> agreed, i try to avoid one more variable in adev
> 
> Not by the cost of logic of course :).
> 
> Now you can run this loop here.
> 
> for (i = 0; i < N_REGS...; i++) {
>      register_value_copy_here;
> }
> 
>>>>> +        sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
>>>>> adev->reset_dump_reg_list[i]);
>>>>> +
> 
>>>>> +    sprintf(reg_offset + strlen(reg_offset), "\n");
>>>>> +    len = strlen(reg_offset);
>>>>> +
>>>>> +    if (*pos >=  len)
>>>>> +        return 0;
>>>>> +
>>>>> +    r = copy_to_user(buf, reg_offset, len);
>>>>> +    *pos += len - r;
>>>>> +    kfree(reg_offset);
>>>
>>> Also, why are we doing a dynamic memory allocation for reg_offest ? 
>>> We can simply use adev->reset_dump_reg_list[i] isnt't it ?
>>>
>>> simply
>>> for (i=0; i<num_of_regs;i++) {
>>>     copy_to_user(buf, adev->reg_list[i], sizeof(uint64_t));
>>> }
>>>
>>> Or without even a loop, simply:
>>> copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));
>>>
>>> - Shashank
>>
>> it will not be in user readable format for debugfs, (if non readable 
>> is acceptable I will change this)
>>
> 
> We are just adding 0x in front of the reg value, so honestly I don't see 
> a huge improvement in the user readability, but if you still want to do 
> the dynamic allocation of memory, add the register offset or name as 
> well, I mean then it should read like:
> 
> 0x1234 = 0xABCD
> 0x1238 = 0xFFFF
> 
> - Shashank
> 
>>> +
>>>>>
>>>>> +    return len - r;
>>>>> +}
>>>>> +
>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file 
>>>>> *f, const char __user *buf,
>>>>> +        size_t size, loff_t *pos)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>> *)file_inode(f)->i_private;
>>>>> +    char *reg_offset, *reg;
>>>>> +    int ret, i = 0;
>>>>> +
>>>>> +    reg_offset = kmalloc(size, GFP_KERNEL);
>>>>> +    memset(reg_offset,  0, size);
>>>>> +    ret = copy_from_user(reg_offset, buf, size);
>>>>> +
>>>
>>> We are not allowing user to write into the list, so this whole 
>>> function can just be a NOOP.
>>>
>>> - Shashank
>> User only update the list of reg offsets on init, there is no 
>> predefined reg offset from kernel code.
>>>
>>>>> +    if (ret)
>>>>> +        return -EFAULT;
>>>>> +
>>>>> +    while ((reg = strsep(&reg_offset, " ")) != NULL) {
>>>>> +        ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>> +        if (ret)
>>>>> +            return -EINVAL;
>>>>> +        i++;
>>>>> +    }
>>>>> +
>>>>> +    kfree(reg_offset);
>>>>> +
>>>>> +    return size;
>>>>> +}
>>>>> +
>>>>> +static const struct file_operations 
>>>>> amdgpu_reset_dump_register_list = {
>>>>> +    .owner = THIS_MODULE,
>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>> +    .llseek = default_llseek
>>>>> +};
>>>>> +
>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>   {
>>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>>> @@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>>> *adev)
>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>> root, adev,
>>>>> +                &amdgpu_reset_dump_register_list);
>>>>>       adev->debugfs_vbios_blob.data = adev->bios;
>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-08 14:31         ` Sharma, Shashank
@ 2022-02-08 14:51           ` Sharma, Shashank
  0 siblings, 0 replies; 21+ messages in thread
From: Sharma, Shashank @ 2022-02-08 14:51 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher

Based on confirmation from Christian, it seems my understanding of the 
design was not correct, and user must add a list of registers to dump.

That resolves most of my comments automatically, @Amar, please fix a max 
register condition in the loop, to handle the negative testing case and 
the uint32_t stuff. With those two changes in place, feel free to use 
for this patch:

Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>

- Shashank

On 2/8/2022 3:31 PM, Sharma, Shashank wrote:
>  >> User only update the list of reg offsets on init, there is no
>  >> predefined reg offset from kernel code.
> 
> I missed this comment in the last patch, and this makes me a bit 
> confused. During the design phase, did we agree to have this whole list 
> loaded from user ? which means that if user doesn't define the list at 
> init, we will not send the trace_event().
> 
> Or was it kernel has a list, and user can modify if he wants to, and we 
> will dump the values as per the register list.
> 
> @Christian ?
> 
> Regards
> Shashank
> On 2/8/2022 3:18 PM, Sharma, Shashank wrote:
>>
>>
>> On 2/8/2022 2:39 PM, Somalapuram, Amaranath wrote:
>>>
>>>
>>> On 2/8/2022 4:43 PM, Sharma, Shashank wrote:
>>>> I thought we spoke and agreed about:
>>>> - Not doing dynamic memory allocation during a reset call,
>>> as there is a redesign debugfs call will happen during the 
>>> application initialization and not during reset.
>>>> - Not doing string operations, but just dumping register values by 
>>>> index.
>>> I think your referring to the second patch which happens during reset 
>>> and no string operation in second patch.
>>
>> Pls see my comment in the end.
>>
>>>> NACK !
>>>>
>>>> - Shashank
>>>>
>>>> Amar,
>>>> Apart from the long comment,there are a few more bugs in the patch, 
>>>> which I have mentioned here inline. Please check them out.
>>>>
>>>> - Shashank
>>>>
>>>> On 2/8/2022 9:18 AM, Christian König wrote:
>>>>> Am 08.02.22 um 09:16 schrieb Somalapuram Amaranath:
>>>>>> List of register to be populated for dump collection during the 
>>>>>> GPU reset.
>>>>>>
>>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 ++
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 
>>>>>> +++++++++++++++++++++
>>>>>>   2 files changed, 63 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> index b85b67a88a3d..78fa46f959c5 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> @@ -1097,6 +1097,9 @@ struct amdgpu_device {
>>>>>>       struct amdgpu_reset_control     *reset_cntl;
>>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>>> +
>>>>>> +    /* reset dump register */
>>>>>> +    long            reset_dump_reg_list[128];
>>>>>
>>>>> I don't have time for a full review, but using long here certainly 
>>>>> makes no sense.
>>>>>
>>>>> long is either 32bit or 64bit depending on the CPU architecture.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>> will change uint32_t.
>>>>>>   };
>>>>>>   static inline struct amdgpu_device *drm_to_adev(struct 
>>>>>> drm_device *ddev)
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> index 164d6a9e9fbb..dad268e8a81a 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> @@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, 
>>>>>> NULL,
>>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>>> +{
>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>> *)file_inode(f)->i_private;
>>>>>> +    char *reg_offset;
>>>>>> +    int i, r, len;
>>>>>> +
>>>>>> +    reg_offset = kmalloc(2048, GFP_KERNEL);
>>
>> We also want to understand how does the value 2048 came into picture, 
>> probably a macro which calculates the size preprocessing time will 
>> work better.
>>
>> #define #define N_REGS_DUMP_GPU_RESET 10
>> #define BUFFER_SZ(N_REGS_DUMP_GPU_RESET * (sizeof uint64_t) + 1)
>>
>> This first macro can be used later for the loop count for registers as 
>> well.
>>
>>>>>> +    memset(reg_offset,  0, 2048);
>>>>>> +    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
>>>>
>>>> This loop termination condition is incorrect, why are we running the 
>>>> loop until adev->reset_dump_reg_list[i] != 0 ?
>>>>
>>>> What if I have 10 registers to dump, but my 4th register value is 0 
>>>> ? It will break the loop at 4 and we will not get all values.
>>>>
>>> agreed, i try to avoid one more variable in adev
>>
>> Not by the cost of logic of course :).
>>
>> Now you can run this loop here.
>>
>> for (i = 0; i < N_REGS...; i++) {
>>      register_value_copy_here;
>> }
>>
>>>>>> +        sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
>>>>>> adev->reset_dump_reg_list[i]);
>>>>>> +
>>
>>>>>> +    sprintf(reg_offset + strlen(reg_offset), "\n");
>>>>>> +    len = strlen(reg_offset);
>>>>>> +
>>>>>> +    if (*pos >=  len)
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    r = copy_to_user(buf, reg_offset, len);
>>>>>> +    *pos += len - r;
>>>>>> +    kfree(reg_offset);
>>>>
>>>> Also, why are we doing a dynamic memory allocation for reg_offest ? 
>>>> We can simply use adev->reset_dump_reg_list[i] isnt't it ?
>>>>
>>>> simply
>>>> for (i=0; i<num_of_regs;i++) {
>>>>     copy_to_user(buf, adev->reg_list[i], sizeof(uint64_t));
>>>> }
>>>>
>>>> Or without even a loop, simply:
>>>> copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));
>>>>
>>>> - Shashank
>>>
>>> it will not be in user readable format for debugfs, (if non readable 
>>> is acceptable I will change this)
>>>
>>
>> We are just adding 0x in front of the reg value, so honestly I don't 
>> see a huge improvement in the user readability, but if you still want 
>> to do the dynamic allocation of memory, add the register offset or 
>> name as well, I mean then it should read like:
>>
>> 0x1234 = 0xABCD
>> 0x1238 = 0xFFFF
>>
>> - Shashank
>>
>>>> +
>>>>>>
>>>>>> +    return len - r;
>>>>>> +}
>>>>>> +
>>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file 
>>>>>> *f, const char __user *buf,
>>>>>> +        size_t size, loff_t *pos)
>>>>>> +{
>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>> *)file_inode(f)->i_private;
>>>>>> +    char *reg_offset, *reg;
>>>>>> +    int ret, i = 0;
>>>>>> +
>>>>>> +    reg_offset = kmalloc(size, GFP_KERNEL);
>>>>>> +    memset(reg_offset,  0, size);
>>>>>> +    ret = copy_from_user(reg_offset, buf, size);
>>>>>> +
>>>>
>>>> We are not allowing user to write into the list, so this whole 
>>>> function can just be a NOOP.
>>>>
>>>> - Shashank
>>> User only update the list of reg offsets on init, there is no 
>>> predefined reg offset from kernel code.
>>>>
>>>>>> +    if (ret)
>>>>>> +        return -EFAULT;
>>>>>> +
>>>>>> +    while ((reg = strsep(&reg_offset, " ")) != NULL) {
>>>>>> +        ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>>> +        if (ret)
>>>>>> +            return -EINVAL;
>>>>>> +        i++;
>>>>>> +    }
>>>>>> +
>>>>>> +    kfree(reg_offset);
>>>>>> +
>>>>>> +    return size;
>>>>>> +}
>>>>>> +
>>>>>> +static const struct file_operations 
>>>>>> amdgpu_reset_dump_register_list = {
>>>>>> +    .owner = THIS_MODULE,
>>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>>> +    .llseek = default_llseek
>>>>>> +};
>>>>>> +
>>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>   {
>>>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>>>> @@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>>>> *adev)
>>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>>> root, adev,
>>>>>> +                &amdgpu_reset_dump_register_list);
>>>>>>       adev->debugfs_vbios_blob.data = adev->bios;
>>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-08  8:16 [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
  2022-02-08  8:16 ` [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset Somalapuram Amaranath
  2022-02-08  8:18 ` [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
@ 2022-02-08 10:54 ` Sharma, Shashank
  2 siblings, 0 replies; 21+ messages in thread
From: Sharma, Shashank @ 2022-02-08 10:54 UTC (permalink / raw)
  To: Somalapuram, Amaranath, amd-gfx; +Cc: Deucher, Alexander, Koenig, Christian

I thought we spoke and agreed about:
- Not doing dynamic memory allocation during a reset call,
- Not doing string operations, but just dumping register values by index. 

NACK !

- Shashank

-----Original Message-----
From: Somalapuram, Amaranath <Amaranath.Somalapuram@amd.com> 
Sent: Tuesday, February 8, 2022 9:17 AM
To: amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian <Christian.Koenig@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Sharma, Shashank <Shashank.Sharma@amd.com>; Somalapuram, Amaranath <Amaranath.Somalapuram@amd.com>
Subject: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

List of register to be populated for dump collection during the GPU reset.

Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 +++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
 
 	struct amdgpu_reset_control     *reset_cntl;
 	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+	/* reset dump register */
+	long			reset_dump_reg_list[128];
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
 			amdgpu_debugfs_sclk_set, "%llu\n");
 
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+				char __user *buf, size_t size, loff_t *pos) {
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	char *reg_offset;
+	int i, r, len;
+
+	reg_offset = kmalloc(2048, GFP_KERNEL);
+	memset(reg_offset,  0, 2048);
+	for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
+		sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
+adev->reset_dump_reg_list[i]);
+
+	sprintf(reg_offset + strlen(reg_offset), "\n");
+	len = strlen(reg_offset);
+
+	if (*pos >=  len)
+		return 0;
+
+	r = copy_to_user(buf, reg_offset, len);
+	*pos += len - r;
+	kfree(reg_offset);
+
+	return len - r;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, const char __user *buf,
+		size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	char *reg_offset, *reg;
+	int ret, i = 0;
+
+	reg_offset = kmalloc(size, GFP_KERNEL);
+	memset(reg_offset,  0, size);
+	ret = copy_from_user(reg_offset, buf, size);
+
+	if (ret)
+		return -EFAULT;
+
+	while ((reg = strsep(&reg_offset, " ")) != NULL) {
+		ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
+		if (ret)
+			return -EINVAL;
+		i++;
+	}
+
+	kfree(reg_offset);
+
+	return size;
+}
+
+static const struct file_operations amdgpu_reset_dump_register_list = {
+	.owner = THIS_MODULE,
+	.read = amdgpu_reset_dump_register_list_read,
+	.write = amdgpu_reset_dump_register_list_write,
+	.llseek = default_llseek
+};
+
 int amdgpu_debugfs_init(struct amdgpu_device *adev)  {
 	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 			    &amdgpu_debugfs_test_ib_fops);
 	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
 			    &amdgpu_debugfs_vm_info_fops);
+	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
+			    &amdgpu_reset_dump_register_list);
 
 	adev->debugfs_vbios_blob.data = adev->bios;
 	adev->debugfs_vbios_blob.size = adev->bios_size;
--
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2022-02-10 16:00 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-08  8:16 [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
2022-02-08  8:16 ` [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset Somalapuram Amaranath
2022-02-08 15:28   ` Alex Deucher
2022-02-09  7:47     ` Christian König
2022-02-10  5:29       ` Somalapuram, Amaranath
2022-02-10  7:09         ` Christian König
2022-02-10  7:34           ` Somalapuram, Amaranath
2022-02-10  7:38             ` Christian König
2022-02-10 13:18               ` Sharma, Shashank
2022-02-10 14:05                 ` Christian König
2022-02-10 14:11                   ` Sharma, Shashank
2022-02-10 15:59                     ` Alex Deucher
2022-02-10 11:59         ` Sharma, Shashank
2022-02-10 12:27           ` Christian König
2022-02-08  8:18 ` [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
2022-02-08 11:13   ` Sharma, Shashank
2022-02-08 13:39     ` Somalapuram, Amaranath
2022-02-08 14:18       ` Sharma, Shashank
2022-02-08 14:31         ` Sharma, Shashank
2022-02-08 14:51           ` Sharma, Shashank
2022-02-08 10:54 ` Sharma, Shashank

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.