* [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump @ 2022-05-20 13:49 Somalapuram Amaranath 2022-05-20 13:49 ` [PATCH v1 2/2] drm/amdgpu: adding device coredump support Somalapuram Amaranath 2022-05-20 14:06 ` [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Sharma, Shashank 0 siblings, 2 replies; 15+ messages in thread From: Somalapuram Amaranath @ 2022-05-20 13:49 UTC (permalink / raw) To: amd-gfx Cc: alexander.deucher, Somalapuram Amaranath, christian.koenig, shashank.sharma Allocate memory for register value and use the same values for devcoredump. Remove dump_stack reset register dumps. Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 76df583663c7..c79d9992b113 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1042,6 +1042,7 @@ struct amdgpu_device { /* reset dump register */ uint32_t *reset_dump_reg_list; + uint32_t *reset_dump_reg_value; int num_regs; struct amdgpu_reset_domain *reset_domain; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index eedb12f6b8a3..942fdbd316f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1683,7 +1683,7 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, { struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; char reg_offset[11]; - uint32_t *new, *tmp = NULL; + uint32_t *new, *tmp = NULL, *tmp_value = NULL; int ret, i = 0, len = 0; do { @@ -1709,17 +1709,24 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, i++; } while (len < size); + new = krealloc_array(tmp_value, i, sizeof(uint32_t), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto error_free; + } ret = down_write_killable(&adev->reset_domain->sem); if (ret) goto error_free; swap(adev->reset_dump_reg_list, tmp); + swap(adev->reset_dump_reg_value, new); adev->num_regs = i; up_write(&adev->reset_domain->sem); ret = size; error_free: kfree(tmp); + kfree(new); return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4daa0e893965..963c897a76e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4720,15 +4720,14 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) { - uint32_t reg_value; int i; lockdep_assert_held(&adev->reset_domain->sem); - dump_stack(); for (i = 0; i < adev->num_regs; i++) { - reg_value = RREG32(adev->reset_dump_reg_list[i]); - trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); + adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], + adev->reset_dump_reg_value[i]); } return 0; -- 2.32.0 ^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-20 13:49 [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Somalapuram Amaranath @ 2022-05-20 13:49 ` Somalapuram Amaranath 2022-05-20 14:22 ` Sharma, Shashank 2022-05-20 14:06 ` [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Sharma, Shashank 1 sibling, 1 reply; 15+ messages in thread From: Somalapuram Amaranath @ 2022-05-20 13:49 UTC (permalink / raw) To: amd-gfx Cc: alexander.deucher, Somalapuram Amaranath, christian.koenig, shashank.sharma Added device coredump information: - Kernel version - Module - Time - VRAM status - Guilty process name and PID - GPU register dumps Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 ++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c79d9992b113..f28d9c563f74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1044,6 +1044,9 @@ struct amdgpu_device { uint32_t *reset_dump_reg_list; uint32_t *reset_dump_reg_value; int num_regs; + struct amdgpu_task_info reset_context_task_info; + bool reset_context_vram_lost; + struct timespec64 reset_time; struct amdgpu_reset_domain *reset_domain; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 963c897a76e6..f9b710e741a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -32,6 +32,8 @@ #include <linux/slab.h> #include <linux/iommu.h> #include <linux/pci.h> +#include <linux/devcoredump.h> +#include <generated/utsrelease.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_probe_helper.h> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) return 0; } +#ifdef CONFIG_DEV_COREDUMP +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, + size_t count, void *data, size_t datalen) +{ + struct drm_printer p; + struct amdgpu_device *adev = data; + struct drm_print_iterator iter; + int i; + + iter.data = buffer; + iter.offset = 0; + iter.start = offset; + iter.remain = count; + + p = drm_coredump_printer(&iter); + + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); + drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); + if (adev->reset_context_task_info.pid) + drm_printf(&p, "process_name: %s PID: %d\n", + adev->reset_context_task_info.process_name, + adev->reset_context_task_info.pid); + + if (adev->reset_context_vram_lost) + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); + if (adev->num_regs) { + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); + + for (i = 0; i < adev->num_regs; i++) + drm_printf(&p, "0x%08x: 0x%08x\n", + adev->reset_dump_reg_list[i], + adev->reset_dump_reg_value[i]); + } + + return count - iter.remain; +} + +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) +{ + struct drm_device *dev = adev_to_drm(adev); + + ktime_get_ts64(&adev->reset_time); + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, + amdgpu_devcoredump_read, NULL); +} +#endif + int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_reset_context *reset_context) { @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, goto out; vram_lost = amdgpu_device_check_vram_lost(tmp_adev); +#ifdef CONFIG_DEV_COREDUMP + tmp_adev->reset_context_vram_lost = vram_lost; + tmp_adev->reset_context_task_info.pid = 0; + if (reset_context->job && reset_context->job->vm) + tmp_adev->reset_context_task_info = + reset_context->job->vm->task_info; + amdgpu_reset_capture_coredumpm(tmp_adev); +#endif if (vram_lost) { DRM_INFO("VRAM is lost due to GPU reset!\n"); amdgpu_inc_vram_lost(tmp_adev); -- 2.32.0 ^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-20 13:49 ` [PATCH v1 2/2] drm/amdgpu: adding device coredump support Somalapuram Amaranath @ 2022-05-20 14:22 ` Sharma, Shashank 2022-05-24 6:42 ` Somalapuram, Amaranath 0 siblings, 1 reply; 15+ messages in thread From: Sharma, Shashank @ 2022-05-20 14:22 UTC (permalink / raw) To: Somalapuram Amaranath, amd-gfx; +Cc: alexander.deucher, christian.koenig On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: > Added device coredump information: > - Kernel version > - Module > - Time > - VRAM status > - Guilty process name and PID > - GPU register dumps > > Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 ++++++++++++++++++++++ > 2 files changed, 62 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index c79d9992b113..f28d9c563f74 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1044,6 +1044,9 @@ struct amdgpu_device { > uint32_t *reset_dump_reg_list; > uint32_t *reset_dump_reg_value; > int num_regs; > + struct amdgpu_task_info reset_context_task_info; > + bool reset_context_vram_lost; How about drop the 'context' from name and just reset_task_info and reset_vram_lost ? > + struct timespec64 reset_time; > > struct amdgpu_reset_domain *reset_domain; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 963c897a76e6..f9b710e741a7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -32,6 +32,8 @@ > #include <linux/slab.h> > #include <linux/iommu.h> > #include <linux/pci.h> > +#include <linux/devcoredump.h> > +#include <generated/utsrelease.h> > > #include <drm/drm_atomic_helper.h> > #include <drm/drm_probe_helper.h> > @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) > return 0; > } > > +#ifdef CONFIG_DEV_COREDUMP > +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, > + size_t count, void *data, size_t datalen) > +{ > + struct drm_printer p; > + struct amdgpu_device *adev = data; > + struct drm_print_iterator iter; > + int i; > + A NULL check for 'buffer' here could prevent a segfault later. > + iter.data = buffer; > + iter.offset = 0; > + iter.start = offset; > + iter.remain = count; > + > + p = drm_coredump_printer(&iter); > + > + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); > + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); > + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); > + drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); > + if (adev->reset_context_task_info.pid) > + drm_printf(&p, "process_name: %s PID: %d\n", > + adev->reset_context_task_info.process_name, > + adev->reset_context_task_info.pid); Please fix the alignment of print variables. > + > + if (adev->reset_context_vram_lost) > + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); > + if (adev->num_regs) { > + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); > + > + for (i = 0; i < adev->num_regs; i++) > + drm_printf(&p, "0x%08x: 0x%08x\n", > + adev->reset_dump_reg_list[i], > + adev->reset_dump_reg_value[i]); > + } > + > + return count - iter.remain; > +} > + > +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) > +{ > + struct drm_device *dev = adev_to_drm(adev); > + > + ktime_get_ts64(&adev->reset_time); > + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, > + amdgpu_devcoredump_read, NULL); instead of registering NULL as free function, I would prefer you to have a dummy no_op free function registered, which we can consume if something changes. > +} > +#endif > + > int amdgpu_do_asic_reset(struct list_head *device_list_handle, > struct amdgpu_reset_context *reset_context) > { > @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, > goto out; > > vram_lost = amdgpu_device_check_vram_lost(tmp_adev); > +#ifdef CONFIG_DEV_COREDUMP > + tmp_adev->reset_context_vram_lost = vram_lost; > + tmp_adev->reset_context_task_info.pid = 0; why is the PID hardcoded to 0 ? > + if (reset_context->job && reset_context->job->vm) > + tmp_adev->reset_context_task_info = > + reset_context->job->vm->task_info; > + amdgpu_reset_capture_coredumpm(tmp_adev); > +#endif > if (vram_lost) { > DRM_INFO("VRAM is lost due to GPU reset!\n"); > - Shashank amdgpu_inc_vram_lost(tmp_adev); ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-20 14:22 ` Sharma, Shashank @ 2022-05-24 6:42 ` Somalapuram, Amaranath 2022-05-24 9:53 ` Sharma, Shashank 0 siblings, 1 reply; 15+ messages in thread From: Somalapuram, Amaranath @ 2022-05-24 6:42 UTC (permalink / raw) To: Sharma, Shashank, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig [-- Attachment #1: Type: text/plain, Size: 5762 bytes --] On 5/20/2022 7:52 PM, Sharma, Shashank wrote: > > > On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >> Added device coredump information: >> - Kernel version >> - Module >> - Time >> - VRAM status >> - Guilty process name and PID >> - GPU register dumps >> >> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 ++++++++++++++++++++++ >> 2 files changed, 62 insertions(+) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index c79d9992b113..f28d9c563f74 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -1044,6 +1044,9 @@ struct amdgpu_device { >> uint32_t *reset_dump_reg_list; >> uint32_t *reset_dump_reg_value; >> int num_regs; >> + struct amdgpu_task_info reset_context_task_info; >> + bool reset_context_vram_lost; > > How about drop the 'context' from name and just reset_task_info and > reset_vram_lost ? OK. > >> + struct timespec64 reset_time; >> struct amdgpu_reset_domain *reset_domain; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 963c897a76e6..f9b710e741a7 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -32,6 +32,8 @@ >> #include <linux/slab.h> >> #include <linux/iommu.h> >> #include <linux/pci.h> >> +#include <linux/devcoredump.h> >> +#include <generated/utsrelease.h> >> #include <drm/drm_atomic_helper.h> >> #include <drm/drm_probe_helper.h> >> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct >> amdgpu_device *adev) >> return 0; >> } >> +#ifdef CONFIG_DEV_COREDUMP >> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, >> + size_t count, void *data, size_t datalen) >> +{ >> + struct drm_printer p; >> + struct amdgpu_device *adev = data; >> + struct drm_print_iterator iter; >> + int i; >> + > > A NULL check for 'buffer' here could prevent a segfault later. > Agreed. >> + iter.data = buffer; >> + iter.offset = 0; >> + iter.start = offset; >> + iter.remain = count; >> + >> + p = drm_coredump_printer(&iter); >> + >> + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); >> + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); >> + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); >> + drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, >> adev->reset_time.tv_nsec); >> + if (adev->reset_context_task_info.pid) >> + drm_printf(&p, "process_name: %s PID: %d\n", >> + adev->reset_context_task_info.process_name, >> + adev->reset_context_task_info.pid); > Please fix the alignment of print variables. > I will cross check this. >> + >> + if (adev->reset_context_vram_lost) >> + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); >> + if (adev->num_regs) { >> + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); >> + >> + for (i = 0; i < adev->num_regs; i++) >> + drm_printf(&p, "0x%08x: 0x%08x\n", >> + adev->reset_dump_reg_list[i], >> + adev->reset_dump_reg_value[i]); >> + } >> + >> + return count - iter.remain; >> +} >> + >> +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) >> +{ >> + struct drm_device *dev = adev_to_drm(adev); >> + >> + ktime_get_ts64(&adev->reset_time); >> + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, >> + amdgpu_devcoredump_read, NULL); > instead of registering NULL as free function, I would prefer you to > have a dummy no_op free function registered, which we can consume if > something changes. you mean something like this (function without any code): staticvoidamdgpu_devcoredump_free(void*data) { } >> +} >> +#endif >> + >> int amdgpu_do_asic_reset(struct list_head *device_list_handle, >> struct amdgpu_reset_context *reset_context) >> { >> @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head >> *device_list_handle, >> goto out; >> vram_lost = amdgpu_device_check_vram_lost(tmp_adev); >> +#ifdef CONFIG_DEV_COREDUMP >> + tmp_adev->reset_context_vram_lost = vram_lost; >> + tmp_adev->reset_context_task_info.pid = 0; > why is the PID hardcoded to 0 ? in case of reset context reset_context->job->vm is null (possibility that reset can be non VM related). If we don't set tmp_adev->reset_context_task_info.pid = 0, it will show previous reset valid PID. Regards, S.Amarnath >> + if (reset_context->job && reset_context->job->vm) >> + tmp_adev->reset_context_task_info = >> + reset_context->job->vm->task_info; >> + amdgpu_reset_capture_coredumpm(tmp_adev); >> +#endif >> if (vram_lost) { >> DRM_INFO("VRAM is lost due to GPU reset!\n"); >> > - Shashank > amdgpu_inc_vram_lost(tmp_adev); [-- Attachment #2: Type: text/html, Size: 12808 bytes --] ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-24 6:42 ` Somalapuram, Amaranath @ 2022-05-24 9:53 ` Sharma, Shashank 2022-05-24 12:10 ` Somalapuram, Amaranath 0 siblings, 1 reply; 15+ messages in thread From: Sharma, Shashank @ 2022-05-24 9:53 UTC (permalink / raw) To: Somalapuram, Amaranath, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/24/2022 8:42 AM, Somalapuram, Amaranath wrote: > > On 5/20/2022 7:52 PM, Sharma, Shashank wrote: >> >> >> On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >>> Added device coredump information: >>> - Kernel version >>> - Module >>> - Time >>> - VRAM status >>> - Guilty process name and PID >>> - GPU register dumps >>> >>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 ++++++++++++++++++++++ >>> 2 files changed, 62 insertions(+) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> index c79d9992b113..f28d9c563f74 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> @@ -1044,6 +1044,9 @@ struct amdgpu_device { >>> uint32_t *reset_dump_reg_list; >>> uint32_t *reset_dump_reg_value; >>> int num_regs; >>> + struct amdgpu_task_info reset_context_task_info; >>> + bool reset_context_vram_lost; >> >> How about drop the 'context' from name and just reset_task_info and >> reset_vram_lost ? > OK. >> >>> + struct timespec64 reset_time; >>> struct amdgpu_reset_domain *reset_domain; >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index 963c897a76e6..f9b710e741a7 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -32,6 +32,8 @@ >>> #include <linux/slab.h> >>> #include <linux/iommu.h> >>> #include <linux/pci.h> >>> +#include <linux/devcoredump.h> >>> +#include <generated/utsrelease.h> >>> #include <drm/drm_atomic_helper.h> >>> #include <drm/drm_probe_helper.h> >>> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct >>> amdgpu_device *adev) >>> return 0; >>> } >>> +#ifdef CONFIG_DEV_COREDUMP >>> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, >>> + size_t count, void *data, size_t datalen) >>> +{ >>> + struct drm_printer p; >>> + struct amdgpu_device *adev = data; >>> + struct drm_print_iterator iter; >>> + int i; >>> + >> >> A NULL check for 'buffer' here could prevent a segfault later. >> > Agreed. >>> + iter.data = buffer; >>> + iter.offset = 0; >>> + iter.start = offset; >>> + iter.remain = count; >>> + >>> + p = drm_coredump_printer(&iter); >>> + >>> + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); >>> + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); >>> + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); >>> + drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, >>> adev->reset_time.tv_nsec); >>> + if (adev->reset_context_task_info.pid) >>> + drm_printf(&p, "process_name: %s PID: %d\n", >>> + adev->reset_context_task_info.process_name, >>> + adev->reset_context_task_info.pid); >> Please fix the alignment of print variables. >> > I will cross check this. >>> + >>> + if (adev->reset_context_vram_lost) >>> + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); >>> + if (adev->num_regs) { >>> + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); >>> + >>> + for (i = 0; i < adev->num_regs; i++) >>> + drm_printf(&p, "0x%08x: 0x%08x\n", >>> + adev->reset_dump_reg_list[i], >>> + adev->reset_dump_reg_value[i]); >>> + } >>> + >>> + return count - iter.remain; >>> +} >>> + >>> +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) >>> +{ >>> + struct drm_device *dev = adev_to_drm(adev); >>> + >>> + ktime_get_ts64(&adev->reset_time); >>> + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, >>> + amdgpu_devcoredump_read, NULL); >> instead of registering NULL as free function, I would prefer you to >> have a dummy no_op free function registered, which we can consume if >> something changes. > you mean something like this (function without any code): > staticvoidamdgpu_devcoredump_free(void*data) > { > } Yes, precisely. >>> +} >>> +#endif >>> + >>> int amdgpu_do_asic_reset(struct list_head *device_list_handle, >>> struct amdgpu_reset_context *reset_context) >>> { >>> @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head >>> *device_list_handle, >>> goto out; >>> vram_lost = amdgpu_device_check_vram_lost(tmp_adev); >>> +#ifdef CONFIG_DEV_COREDUMP >>> + tmp_adev->reset_context_vram_lost = vram_lost; >>> + tmp_adev->reset_context_task_info.pid = 0; >> why is the PID hardcoded to 0 ? > in case of reset context reset_context->job->vm is null (possibility > that reset can be non VM related). > If we don't set tmp_adev->reset_context_task_info.pid = 0, it will show > previous reset valid PID. > But when the VM is not NULL, are we updating this PID somewhere ? I did not see that happening in this series. - Shashank > > Regards, > S.Amarnath >>> + if (reset_context->job && reset_context->job->vm) >>> + tmp_adev->reset_context_task_info = >>> + reset_context->job->vm->task_info; >>> + amdgpu_reset_capture_coredumpm(tmp_adev); >>> +#endif >>> if (vram_lost) { >>> DRM_INFO("VRAM is lost due to GPU reset!\n"); >>> >> - Shashank >> amdgpu_inc_vram_lost(tmp_adev); ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-24 9:53 ` Sharma, Shashank @ 2022-05-24 12:10 ` Somalapuram, Amaranath 2022-05-24 12:50 ` Sharma, Shashank 0 siblings, 1 reply; 15+ messages in thread From: Somalapuram, Amaranath @ 2022-05-24 12:10 UTC (permalink / raw) To: Sharma, Shashank, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/24/2022 3:23 PM, Sharma, Shashank wrote: > > > On 5/24/2022 8:42 AM, Somalapuram, Amaranath wrote: >> >> On 5/20/2022 7:52 PM, Sharma, Shashank wrote: >>> >>> >>> On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >>>> Added device coredump information: >>>> - Kernel version >>>> - Module >>>> - Time >>>> - VRAM status >>>> - Guilty process name and PID >>>> - GPU register dumps >>>> >>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 >>>> ++++++++++++++++++++++ >>>> 2 files changed, 62 insertions(+) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> index c79d9992b113..f28d9c563f74 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> @@ -1044,6 +1044,9 @@ struct amdgpu_device { >>>> uint32_t *reset_dump_reg_list; >>>> uint32_t *reset_dump_reg_value; >>>> int num_regs; >>>> + struct amdgpu_task_info reset_context_task_info; >>>> + bool reset_context_vram_lost; >>> >>> How about drop the 'context' from name and just reset_task_info and >>> reset_vram_lost ? >> OK. >>> >>>> + struct timespec64 reset_time; >>>> struct amdgpu_reset_domain *reset_domain; >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> index 963c897a76e6..f9b710e741a7 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> @@ -32,6 +32,8 @@ >>>> #include <linux/slab.h> >>>> #include <linux/iommu.h> >>>> #include <linux/pci.h> >>>> +#include <linux/devcoredump.h> >>>> +#include <generated/utsrelease.h> >>>> #include <drm/drm_atomic_helper.h> >>>> #include <drm/drm_probe_helper.h> >>>> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct >>>> amdgpu_device *adev) >>>> return 0; >>>> } >>>> +#ifdef CONFIG_DEV_COREDUMP >>>> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, >>>> + size_t count, void *data, size_t datalen) >>>> +{ >>>> + struct drm_printer p; >>>> + struct amdgpu_device *adev = data; >>>> + struct drm_print_iterator iter; >>>> + int i; >>>> + >>> >>> A NULL check for 'buffer' here could prevent a segfault later. >>> >> Agreed. >>>> + iter.data = buffer; >>>> + iter.offset = 0; >>>> + iter.start = offset; >>>> + iter.remain = count; >>>> + >>>> + p = drm_coredump_printer(&iter); >>>> + >>>> + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); >>>> + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); >>>> + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); >>>> + drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, >>>> adev->reset_time.tv_nsec); >>>> + if (adev->reset_context_task_info.pid) >>>> + drm_printf(&p, "process_name: %s PID: %d\n", >>>> + adev->reset_context_task_info.process_name, >>>> + adev->reset_context_task_info.pid); >>> Please fix the alignment of print variables. >>> >> I will cross check this. >>>> + >>>> + if (adev->reset_context_vram_lost) >>>> + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); >>>> + if (adev->num_regs) { >>>> + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); >>>> + >>>> + for (i = 0; i < adev->num_regs; i++) >>>> + drm_printf(&p, "0x%08x: 0x%08x\n", >>>> + adev->reset_dump_reg_list[i], >>>> + adev->reset_dump_reg_value[i]); >>>> + } >>>> + >>>> + return count - iter.remain; >>>> +} >>>> + >>>> +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device >>>> *adev) >>>> +{ >>>> + struct drm_device *dev = adev_to_drm(adev); >>>> + >>>> + ktime_get_ts64(&adev->reset_time); >>>> + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, >>>> + amdgpu_devcoredump_read, NULL); >>> instead of registering NULL as free function, I would prefer you to >>> have a dummy no_op free function registered, which we can consume if >>> something changes. >> you mean something like this (function without any code): >> staticvoidamdgpu_devcoredump_free(void*data) >> { >> } > > Yes, precisely. > >>>> +} >>>> +#endif >>>> + >>>> int amdgpu_do_asic_reset(struct list_head *device_list_handle, >>>> struct amdgpu_reset_context *reset_context) >>>> { >>>> @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head >>>> *device_list_handle, >>>> goto out; >>>> vram_lost = >>>> amdgpu_device_check_vram_lost(tmp_adev); >>>> +#ifdef CONFIG_DEV_COREDUMP >>>> + tmp_adev->reset_context_vram_lost = vram_lost; >>>> + tmp_adev->reset_context_task_info.pid = 0; >>> why is the PID hardcoded to 0 ? >> in case of reset context reset_context->job->vm is null (possibility >> that reset can be non VM related). >> If we don't set tmp_adev->reset_context_task_info.pid = 0, it will >> show previous reset valid PID. >> > > But when the VM is not NULL, are we updating this PID somewhere ? I > did not see that happening in this series. This is the only place where PID get updated. For example sequence of operation like: 1st reset: -valid VM and tmp_adev->reset_context_task_info.pid is set to some valid PID 2nd reset: -invalid VM -tmp_adev context will remain same (adev context will be same after successful GPU reset sequence). -tmp_adev->reset_context_task_info.pid will be holding 1st reset PID value if we don't set it 0, it can give wrong PID. Regards, S.Amarnath > > - Shashank > >> >> Regards, >> S.Amarnath >>>> + if (reset_context->job && reset_context->job->vm) >>>> + tmp_adev->reset_context_task_info = >>>> + reset_context->job->vm->task_info; >>>> + amdgpu_reset_capture_coredumpm(tmp_adev); >>>> +#endif >>>> if (vram_lost) { >>>> DRM_INFO("VRAM is lost due to GPU reset!\n"); >>>> >>> - Shashank >>> amdgpu_inc_vram_lost(tmp_adev); ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-24 12:10 ` Somalapuram, Amaranath @ 2022-05-24 12:50 ` Sharma, Shashank 2022-05-24 13:18 ` Somalapuram, Amaranath 0 siblings, 1 reply; 15+ messages in thread From: Sharma, Shashank @ 2022-05-24 12:50 UTC (permalink / raw) To: Somalapuram, Amaranath, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/24/2022 2:10 PM, Somalapuram, Amaranath wrote: > > On 5/24/2022 3:23 PM, Sharma, Shashank wrote: >> >> >> On 5/24/2022 8:42 AM, Somalapuram, Amaranath wrote: >>> >>> On 5/20/2022 7:52 PM, Sharma, Shashank wrote: >>>> >>>> >>>> On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >>>>> Added device coredump information: >>>>> - Kernel version >>>>> - Module >>>>> - Time >>>>> - VRAM status >>>>> - Guilty process name and PID >>>>> - GPU register dumps >>>>> >>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >>>>> --- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 >>>>> ++++++++++++++++++++++ >>>>> 2 files changed, 62 insertions(+) >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>> index c79d9992b113..f28d9c563f74 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>> @@ -1044,6 +1044,9 @@ struct amdgpu_device { >>>>> uint32_t *reset_dump_reg_list; >>>>> uint32_t *reset_dump_reg_value; >>>>> int num_regs; >>>>> + struct amdgpu_task_info reset_context_task_info; >>>>> + bool reset_context_vram_lost; >>>> >>>> How about drop the 'context' from name and just reset_task_info and >>>> reset_vram_lost ? >>> OK. >>>> >>>>> + struct timespec64 reset_time; >>>>> struct amdgpu_reset_domain *reset_domain; >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> index 963c897a76e6..f9b710e741a7 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> @@ -32,6 +32,8 @@ >>>>> #include <linux/slab.h> >>>>> #include <linux/iommu.h> >>>>> #include <linux/pci.h> >>>>> +#include <linux/devcoredump.h> >>>>> +#include <generated/utsrelease.h> >>>>> #include <drm/drm_atomic_helper.h> >>>>> #include <drm/drm_probe_helper.h> >>>>> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct >>>>> amdgpu_device *adev) >>>>> return 0; >>>>> } >>>>> +#ifdef CONFIG_DEV_COREDUMP >>>>> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, >>>>> + size_t count, void *data, size_t datalen) >>>>> +{ >>>>> + struct drm_printer p; >>>>> + struct amdgpu_device *adev = data; >>>>> + struct drm_print_iterator iter; >>>>> + int i; >>>>> + >>>> >>>> A NULL check for 'buffer' here could prevent a segfault later. >>>> >>> Agreed. >>>>> + iter.data = buffer; >>>>> + iter.offset = 0; >>>>> + iter.start = offset; >>>>> + iter.remain = count; >>>>> + >>>>> + p = drm_coredump_printer(&iter); >>>>> + >>>>> + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); >>>>> + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); >>>>> + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); >>>>> + drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, >>>>> adev->reset_time.tv_nsec); >>>>> + if (adev->reset_context_task_info.pid) >>>>> + drm_printf(&p, "process_name: %s PID: %d\n", >>>>> + adev->reset_context_task_info.process_name, >>>>> + adev->reset_context_task_info.pid); >>>> Please fix the alignment of print variables. >>>> >>> I will cross check this. >>>>> + >>>>> + if (adev->reset_context_vram_lost) >>>>> + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); >>>>> + if (adev->num_regs) { >>>>> + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); >>>>> + >>>>> + for (i = 0; i < adev->num_regs; i++) >>>>> + drm_printf(&p, "0x%08x: 0x%08x\n", >>>>> + adev->reset_dump_reg_list[i], >>>>> + adev->reset_dump_reg_value[i]); >>>>> + } >>>>> + >>>>> + return count - iter.remain; >>>>> +} >>>>> + >>>>> +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device >>>>> *adev) >>>>> +{ >>>>> + struct drm_device *dev = adev_to_drm(adev); >>>>> + >>>>> + ktime_get_ts64(&adev->reset_time); >>>>> + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, >>>>> + amdgpu_devcoredump_read, NULL); >>>> instead of registering NULL as free function, I would prefer you to >>>> have a dummy no_op free function registered, which we can consume if >>>> something changes. >>> you mean something like this (function without any code): >>> staticvoidamdgpu_devcoredump_free(void*data) >>> { >>> } >> >> Yes, precisely. >> >>>>> +} >>>>> +#endif >>>>> + >>>>> int amdgpu_do_asic_reset(struct list_head *device_list_handle, >>>>> struct amdgpu_reset_context *reset_context) >>>>> { >>>>> @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head >>>>> *device_list_handle, >>>>> goto out; >>>>> vram_lost = >>>>> amdgpu_device_check_vram_lost(tmp_adev); >>>>> +#ifdef CONFIG_DEV_COREDUMP >>>>> + tmp_adev->reset_context_vram_lost = vram_lost; >>>>> + tmp_adev->reset_context_task_info.pid = 0; >>>> why is the PID hardcoded to 0 ? >>> in case of reset context reset_context->job->vm is null (possibility >>> that reset can be non VM related). >>> If we don't set tmp_adev->reset_context_task_info.pid = 0, it will >>> show previous reset valid PID. >>> >> >> But when the VM is not NULL, are we updating this PID somewhere ? I >> did not see that happening in this series. > This is the only place where PID get updated. > For example sequence of operation like: > 1st reset: > -valid VM and tmp_adev->reset_context_task_info.pid is set to some valid > PID > 2nd reset: > -invalid VM > -tmp_adev context will remain same (adev context will be same after > successful GPU reset sequence). > -tmp_adev->reset_context_task_info.pid will be holding 1st reset PID value > > if we don't set it 0, it can give wrong PID. > I get your point, and agree, that when the vm is invalid, PID is invalid so we should send pid=0. But where are you handling the case when VM is valid and pid is valid ? - Shashank > Regards, > S.Amarnath >> >> - Shashank >> >>> >>> Regards, >>> S.Amarnath >>>>> + if (reset_context->job && reset_context->job->vm) >>>>> + tmp_adev->reset_context_task_info = >>>>> + reset_context->job->vm->task_info; >>>>> + amdgpu_reset_capture_coredumpm(tmp_adev); >>>>> +#endif >>>>> if (vram_lost) { >>>>> DRM_INFO("VRAM is lost due to GPU reset!\n"); >>>>> >>>> - Shashank >>>> amdgpu_inc_vram_lost(tmp_adev); ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-24 12:50 ` Sharma, Shashank @ 2022-05-24 13:18 ` Somalapuram, Amaranath 2022-05-24 15:04 ` Sharma, Shashank 0 siblings, 1 reply; 15+ messages in thread From: Somalapuram, Amaranath @ 2022-05-24 13:18 UTC (permalink / raw) To: Sharma, Shashank, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/24/2022 6:20 PM, Sharma, Shashank wrote: > > > On 5/24/2022 2:10 PM, Somalapuram, Amaranath wrote: >> >> On 5/24/2022 3:23 PM, Sharma, Shashank wrote: >>> >>> >>> On 5/24/2022 8:42 AM, Somalapuram, Amaranath wrote: >>>> >>>> On 5/20/2022 7:52 PM, Sharma, Shashank wrote: >>>>> >>>>> >>>>> On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >>>>>> Added device coredump information: >>>>>> - Kernel version >>>>>> - Module >>>>>> - Time >>>>>> - VRAM status >>>>>> - Guilty process name and PID >>>>>> - GPU register dumps >>>>>> >>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >>>>>> --- >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 >>>>>> ++++++++++++++++++++++ >>>>>> 2 files changed, 62 insertions(+) >>>>>> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>> index c79d9992b113..f28d9c563f74 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>> @@ -1044,6 +1044,9 @@ struct amdgpu_device { >>>>>> uint32_t *reset_dump_reg_list; >>>>>> uint32_t *reset_dump_reg_value; >>>>>> int num_regs; >>>>>> + struct amdgpu_task_info reset_context_task_info; >>>>>> + bool reset_context_vram_lost; >>>>> >>>>> How about drop the 'context' from name and just reset_task_info >>>>> and reset_vram_lost ? >>>> OK. >>>>> >>>>>> + struct timespec64 reset_time; >>>>>> struct amdgpu_reset_domain *reset_domain; >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> index 963c897a76e6..f9b710e741a7 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> @@ -32,6 +32,8 @@ >>>>>> #include <linux/slab.h> >>>>>> #include <linux/iommu.h> >>>>>> #include <linux/pci.h> >>>>>> +#include <linux/devcoredump.h> >>>>>> +#include <generated/utsrelease.h> >>>>>> #include <drm/drm_atomic_helper.h> >>>>>> #include <drm/drm_probe_helper.h> >>>>>> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct >>>>>> amdgpu_device *adev) >>>>>> return 0; >>>>>> } >>>>>> +#ifdef CONFIG_DEV_COREDUMP >>>>>> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, >>>>>> + size_t count, void *data, size_t datalen) >>>>>> +{ >>>>>> + struct drm_printer p; >>>>>> + struct amdgpu_device *adev = data; >>>>>> + struct drm_print_iterator iter; >>>>>> + int i; >>>>>> + >>>>> >>>>> A NULL check for 'buffer' here could prevent a segfault later. >>>>> >>>> Agreed. >>>>>> + iter.data = buffer; >>>>>> + iter.offset = 0; >>>>>> + iter.start = offset; >>>>>> + iter.remain = count; >>>>>> + >>>>>> + p = drm_coredump_printer(&iter); >>>>>> + >>>>>> + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); >>>>>> + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); >>>>>> + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); >>>>>> + drm_printf(&p, "time: %lld.%09ld\n", >>>>>> adev->reset_time.tv_sec, adev->reset_time.tv_nsec); >>>>>> + if (adev->reset_context_task_info.pid) >>>>>> + drm_printf(&p, "process_name: %s PID: %d\n", >>>>>> + adev->reset_context_task_info.process_name, >>>>>> + adev->reset_context_task_info.pid); >>>>> Please fix the alignment of print variables. >>>>> >>>> I will cross check this. >>>>>> + >>>>>> + if (adev->reset_context_vram_lost) >>>>>> + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); >>>>>> + if (adev->num_regs) { >>>>>> + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); >>>>>> + >>>>>> + for (i = 0; i < adev->num_regs; i++) >>>>>> + drm_printf(&p, "0x%08x: 0x%08x\n", >>>>>> + adev->reset_dump_reg_list[i], >>>>>> + adev->reset_dump_reg_value[i]); >>>>>> + } >>>>>> + >>>>>> + return count - iter.remain; >>>>>> +} >>>>>> + >>>>>> +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device >>>>>> *adev) >>>>>> +{ >>>>>> + struct drm_device *dev = adev_to_drm(adev); >>>>>> + >>>>>> + ktime_get_ts64(&adev->reset_time); >>>>>> + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, >>>>>> + amdgpu_devcoredump_read, NULL); >>>>> instead of registering NULL as free function, I would prefer you >>>>> to have a dummy no_op free function registered, which we can >>>>> consume if something changes. >>>> you mean something like this (function without any code): >>>> staticvoidamdgpu_devcoredump_free(void*data) >>>> { >>>> } >>> >>> Yes, precisely. >>> >>>>>> +} >>>>>> +#endif >>>>>> + >>>>>> int amdgpu_do_asic_reset(struct list_head *device_list_handle, >>>>>> struct amdgpu_reset_context *reset_context) >>>>>> { >>>>>> @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head >>>>>> *device_list_handle, >>>>>> goto out; >>>>>> vram_lost = >>>>>> amdgpu_device_check_vram_lost(tmp_adev); >>>>>> +#ifdef CONFIG_DEV_COREDUMP >>>>>> + tmp_adev->reset_context_vram_lost = vram_lost; >>>>>> + tmp_adev->reset_context_task_info.pid = 0; >>>>> why is the PID hardcoded to 0 ? >>>> in case of reset context reset_context->job->vm is null >>>> (possibility that reset can be non VM related). >>>> If we don't set tmp_adev->reset_context_task_info.pid = 0, it will >>>> show previous reset valid PID. >>>> >>> >>> But when the VM is not NULL, are we updating this PID somewhere ? I >>> did not see that happening in this series. >> This is the only place where PID get updated. >> For example sequence of operation like: >> 1st reset: >> -valid VM and tmp_adev->reset_context_task_info.pid is set to some >> valid PID >> 2nd reset: >> -invalid VM >> -tmp_adev context will remain same (adev context will be same after >> successful GPU reset sequence). >> -tmp_adev->reset_context_task_info.pid will be holding 1st reset PID >> value >> >> if we don't set it 0, it can give wrong PID. >> > > I get your point, and agree, that when the vm is invalid, PID is > invalid so we should send pid=0. But where are you handling the case > when VM is valid and pid is valid ? valid VM check in the below condition: -->if (reset_context->job && reset_context->job->vm) reset_context->job->vm->task_info will have the required information. > > - Shashank > >> Regards, >> S.Amarnath >>> >>> - Shashank >>> >>>> >>>> Regards, >>>> S.Amarnath >>>>>> + if (reset_context->job && reset_context->job->vm) >>>>>> + tmp_adev->reset_context_task_info = >>>>>> + reset_context->job->vm->task_info; >>>>>> + amdgpu_reset_capture_coredumpm(tmp_adev); >>>>>> +#endif >>>>>> if (vram_lost) { >>>>>> DRM_INFO("VRAM is lost due to GPU reset!\n"); >>>>>> >>>>> - Shashank >>>>> amdgpu_inc_vram_lost(tmp_adev); ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-24 13:18 ` Somalapuram, Amaranath @ 2022-05-24 15:04 ` Sharma, Shashank 2022-05-24 15:18 ` Somalapuram, Amaranath 0 siblings, 1 reply; 15+ messages in thread From: Sharma, Shashank @ 2022-05-24 15:04 UTC (permalink / raw) To: Somalapuram, Amaranath, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/24/2022 3:18 PM, Somalapuram, Amaranath wrote: > > On 5/24/2022 6:20 PM, Sharma, Shashank wrote: >> >> >> On 5/24/2022 2:10 PM, Somalapuram, Amaranath wrote: >>> >>> On 5/24/2022 3:23 PM, Sharma, Shashank wrote: >>>> >>>> >>>> On 5/24/2022 8:42 AM, Somalapuram, Amaranath wrote: >>>>> >>>>> On 5/20/2022 7:52 PM, Sharma, Shashank wrote: >>>>>> >>>>>> >>>>>> On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >>>>>>> Added device coredump information: >>>>>>> - Kernel version >>>>>>> - Module >>>>>>> - Time >>>>>>> - VRAM status >>>>>>> - Guilty process name and PID >>>>>>> - GPU register dumps >>>>>>> >>>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >>>>>>> --- >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 >>>>>>> ++++++++++++++++++++++ >>>>>>> 2 files changed, 62 insertions(+) >>>>>>> >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>> index c79d9992b113..f28d9c563f74 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>> @@ -1044,6 +1044,9 @@ struct amdgpu_device { >>>>>>> uint32_t *reset_dump_reg_list; >>>>>>> uint32_t *reset_dump_reg_value; >>>>>>> int num_regs; >>>>>>> + struct amdgpu_task_info reset_context_task_info; >>>>>>> + bool reset_context_vram_lost; >>>>>> >>>>>> How about drop the 'context' from name and just reset_task_info >>>>>> and reset_vram_lost ? >>>>> OK. >>>>>> >>>>>>> + struct timespec64 reset_time; >>>>>>> struct amdgpu_reset_domain *reset_domain; >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> index 963c897a76e6..f9b710e741a7 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> @@ -32,6 +32,8 @@ >>>>>>> #include <linux/slab.h> >>>>>>> #include <linux/iommu.h> >>>>>>> #include <linux/pci.h> >>>>>>> +#include <linux/devcoredump.h> >>>>>>> +#include <generated/utsrelease.h> >>>>>>> #include <drm/drm_atomic_helper.h> >>>>>>> #include <drm/drm_probe_helper.h> >>>>>>> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct >>>>>>> amdgpu_device *adev) >>>>>>> return 0; >>>>>>> } >>>>>>> +#ifdef CONFIG_DEV_COREDUMP >>>>>>> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, >>>>>>> + size_t count, void *data, size_t datalen) >>>>>>> +{ >>>>>>> + struct drm_printer p; >>>>>>> + struct amdgpu_device *adev = data; >>>>>>> + struct drm_print_iterator iter; >>>>>>> + int i; >>>>>>> + >>>>>> >>>>>> A NULL check for 'buffer' here could prevent a segfault later. >>>>>> >>>>> Agreed. >>>>>>> + iter.data = buffer; >>>>>>> + iter.offset = 0; >>>>>>> + iter.start = offset; >>>>>>> + iter.remain = count; >>>>>>> + >>>>>>> + p = drm_coredump_printer(&iter); >>>>>>> + >>>>>>> + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); >>>>>>> + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); >>>>>>> + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); >>>>>>> + drm_printf(&p, "time: %lld.%09ld\n", >>>>>>> adev->reset_time.tv_sec, adev->reset_time.tv_nsec); >>>>>>> + if (adev->reset_context_task_info.pid) >>>>>>> + drm_printf(&p, "process_name: %s PID: %d\n", >>>>>>> + adev->reset_context_task_info.process_name, >>>>>>> + adev->reset_context_task_info.pid); >>>>>> Please fix the alignment of print variables. >>>>>> >>>>> I will cross check this. >>>>>>> + >>>>>>> + if (adev->reset_context_vram_lost) >>>>>>> + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); >>>>>>> + if (adev->num_regs) { >>>>>>> + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); >>>>>>> + >>>>>>> + for (i = 0; i < adev->num_regs; i++) >>>>>>> + drm_printf(&p, "0x%08x: 0x%08x\n", >>>>>>> + adev->reset_dump_reg_list[i], >>>>>>> + adev->reset_dump_reg_value[i]); >>>>>>> + } >>>>>>> + >>>>>>> + return count - iter.remain; >>>>>>> +} >>>>>>> + >>>>>>> +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device >>>>>>> *adev) >>>>>>> +{ >>>>>>> + struct drm_device *dev = adev_to_drm(adev); >>>>>>> + >>>>>>> + ktime_get_ts64(&adev->reset_time); >>>>>>> + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, >>>>>>> + amdgpu_devcoredump_read, NULL); >>>>>> instead of registering NULL as free function, I would prefer you >>>>>> to have a dummy no_op free function registered, which we can >>>>>> consume if something changes. >>>>> you mean something like this (function without any code): >>>>> staticvoidamdgpu_devcoredump_free(void*data) >>>>> { >>>>> } >>>> >>>> Yes, precisely. >>>> >>>>>>> +} >>>>>>> +#endif >>>>>>> + >>>>>>> int amdgpu_do_asic_reset(struct list_head *device_list_handle, >>>>>>> struct amdgpu_reset_context *reset_context) >>>>>>> { >>>>>>> @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head >>>>>>> *device_list_handle, >>>>>>> goto out; >>>>>>> vram_lost = >>>>>>> amdgpu_device_check_vram_lost(tmp_adev); >>>>>>> +#ifdef CONFIG_DEV_COREDUMP >>>>>>> + tmp_adev->reset_context_vram_lost = vram_lost; >>>>>>> + tmp_adev->reset_context_task_info.pid = 0; >>>>>> why is the PID hardcoded to 0 ? >>>>> in case of reset context reset_context->job->vm is null >>>>> (possibility that reset can be non VM related). >>>>> If we don't set tmp_adev->reset_context_task_info.pid = 0, it will >>>>> show previous reset valid PID. >>>>> >>>> >>>> But when the VM is not NULL, are we updating this PID somewhere ? I >>>> did not see that happening in this series. >>> This is the only place where PID get updated. >>> For example sequence of operation like: >>> 1st reset: >>> -valid VM and tmp_adev->reset_context_task_info.pid is set to some >>> valid PID >>> 2nd reset: >>> -invalid VM >>> -tmp_adev context will remain same (adev context will be same after >>> successful GPU reset sequence). >>> -tmp_adev->reset_context_task_info.pid will be holding 1st reset PID >>> value >>> >>> if we don't set it 0, it can give wrong PID. >>> >> >> I get your point, and agree, that when the vm is invalid, PID is >> invalid so we should send pid=0. But where are you handling the case >> when VM is valid and pid is valid ? > valid VM check in the below condition: > -->if (reset_context->job && reset_context->job->vm) > reset_context->job->vm->task_info will have the required information. And how will that reflect on tmp_adev->reset_context_task_info.pid ? The point is, if this parameter is being set to zero, means it is being used somewhere, so when and how will this become non-zero ? - Shashank >> >> - Shashank >> >>> Regards, >>> S.Amarnath >>>> >>>> - Shashank >>>> >>>>> >>>>> Regards, >>>>> S.Amarnath >>>>>>> + if (reset_context->job && reset_context->job->vm) >>>>>>> + tmp_adev->reset_context_task_info = >>>>>>> + reset_context->job->vm->task_info; >>>>>>> + amdgpu_reset_capture_coredumpm(tmp_adev); >>>>>>> +#endif >>>>>>> if (vram_lost) { >>>>>>> DRM_INFO("VRAM is lost due to GPU reset!\n"); >>>>>>> >>>>>> - Shashank >>>>>> amdgpu_inc_vram_lost(tmp_adev); ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-24 15:04 ` Sharma, Shashank @ 2022-05-24 15:18 ` Somalapuram, Amaranath 2022-05-24 17:27 ` Sharma, Shashank 0 siblings, 1 reply; 15+ messages in thread From: Somalapuram, Amaranath @ 2022-05-24 15:18 UTC (permalink / raw) To: Sharma, Shashank, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/24/2022 8:34 PM, Sharma, Shashank wrote: > > > On 5/24/2022 3:18 PM, Somalapuram, Amaranath wrote: >> >> On 5/24/2022 6:20 PM, Sharma, Shashank wrote: >>> >>> >>> On 5/24/2022 2:10 PM, Somalapuram, Amaranath wrote: >>>> >>>> On 5/24/2022 3:23 PM, Sharma, Shashank wrote: >>>>> >>>>> >>>>> On 5/24/2022 8:42 AM, Somalapuram, Amaranath wrote: >>>>>> >>>>>> On 5/20/2022 7:52 PM, Sharma, Shashank wrote: >>>>>>> >>>>>>> >>>>>>> On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >>>>>>>> Added device coredump information: >>>>>>>> - Kernel version >>>>>>>> - Module >>>>>>>> - Time >>>>>>>> - VRAM status >>>>>>>> - Guilty process name and PID >>>>>>>> - GPU register dumps >>>>>>>> >>>>>>>> Signed-off-by: Somalapuram Amaranath >>>>>>>> <Amaranath.Somalapuram@amd.com> >>>>>>>> --- >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 >>>>>>>> ++++++++++++++++++++++ >>>>>>>> 2 files changed, 62 insertions(+) >>>>>>>> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>> index c79d9992b113..f28d9c563f74 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>> @@ -1044,6 +1044,9 @@ struct amdgpu_device { >>>>>>>> uint32_t *reset_dump_reg_list; >>>>>>>> uint32_t *reset_dump_reg_value; >>>>>>>> int num_regs; >>>>>>>> + struct amdgpu_task_info reset_context_task_info; >>>>>>>> + bool reset_context_vram_lost; >>>>>>> >>>>>>> How about drop the 'context' from name and just reset_task_info >>>>>>> and reset_vram_lost ? >>>>>> OK. >>>>>>> >>>>>>>> + struct timespec64 reset_time; >>>>>>>> struct amdgpu_reset_domain *reset_domain; >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> index 963c897a76e6..f9b710e741a7 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> @@ -32,6 +32,8 @@ >>>>>>>> #include <linux/slab.h> >>>>>>>> #include <linux/iommu.h> >>>>>>>> #include <linux/pci.h> >>>>>>>> +#include <linux/devcoredump.h> >>>>>>>> +#include <generated/utsrelease.h> >>>>>>>> #include <drm/drm_atomic_helper.h> >>>>>>>> #include <drm/drm_probe_helper.h> >>>>>>>> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct >>>>>>>> amdgpu_device *adev) >>>>>>>> return 0; >>>>>>>> } >>>>>>>> +#ifdef CONFIG_DEV_COREDUMP >>>>>>>> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t >>>>>>>> offset, >>>>>>>> + size_t count, void *data, size_t datalen) >>>>>>>> +{ >>>>>>>> + struct drm_printer p; >>>>>>>> + struct amdgpu_device *adev = data; >>>>>>>> + struct drm_print_iterator iter; >>>>>>>> + int i; >>>>>>>> + >>>>>>> >>>>>>> A NULL check for 'buffer' here could prevent a segfault later. >>>>>>> >>>>>> Agreed. >>>>>>>> + iter.data = buffer; >>>>>>>> + iter.offset = 0; >>>>>>>> + iter.start = offset; >>>>>>>> + iter.remain = count; >>>>>>>> + >>>>>>>> + p = drm_coredump_printer(&iter); >>>>>>>> + >>>>>>>> + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); >>>>>>>> + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); >>>>>>>> + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); >>>>>>>> + drm_printf(&p, "time: %lld.%09ld\n", >>>>>>>> adev->reset_time.tv_sec, adev->reset_time.tv_nsec); >>>>>>>> + if (adev->reset_context_task_info.pid) >>>>>>>> + drm_printf(&p, "process_name: %s PID: %d\n", >>>>>>>> + adev->reset_context_task_info.process_name, >>>>>>>> + adev->reset_context_task_info.pid); >>>>>>> Please fix the alignment of print variables. >>>>>>> >>>>>> I will cross check this. >>>>>>>> + >>>>>>>> + if (adev->reset_context_vram_lost) >>>>>>>> + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); >>>>>>>> + if (adev->num_regs) { >>>>>>>> + drm_printf(&p, "AMDGPU register dumps:\nOffset: >>>>>>>> Value:\n"); >>>>>>>> + >>>>>>>> + for (i = 0; i < adev->num_regs; i++) >>>>>>>> + drm_printf(&p, "0x%08x: 0x%08x\n", >>>>>>>> + adev->reset_dump_reg_list[i], >>>>>>>> + adev->reset_dump_reg_value[i]); >>>>>>>> + } >>>>>>>> + >>>>>>>> + return count - iter.remain; >>>>>>>> +} >>>>>>>> + >>>>>>>> +static void amdgpu_reset_capture_coredumpm(struct >>>>>>>> amdgpu_device *adev) >>>>>>>> +{ >>>>>>>> + struct drm_device *dev = adev_to_drm(adev); >>>>>>>> + >>>>>>>> + ktime_get_ts64(&adev->reset_time); >>>>>>>> + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, >>>>>>>> + amdgpu_devcoredump_read, NULL); >>>>>>> instead of registering NULL as free function, I would prefer you >>>>>>> to have a dummy no_op free function registered, which we can >>>>>>> consume if something changes. >>>>>> you mean something like this (function without any code): >>>>>> staticvoidamdgpu_devcoredump_free(void*data) >>>>>> { >>>>>> } >>>>> >>>>> Yes, precisely. >>>>> >>>>>>>> +} >>>>>>>> +#endif >>>>>>>> + >>>>>>>> int amdgpu_do_asic_reset(struct list_head *device_list_handle, >>>>>>>> struct amdgpu_reset_context *reset_context) >>>>>>>> { >>>>>>>> @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct >>>>>>>> list_head *device_list_handle, >>>>>>>> goto out; >>>>>>>> vram_lost = >>>>>>>> amdgpu_device_check_vram_lost(tmp_adev); >>>>>>>> +#ifdef CONFIG_DEV_COREDUMP >>>>>>>> + tmp_adev->reset_context_vram_lost = vram_lost; >>>>>>>> + tmp_adev->reset_context_task_info.pid = 0; >>>>>>> why is the PID hardcoded to 0 ? >>>>>> in case of reset context reset_context->job->vm is null >>>>>> (possibility that reset can be non VM related). >>>>>> If we don't set tmp_adev->reset_context_task_info.pid = 0, it >>>>>> will show previous reset valid PID. >>>>>> >>>>> >>>>> But when the VM is not NULL, are we updating this PID somewhere ? >>>>> I did not see that happening in this series. >>>> This is the only place where PID get updated. >>>> For example sequence of operation like: >>>> 1st reset: >>>> -valid VM and tmp_adev->reset_context_task_info.pid is set to some >>>> valid PID >>>> 2nd reset: >>>> -invalid VM >>>> -tmp_adev context will remain same (adev context will be same after >>>> successful GPU reset sequence). >>>> -tmp_adev->reset_context_task_info.pid will be holding 1st reset >>>> PID value >>>> >>>> if we don't set it 0, it can give wrong PID. >>>> >>> >>> I get your point, and agree, that when the vm is invalid, PID is >>> invalid so we should send pid=0. But where are you handling the case >>> when VM is valid and pid is valid ? >> valid VM check in the below condition: >> -->if (reset_context->job && reset_context->job->vm) >> reset_context->job->vm->task_info will have the required information. > > And how will that reflect on tmp_adev->reset_context_task_info.pid ? > The point is, if this parameter is being set to zero, means it is > being used somewhere, so when and how will this become non-zero ? > #ifdef CONFIG_DEV_COREDUMP tmp_adev->reset_context_vram_lost = vram_lost; tmp_adev->reset_context_task_info.pid = 0; if (reset_context->job && reset_context->job->vm) tmp_adev->reset_context_task_info = reset_context->job->vm->task_info; amdgpu_reset_capture_coredumpm(tmp_adev); #endif tmp_adev->reset_context_task_info refers to reset_context->job->vm->task_info, i am not setting PID and process name separately, instead i am having the reference to the reset_context->job->vm->task_info on valid VM. Regards, S.Amarnath > - Shashank > >>> >>> - Shashank >>> >>>> Regards, >>>> S.Amarnath >>>>> >>>>> - Shashank >>>>> >>>>>> >>>>>> Regards, >>>>>> S.Amarnath >>>>>>>> + if (reset_context->job && reset_context->job->vm) >>>>>>>> + tmp_adev->reset_context_task_info = >>>>>>>> + reset_context->job->vm->task_info; >>>>>>>> + amdgpu_reset_capture_coredumpm(tmp_adev); >>>>>>>> +#endif >>>>>>>> if (vram_lost) { >>>>>>>> DRM_INFO("VRAM is lost due to GPU >>>>>>>> reset!\n"); >>>>>>>> >>>>>>> - Shashank >>>>>>> amdgpu_inc_vram_lost(tmp_adev); ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support 2022-05-24 15:18 ` Somalapuram, Amaranath @ 2022-05-24 17:27 ` Sharma, Shashank 0 siblings, 0 replies; 15+ messages in thread From: Sharma, Shashank @ 2022-05-24 17:27 UTC (permalink / raw) To: Somalapuram, Amaranath, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig (snip) > #ifdef CONFIG_DEV_COREDUMP > tmp_adev->reset_context_vram_lost = > vram_lost; > tmp_adev->reset_context_task_info.pid = 0; > if (reset_context->job && > reset_context->job->vm) > tmp_adev->reset_context_task_info = > reset_context->job->vm->task_info; > amdgpu_reset_capture_coredumpm(tmp_adev); > #endif > tmp_adev->reset_context_task_info refers to > reset_context->job->vm->task_info, > i am not setting PID and process name separately, instead i am having > the reference to the reset_context->job->vm->task_info on valid VM. > I think you are not getting my point. struct amdgpu_task_info has 3 more parameters: process_name, task_name and tgid. When VRAM is not lost: we are getting the whole new amdgpu_task_info structure, and all parameters are valid. When VRAM is lost: we should reset all parameters of amdgpu_task_info structure, not just pid. Else other params may contain garbage. So what I mean is: instead of tmp_adev->reset_context_task_info.pid = 0; do something like: memset(&tmp_adev->reset_context_task_info.pid, 0, sizeof(reset_context_task_info)); - Shashank ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump 2022-05-20 13:49 [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Somalapuram Amaranath 2022-05-20 13:49 ` [PATCH v1 2/2] drm/amdgpu: adding device coredump support Somalapuram Amaranath @ 2022-05-20 14:06 ` Sharma, Shashank 2022-05-24 6:12 ` Somalapuram, Amaranath 1 sibling, 1 reply; 15+ messages in thread From: Sharma, Shashank @ 2022-05-20 14:06 UTC (permalink / raw) To: Somalapuram Amaranath, amd-gfx; +Cc: alexander.deucher, christian.koenig Hey Amar, On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: > Allocate memory for register value and use the same values for devcoredump. > Remove dump_stack reset register dumps. > > Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 ++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++---- > 3 files changed, 12 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 76df583663c7..c79d9992b113 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1042,6 +1042,7 @@ struct amdgpu_device { > > /* reset dump register */ > uint32_t *reset_dump_reg_list; > + uint32_t *reset_dump_reg_value; > int num_regs; > > struct amdgpu_reset_domain *reset_domain; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > index eedb12f6b8a3..942fdbd316f4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > @@ -1683,7 +1683,7 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, > { > struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; > char reg_offset[11]; > - uint32_t *new, *tmp = NULL; > + uint32_t *new, *tmp = NULL, *tmp_value = NULL; > int ret, i = 0, len = 0; > > do { > @@ -1709,17 +1709,24 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, > i++; > } while (len < size); > > + new = krealloc_array(tmp_value, i, sizeof(uint32_t), GFP_KERNEL); tmp_value is initialized to NULL, which means krealloc_array() will behave like kmalloc_array(), is there any particular reason we are adding this variable at all just to use krealloc_array(), and why not use kmalloc_array() directly ? > + if (!new) { > + ret = -ENOMEM; > + goto error_free; > + } > ret = down_write_killable(&adev->reset_domain->sem); > if (ret) > goto error_free; > > swap(adev->reset_dump_reg_list, tmp); > + swap(adev->reset_dump_reg_value, new); > adev->num_regs = i; > up_write(&adev->reset_domain->sem); > ret = size; > > error_free: > kfree(tmp); > + kfree(new); > return ret; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 4daa0e893965..963c897a76e6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -4720,15 +4720,14 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, > > static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) > { > - uint32_t reg_value; > int i; > > lockdep_assert_held(&adev->reset_domain->sem); > - dump_stack(); This should be a part of different patch, where you can give some background on why are we removing this. > > for (i = 0; i < adev->num_regs; i++) { > - reg_value = RREG32(adev->reset_dump_reg_list[i]); > - trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); > + adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); > + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], > + adev->reset_dump_reg_value[i]); > } > > return 0; - Shashank ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump 2022-05-20 14:06 ` [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Sharma, Shashank @ 2022-05-24 6:12 ` Somalapuram, Amaranath 2022-05-24 9:55 ` Sharma, Shashank 0 siblings, 1 reply; 15+ messages in thread From: Somalapuram, Amaranath @ 2022-05-24 6:12 UTC (permalink / raw) To: Sharma, Shashank, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/20/2022 7:36 PM, Sharma, Shashank wrote: > Hey Amar, > > On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >> Allocate memory for register value and use the same values for >> devcoredump. >> Remove dump_stack reset register dumps. >> >> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + >> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 ++++++++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++---- >> 3 files changed, 12 insertions(+), 5 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 76df583663c7..c79d9992b113 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -1042,6 +1042,7 @@ struct amdgpu_device { >> /* reset dump register */ >> uint32_t *reset_dump_reg_list; >> + uint32_t *reset_dump_reg_value; >> int num_regs; >> struct amdgpu_reset_domain *reset_domain; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >> index eedb12f6b8a3..942fdbd316f4 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >> @@ -1683,7 +1683,7 @@ static ssize_t >> amdgpu_reset_dump_register_list_write(struct file *f, >> { >> struct amdgpu_device *adev = (struct amdgpu_device >> *)file_inode(f)->i_private; >> char reg_offset[11]; >> - uint32_t *new, *tmp = NULL; >> + uint32_t *new, *tmp = NULL, *tmp_value = NULL; >> int ret, i = 0, len = 0; >> do { >> @@ -1709,17 +1709,24 @@ static ssize_t >> amdgpu_reset_dump_register_list_write(struct file *f, >> i++; >> } while (len < size); >> + new = krealloc_array(tmp_value, i, sizeof(uint32_t), GFP_KERNEL); > > tmp_value is initialized to NULL, which means krealloc_array() will > behave like kmalloc_array(), is there any particular reason we are > adding this variable at all just to use krealloc_array(), and why not > use kmalloc_array() directly ? I thought of using kmalloc_array() (got little confused on next write cycle), I agree to use kmalloc_array(). Regards, S.Amarnath > >> + if (!new) { >> + ret = -ENOMEM; >> + goto error_free; >> + } >> ret = down_write_killable(&adev->reset_domain->sem); >> if (ret) >> goto error_free; >> swap(adev->reset_dump_reg_list, tmp); >> + swap(adev->reset_dump_reg_value, new); >> adev->num_regs = i; >> up_write(&adev->reset_domain->sem); >> ret = size; >> error_free: >> kfree(tmp); >> + kfree(new); >> return ret; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 4daa0e893965..963c897a76e6 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -4720,15 +4720,14 @@ int amdgpu_device_pre_asic_reset(struct >> amdgpu_device *adev, >> static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) >> { >> - uint32_t reg_value; >> int i; >> lockdep_assert_held(&adev->reset_domain->sem); >> - dump_stack(); > This should be a part of different patch, where you can give some > background on why are we removing this. >> for (i = 0; i < adev->num_regs; i++) { >> - reg_value = RREG32(adev->reset_dump_reg_list[i]); >> - trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); >> + adev->reset_dump_reg_value[i] = >> RREG32(adev->reset_dump_reg_list[i]); >> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], >> + adev->reset_dump_reg_value[i]); >> } >> return 0; > > - Shashank ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump 2022-05-24 6:12 ` Somalapuram, Amaranath @ 2022-05-24 9:55 ` Sharma, Shashank 2022-05-24 11:57 ` Somalapuram, Amaranath 0 siblings, 1 reply; 15+ messages in thread From: Sharma, Shashank @ 2022-05-24 9:55 UTC (permalink / raw) To: Somalapuram, Amaranath, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/24/2022 8:12 AM, Somalapuram, Amaranath wrote: > > On 5/20/2022 7:36 PM, Sharma, Shashank wrote: >> Hey Amar, >> >> On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >>> Allocate memory for register value and use the same values for >>> devcoredump. >>> Remove dump_stack reset register dumps. >>> >>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + >>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 ++++++++- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++---- >>> 3 files changed, 12 insertions(+), 5 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> index 76df583663c7..c79d9992b113 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> @@ -1042,6 +1042,7 @@ struct amdgpu_device { >>> /* reset dump register */ >>> uint32_t *reset_dump_reg_list; >>> + uint32_t *reset_dump_reg_value; >>> int num_regs; >>> struct amdgpu_reset_domain *reset_domain; >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >>> index eedb12f6b8a3..942fdbd316f4 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >>> @@ -1683,7 +1683,7 @@ static ssize_t >>> amdgpu_reset_dump_register_list_write(struct file *f, >>> { >>> struct amdgpu_device *adev = (struct amdgpu_device >>> *)file_inode(f)->i_private; >>> char reg_offset[11]; >>> - uint32_t *new, *tmp = NULL; >>> + uint32_t *new, *tmp = NULL, *tmp_value = NULL; >>> int ret, i = 0, len = 0; >>> do { >>> @@ -1709,17 +1709,24 @@ static ssize_t >>> amdgpu_reset_dump_register_list_write(struct file *f, >>> i++; >>> } while (len < size); >>> + new = krealloc_array(tmp_value, i, sizeof(uint32_t), GFP_KERNEL); >> >> tmp_value is initialized to NULL, which means krealloc_array() will >> behave like kmalloc_array(), is there any particular reason we are >> adding this variable at all just to use krealloc_array(), and why not >> use kmalloc_array() directly ? > > I thought of using kmalloc_array() (got little confused on next write > cycle), I agree to use kmalloc_array(). > > Regards, > S.Amarnath >> >>> + if (!new) { >>> + ret = -ENOMEM; >>> + goto error_free; >>> + } >>> ret = down_write_killable(&adev->reset_domain->sem); >>> if (ret) >>> goto error_free; >>> swap(adev->reset_dump_reg_list, tmp); >>> + swap(adev->reset_dump_reg_value, new); >>> adev->num_regs = i; >>> up_write(&adev->reset_domain->sem); >>> ret = size; >>> error_free: >>> kfree(tmp); >>> + kfree(new); >>> return ret; >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index 4daa0e893965..963c897a76e6 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -4720,15 +4720,14 @@ int amdgpu_device_pre_asic_reset(struct >>> amdgpu_device *adev, >>> static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) >>> { >>> - uint32_t reg_value; >>> int i; >>> lockdep_assert_held(&adev->reset_domain->sem); >>> - dump_stack(); >> This should be a part of different patch, where you can give some >> background on why are we removing this. You missed this comment. - Shashank >>> for (i = 0; i < adev->num_regs; i++) { >>> - reg_value = RREG32(adev->reset_dump_reg_list[i]); >>> - trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); >>> + adev->reset_dump_reg_value[i] = >>> RREG32(adev->reset_dump_reg_list[i]); >>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], >>> + adev->reset_dump_reg_value[i]); >>> } >>> return 0; >> >> - Shashank ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump 2022-05-24 9:55 ` Sharma, Shashank @ 2022-05-24 11:57 ` Somalapuram, Amaranath 0 siblings, 0 replies; 15+ messages in thread From: Somalapuram, Amaranath @ 2022-05-24 11:57 UTC (permalink / raw) To: Sharma, Shashank, Somalapuram Amaranath, amd-gfx Cc: alexander.deucher, christian.koenig On 5/24/2022 3:25 PM, Sharma, Shashank wrote: > > > On 5/24/2022 8:12 AM, Somalapuram, Amaranath wrote: >> >> On 5/20/2022 7:36 PM, Sharma, Shashank wrote: >>> Hey Amar, >>> >>> On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote: >>>> Allocate memory for register value and use the same values for >>>> devcoredump. >>>> Remove dump_stack reset register dumps. >>>> >>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com> >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 ++++++++- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++---- >>>> 3 files changed, 12 insertions(+), 5 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> index 76df583663c7..c79d9992b113 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> @@ -1042,6 +1042,7 @@ struct amdgpu_device { >>>> /* reset dump register */ >>>> uint32_t *reset_dump_reg_list; >>>> + uint32_t *reset_dump_reg_value; >>>> int num_regs; >>>> struct amdgpu_reset_domain *reset_domain; >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >>>> index eedb12f6b8a3..942fdbd316f4 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c >>>> @@ -1683,7 +1683,7 @@ static ssize_t >>>> amdgpu_reset_dump_register_list_write(struct file *f, >>>> { >>>> struct amdgpu_device *adev = (struct amdgpu_device >>>> *)file_inode(f)->i_private; >>>> char reg_offset[11]; >>>> - uint32_t *new, *tmp = NULL; >>>> + uint32_t *new, *tmp = NULL, *tmp_value = NULL; >>>> int ret, i = 0, len = 0; >>>> do { >>>> @@ -1709,17 +1709,24 @@ static ssize_t >>>> amdgpu_reset_dump_register_list_write(struct file *f, >>>> i++; >>>> } while (len < size); >>>> + new = krealloc_array(tmp_value, i, sizeof(uint32_t), >>>> GFP_KERNEL); >>> >>> tmp_value is initialized to NULL, which means krealloc_array() will >>> behave like kmalloc_array(), is there any particular reason we are >>> adding this variable at all just to use krealloc_array(), and why >>> not use kmalloc_array() directly ? >> >> I thought of using kmalloc_array() (got little confused on next write >> cycle), I agree to use kmalloc_array(). >> >> Regards, >> S.Amarnath >>> >>>> + if (!new) { >>>> + ret = -ENOMEM; >>>> + goto error_free; >>>> + } >>>> ret = down_write_killable(&adev->reset_domain->sem); >>>> if (ret) >>>> goto error_free; >>>> swap(adev->reset_dump_reg_list, tmp); >>>> + swap(adev->reset_dump_reg_value, new); >>>> adev->num_regs = i; >>>> up_write(&adev->reset_domain->sem); >>>> ret = size; >>>> error_free: >>>> kfree(tmp); >>>> + kfree(new); >>>> return ret; >>>> } >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> index 4daa0e893965..963c897a76e6 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> @@ -4720,15 +4720,14 @@ int amdgpu_device_pre_asic_reset(struct >>>> amdgpu_device *adev, >>>> static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) >>>> { >>>> - uint32_t reg_value; >>>> int i; >>>> lockdep_assert_held(&adev->reset_domain->sem); >>>> - dump_stack(); >>> This should be a part of different patch, where you can give some >>> background on why are we removing this. > Will make different patch for this. > You missed this comment. > - Shashank > >>>> for (i = 0; i < adev->num_regs; i++) { >>>> - reg_value = RREG32(adev->reset_dump_reg_list[i]); >>>> - trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], >>>> reg_value); >>>> + adev->reset_dump_reg_value[i] = >>>> RREG32(adev->reset_dump_reg_list[i]); >>>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], >>>> + adev->reset_dump_reg_value[i]); >>>> } >>>> return 0; >>> >>> - Shashank ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2022-05-24 17:27 UTC | newest] Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-05-20 13:49 [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Somalapuram Amaranath 2022-05-20 13:49 ` [PATCH v1 2/2] drm/amdgpu: adding device coredump support Somalapuram Amaranath 2022-05-20 14:22 ` Sharma, Shashank 2022-05-24 6:42 ` Somalapuram, Amaranath 2022-05-24 9:53 ` Sharma, Shashank 2022-05-24 12:10 ` Somalapuram, Amaranath 2022-05-24 12:50 ` Sharma, Shashank 2022-05-24 13:18 ` Somalapuram, Amaranath 2022-05-24 15:04 ` Sharma, Shashank 2022-05-24 15:18 ` Somalapuram, Amaranath 2022-05-24 17:27 ` Sharma, Shashank 2022-05-20 14:06 ` [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Sharma, Shashank 2022-05-24 6:12 ` Somalapuram, Amaranath 2022-05-24 9:55 ` Sharma, Shashank 2022-05-24 11:57 ` Somalapuram, Amaranath
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.