On 5/26/2022 3:56 PM, Wang, Yang(Kevin) wrote: > > [AMD Official Use Only - General] > > > > > ------------------------------------------------------------------------ > *From:* amd-gfx on behalf of > Somalapuram Amaranath > *Sent:* Thursday, May 26, 2022 5:48 PM > *To:* amd-gfx@lists.freedesktop.org > *Cc:* Deucher, Alexander ; Somalapuram, > Amaranath ; Koenig, Christian > ; Sharma, Shashank > *Subject:* [PATCH v2 2/2] drm/amdgpu: adding device coredump support > Added device coredump information: > - Kernel version > - Module > - Time > - VRAM status > - Guilty process name and PID > - GPU register dumps > v1 -> v2: Variable name change > v1 -> v2: NULL check > v1 -> v2: Code alignment > v1 -> v2: Adding dummy amdgpu_devcoredump_free > v1 -> v2: memset reset_task_info to zero > > Signed-off-by: Somalapuram Amaranath > --- >  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 + >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 67 ++++++++++++++++++++++ >  2 files changed, 70 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index c79d9992b113..25a7b2c74928 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1044,6 +1044,9 @@ struct amdgpu_device { >          uint32_t *reset_dump_reg_list; >          uint32_t *reset_dump_reg_value; >          int                             num_regs; > +       struct amdgpu_task_info         reset_task_info; > +       bool                            reset_vram_lost; > +       struct timespec64               reset_time; > > [kevin]: > the CONFIG_DEV_COREDUMP check is needed for above variable to avoid > compiler warning when coredump feature is not enabled. > Agreed. 
>          struct amdgpu_reset_domain      *reset_domain; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 866b4980a6fa..ca97afe5be63 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -32,6 +32,8 @@ >  #include >  #include >  #include > +#include > +#include > >  #include >  #include > @@ -4734,6 +4736,62 @@ static int amdgpu_reset_reg_dumps(struct > amdgpu_device *adev) >          return 0; >  } > > +#ifdef CONFIG_DEV_COREDUMP > +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, > +               size_t count, void *data, size_t datalen) > +{ > +       struct drm_printer p; > +       struct amdgpu_device *adev = data; > +       struct drm_print_iterator iter; > +       int i; > + > +       if (adev == NULL) > +               return 0; > [kevin]: >  this check is not needed, because this private data is passed by our > driver as below: > In my testing, if the reset is unsuccessful, amdgpu_devcoredump_read is not called. Shashank, do you have any input on this? 
Regards, S.Amarnath >  dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, >  amdgpu_devcoredump_read, amdgpu_devcoredump_free); > + > +       iter.data = buffer; > +       iter.offset = 0; > +       iter.start = offset; > +       iter.remain = count; > + > +       p = drm_coredump_printer(&iter); > + > +       drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); > +       drm_printf(&p, "kernel: " UTS_RELEASE "\n"); > +       drm_printf(&p, "module: " KBUILD_MODNAME "\n"); > +       drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, > adev->reset_time.tv_nsec); > +       if (adev->reset_task_info.pid) > +               drm_printf(&p, "process_name: %s PID: %d\n", > + adev->reset_task_info.process_name, > + adev->reset_task_info.pid); > + > +       if (adev->reset_vram_lost) > +               drm_printf(&p, "VRAM is lost due to GPU reset!\n"); > +       if (adev->num_regs) { > +               drm_printf(&p, "AMDGPU register dumps:\nOffset:     > Value:\n"); > + > +               for (i = 0; i < adev->num_regs; i++) > +                       drm_printf(&p, "0x%08x: 0x%08x\n", > + adev->reset_dump_reg_list[i], > + adev->reset_dump_reg_value[i]); > +       } > + > +       return count - iter.remain; > +} > + > +static void amdgpu_devcoredump_free(void *data) > +{ > +} > + > +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) > +{ > +       struct drm_device *dev = adev_to_drm(adev); > + > +       ktime_get_ts64(&adev->reset_time); > +       dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, > +                       amdgpu_devcoredump_read, amdgpu_devcoredump_free); > +} > +#endif > + >  int amdgpu_do_asic_reset(struct list_head *device_list_handle, >                           struct amdgpu_reset_context *reset_context) >  { > @@ -4818,6 +4876,15 @@ int amdgpu_do_asic_reset(struct list_head > *device_list_handle, >                                          goto out; > >                                  vram_lost = > 
amdgpu_device_check_vram_lost(tmp_adev); > +#ifdef CONFIG_DEV_COREDUMP > + tmp_adev->reset_vram_lost = vram_lost; > + memset(&tmp_adev->reset_task_info, 0, > + sizeof(tmp_adev->reset_task_info)); > +                               if (reset_context->job && > reset_context->job->vm) > + tmp_adev->reset_task_info = > + reset_context->job->vm->task_info; > + amdgpu_reset_capture_coredumpm(tmp_adev); > +#endif >                                  if (vram_lost) { >                                          DRM_INFO("VRAM is lost due to > GPU reset!\n"); > amdgpu_inc_vram_lost(tmp_adev); > -- > 2.32.0 >