All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Sharma, Shashank" <shashank.sharma@amd.com>
To: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>,
	amd-gfx@lists.freedesktop.org
Cc: alexander.deucher@amd.com, christian.koenig@amd.com
Subject: Re: [PATCH v1 2/2] drm/amdgpu: adding device coredump support
Date: Fri, 20 May 2022 16:22:36 +0200	[thread overview]
Message-ID: <588a0599-7d0c-0041-9877-4429b416e7ed@amd.com> (raw)
In-Reply-To: <20220520134909.92781-2-Amaranath.Somalapuram@amd.com>



On 5/20/2022 3:49 PM, Somalapuram Amaranath wrote:
> Added device coredump information:
> - Kernel version
> - Module
> - Time
> - VRAM status
> - Guilty process name and PID
> - GPU register dumps
> 
> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 ++++++++++++++++++++++
>   2 files changed, 62 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index c79d9992b113..f28d9c563f74 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1044,6 +1044,9 @@ struct amdgpu_device {
>   	uint32_t                        *reset_dump_reg_list;
>   	uint32_t			*reset_dump_reg_value;
>   	int                             num_regs;
> +	struct amdgpu_task_info         reset_context_task_info;
> +	bool                            reset_context_vram_lost;

How about dropping 'context' from the names and using just
reset_task_info and reset_vram_lost?

> +	struct timespec64               reset_time;
>   
>   	struct amdgpu_reset_domain	*reset_domain;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 963c897a76e6..f9b710e741a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -32,6 +32,8 @@
>   #include <linux/slab.h>
>   #include <linux/iommu.h>
>   #include <linux/pci.h>
> +#include <linux/devcoredump.h>
> +#include <generated/utsrelease.h>
>   
>   #include <drm/drm_atomic_helper.h>
>   #include <drm/drm_probe_helper.h>
> @@ -4733,6 +4735,55 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> +#ifdef CONFIG_DEV_COREDUMP
> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> +		size_t count, void *data, size_t datalen)
> +{
> +	struct drm_printer p;
> +	struct amdgpu_device *adev = data;
> +	struct drm_print_iterator iter;
> +	int i;
> +

A NULL check for 'buffer' here could prevent a segfault later.

> +	iter.data = buffer;
> +	iter.offset = 0;
> +	iter.start = offset;
> +	iter.remain = count;
> +
> +	p = drm_coredump_printer(&iter);
> +
> +	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
> +	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
> +	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> +	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
> +	if (adev->reset_context_task_info.pid)
> +		drm_printf(&p, "process_name: %s PID: %d\n",
> +							adev->reset_context_task_info.process_name,
> +							adev->reset_context_task_info.pid);
Please fix the alignment of the drm_printf() arguments.

> +
> +	if (adev->reset_context_vram_lost)
> +		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> +	if (adev->num_regs) {
> +		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
> +
> +		for (i = 0; i < adev->num_regs; i++)
> +			drm_printf(&p, "0x%08x: 0x%08x\n",
> +					adev->reset_dump_reg_list[i],
> +					adev->reset_dump_reg_value[i]);
> +	}
> +
> +	return count - iter.remain;
> +}
> +
> +static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
> +{
> +	struct drm_device *dev = adev_to_drm(adev);
> +
> +	ktime_get_ts64(&adev->reset_time);
> +	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
> +			amdgpu_devcoredump_read, NULL);
Instead of registering NULL as the free function, I would prefer that you
register a dummy no-op free function, which we can make use of if
something changes.
> +}
> +#endif
> +
>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>   			 struct amdgpu_reset_context *reset_context)
>   {
> @@ -4817,6 +4868,14 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>   					goto out;
>   
>   				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
> +#ifdef CONFIG_DEV_COREDUMP
> +				tmp_adev->reset_context_vram_lost = vram_lost;
> +				tmp_adev->reset_context_task_info.pid = 0;
Why is the PID hardcoded to 0?
> +				if (reset_context->job && reset_context->job->vm)
> +					tmp_adev->reset_context_task_info =
> +						reset_context->job->vm->task_info;
> +				amdgpu_reset_capture_coredumpm(tmp_adev);
> +#endif
>   				if (vram_lost) {
>   					DRM_INFO("VRAM is lost due to GPU reset!\n");
>  
- Shashank
  					amdgpu_inc_vram_lost(tmp_adev);

  reply	other threads:[~2022-05-20 14:22 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-20 13:49 [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Somalapuram Amaranath
2022-05-20 13:49 ` [PATCH v1 2/2] drm/amdgpu: adding device coredump support Somalapuram Amaranath
2022-05-20 14:22   ` Sharma, Shashank [this message]
2022-05-24  6:42     ` Somalapuram, Amaranath
2022-05-24  9:53       ` Sharma, Shashank
2022-05-24 12:10         ` Somalapuram, Amaranath
2022-05-24 12:50           ` Sharma, Shashank
2022-05-24 13:18             ` Somalapuram, Amaranath
2022-05-24 15:04               ` Sharma, Shashank
2022-05-24 15:18                 ` Somalapuram, Amaranath
2022-05-24 17:27                   ` Sharma, Shashank
2022-05-20 14:06 ` [PATCH v1 1/2] drm/amdgpu: save the reset dump register value for devcoredump Sharma, Shashank
2022-05-24  6:12   ` Somalapuram, Amaranath
2022-05-24  9:55     ` Sharma, Shashank
2022-05-24 11:57       ` Somalapuram, Amaranath

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=588a0599-7d0c-0041-9877-4429b416e7ed@amd.com \
    --to=shashank.sharma@amd.com \
    --cc=Amaranath.Somalapuram@amd.com \
    --cc=alexander.deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.