All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Add more process info in VM for debug
@ 2018-12-15  8:56 Trigger Huang
       [not found] ` <1544864213-2352-1-git-send-email-Trigger.Huang-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Trigger Huang @ 2018-12-15  8:56 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: andrey.grodzovsky-5C7GfCeVMHo, Trigger Huang, Jim.Qu-5C7GfCeVMHo

When debugging VMC page fault and ring hang issues, the detailed
process information is quite helpful, especially when the issue
can only be reproduced after a very long time running. With this
information, running only the specific sub-testcase may also
reproduce the issue, which may save a lot of time for debugging.

With this patch, the process information is similar to the following.
	When VMC page fault issue happened:
[  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0 ring:171 vmid:2 pasid:32769),
[  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
[  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27

	When ring hang issue happened:
[ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572
[ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
[ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4

Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 1c49b82..1a2d0c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 		p->job->uf_addr = uf_offset;
 	kfree(chunk_array);
 
-	/* Use this opportunity to fill in task info for the vm */
-	amdgpu_vm_set_task_info(vm);
-
 	return 0;
 
 free_all_kdata:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e0af44f..c75ecb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
 		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
 		  ring->fence_drv.sync_seq);
 
+	if (job->vm) {
+		struct amdgpu_task_info *ti = &(job->vm->task_info);
+
+		DRM_ERROR("The job's process information is as below:\n");
+		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
+			ti->process_name, ti->task_name, ti->cmd_line);
+	}
+
 	if (amdgpu_device_should_recover_gpu(ring->adev))
 		amdgpu_device_gpu_recover(ring->adev, job);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index e73d152..24f3cbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -30,6 +30,7 @@
 #include <linux/idr.h>
 #include <drm/drmP.h>
 #include <drm/amdgpu_drm.h>
+#include <linux/string_helpers.h>
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
@@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 			goto error_free_root;
 
 		vm->pasid = pasid;
+
+		amdgpu_vm_set_task_info(vm);
 	}
 
 	vm->fault_hash = init_fault_hash();
@@ -3223,6 +3226,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
 		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
+
+		kfree(vm->task_info.cmd_line);
+		vm->task_info.cmd_line = NULL;
 	}
 
 	kfree(vm->fault_hash);
@@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
 			vm->task_info.tgid = current->group_leader->pid;
 			get_task_comm(vm->task_info.process_name, current->group_leader);
 		}
+
+		vm->task_info.cmd_line =
+				kstrdup_quotable_cmdline(current, GFP_KERNEL);
+		if (!vm->task_info.cmd_line)
+			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index e8dcfd5..9fab787 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
 struct amdgpu_task_info {
 	char	process_name[TASK_COMM_LEN];
 	char	task_name[TASK_COMM_LEN];
+	char    *cmd_line;
 	pid_t	pid;
 	pid_t	tgid;
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index bacdaef..c3e3558 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
 
 		dev_err(adev->dev,
-			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
-			entry->vmid_src ? "mmhub" : "gfxhub",
-			entry->src_id, entry->ring_id, entry->vmid,
-			entry->pasid, task_info.process_name, task_info.tgid,
-			task_info.task_name, task_info.pid);
+			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
+			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
+			entry->ring_id, entry->vmid, entry->pasid);
+		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
+			task_info.process_name, task_info.tgid,
+			task_info.task_name, task_info.pid, task_info.cmd_line);
 		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
 			addr, entry->client_id);
 		if (!amdgpu_sriov_vf(adev))
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
       [not found] ` <1544864213-2352-1-git-send-email-Trigger.Huang-5C7GfCeVMHo@public.gmane.org>
@ 2018-12-15 12:22   ` Christian König
       [not found]     ` <3411fb4a-119f-8531-4597-7eb185a63c59-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Christian König @ 2018-12-15 12:22 UTC (permalink / raw)
  To: Trigger Huang, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: andrey.grodzovsky-5C7GfCeVMHo, Jim.Qu-5C7GfCeVMHo

Am 15.12.18 um 09:56 schrieb Trigger Huang:
> When debugging VMC page fault and ring hang issues, the detailed
> process information is quite helpful, especially when the issue
> can only be reproduced after a very long time running. With this
> information, only run the specific sub-testcase may also will
> reproduce the issue, which may save a lot of time for debugging.
>
> With this patch, the process information is similar as following.
> 	When VMC page fault issue happened:
> [  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0 ring:171 vmid:2 pasid:32769),
> [  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
> [  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27
>
> 	When ring hang issue happened:
> [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572
> [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
> [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4
>
> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>

Well NAK, we intentionally didn't do it this way.

First of all you can't get the process info during VM creating since 
that can happen in X as well.

Second when a timeout happen the VM structure might already be released, 
so using job->vm is illegal here. What we could try is to get the VM 
using the PASID.

And last we don't want to keep the full command line around.

The only valid addition I can see here is to print the thread info when 
the timeout happens.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
>   5 files changed, 26 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 1c49b82..1a2d0c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   		p->job->uf_addr = uf_offset;
>   	kfree(chunk_array);
>   
> -	/* Use this opportunity to fill in task info for the vm */
> -	amdgpu_vm_set_task_info(vm);
> -
>   	return 0;
>   
>   free_all_kdata:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0af44f..c75ecb3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>   		  ring->fence_drv.sync_seq);
>   
> +	if (job->vm) {
> +		struct amdgpu_task_info *ti = &(job->vm->task_info);
> +
> +		DRM_ERROR("The job's process information is as below:\n");
> +		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
> +			ti->process_name, ti->task_name, ti->cmd_line);
> +	}
> +
>   	if (amdgpu_device_should_recover_gpu(ring->adev))
>   		amdgpu_device_gpu_recover(ring->adev, job);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index e73d152..24f3cbd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -30,6 +30,7 @@
>   #include <linux/idr.h>
>   #include <drm/drmP.h>
>   #include <drm/amdgpu_drm.h>
> +#include <linux/string_helpers.h>
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
>   #include "amdgpu_amdkfd.h"
> @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   			goto error_free_root;
>   
>   		vm->pasid = pasid;
> +
> +		amdgpu_vm_set_task_info(vm);
>   	}
>   
>   	vm->fault_hash = init_fault_hash();
> @@ -3223,6 +3226,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
>   		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
>   		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
> +
> +		kfree(vm->task_info.cmd_line);
> +		vm->task_info.cmd_line = NULL;
>   	}
>   
>   	kfree(vm->fault_hash);
> @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>   			vm->task_info.tgid = current->group_leader->pid;
>   			get_task_comm(vm->task_info.process_name, current->group_leader);
>   		}
> +
> +		vm->task_info.cmd_line =
> +				kstrdup_quotable_cmdline(current, GFP_KERNEL);
> +		if (!vm->task_info.cmd_line)
> +			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index e8dcfd5..9fab787 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
>   struct amdgpu_task_info {
>   	char	process_name[TASK_COMM_LEN];
>   	char	task_name[TASK_COMM_LEN];
> +	char    *cmd_line;
>   	pid_t	pid;
>   	pid_t	tgid;
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index bacdaef..c3e3558 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>   
>   		dev_err(adev->dev,
> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
> -			entry->vmid_src ? "mmhub" : "gfxhub",
> -			entry->src_id, entry->ring_id, entry->vmid,
> -			entry->pasid, task_info.process_name, task_info.tgid,
> -			task_info.task_name, task_info.pid);
> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
> +			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
> +			entry->ring_id, entry->vmid, entry->pasid);
> +		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
> +			task_info.process_name, task_info.tgid,
> +			task_info.task_name, task_info.pid, task_info.cmd_line);
>   		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
>   			addr, entry->client_id);
>   		if (!amdgpu_sriov_vf(adev))

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Add more process info in VM for debug
       [not found]     ` <3411fb4a-119f-8531-4597-7eb185a63c59-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-12-17  3:03       ` Huang, Trigger
       [not found]         ` <BY2PR12MB0545A2EF343926B82173F6C9FEBC0-K//h7OWB4q7Qm/t26ohGYQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Huang, Trigger @ 2018-12-17  3:03 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Grodzovsky, Andrey, Qu, Jim

Hi Christian,

Many thanks for pointing out the mistakes

I have some comments as below, would you help to check again?

First of all you can't get the process info during VM creating since that can happen in X as well.
[Trigger]: Ok, I will keep the original logic, which is to set the vm info in cs. I will still invoke kfree(cmd_line) in amdgpu_vm_fini to avoid a memory leak.

Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
[Trigger]: Ok, I will do it in job timeout like what VMC page fault's handler does

And last we don't want to keep the full command line around.
[Trigger]: well, actually, the detailed command line is just what we want.
For example, there are thousands of sub-cases of one big test case, and for each sub-case, the arguments may also be different.
In some corner cases, the test machine hangs after running the big test case for several hours or even several days, and it is really painful to wait another several hours to reproduce the issue and debug it.
But if we know the last sub-case running on the test machine, then this issue *may* be reproduced by only running the specific sub-case with specific arguments for several rounds,
and in this situation, it will save us a lot of time for reproducing and debugging.
Does this make sense?  If not, how about we add a parameter, such as amdgpu_vm_debug_verbose, to turn on/off the cmd line dumping?


Thanks,
Trigger

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Saturday, December 15, 2018 8:23 PM
To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim <Jim.Qu@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug

Am 15.12.18 um 09:56 schrieb Trigger Huang:
> When debugging VMC page fault and ring hang issues, the detailed 
> process information is quite helpful, especially when the issue can 
> only be reproduced after a very long time running. With this 
> information, only run the specific sub-testcase may also will 
> reproduce the issue, which may save a lot of time for debugging.
>
> With this patch, the process information is similar as following.
> 	When VMC page fault issue happened:
> [  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0 
> ring:171 vmid:2 pasid:32769), [  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
> [  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27
>
> 	When ring hang issue happened:
> [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
> comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572 [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
> [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA 
> pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare 
> --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor 
> --n-test-threads 4
>
> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>

Well NAK, we intentionally didn't do it this way.

First of all you can't get the process info during VM creating since that can happen in X as well.

Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.

And last we don't want to keep the full command line around.

The only valid addition I can see here is to print the thread info when the timeout happens.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
>   5 files changed, 26 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 1c49b82..1a2d0c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   		p->job->uf_addr = uf_offset;
>   	kfree(chunk_array);
>   
> -	/* Use this opportunity to fill in task info for the vm */
> -	amdgpu_vm_set_task_info(vm);
> -
>   	return 0;
>   
>   free_all_kdata:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0af44f..c75ecb3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>   		  ring->fence_drv.sync_seq);
>   
> +	if (job->vm) {
> +		struct amdgpu_task_info *ti = &(job->vm->task_info);
> +
> +		DRM_ERROR("The job's process information is as below:\n");
> +		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
> +			ti->process_name, ti->task_name, ti->cmd_line);
> +	}
> +
>   	if (amdgpu_device_should_recover_gpu(ring->adev))
>   		amdgpu_device_gpu_recover(ring->adev, job);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index e73d152..24f3cbd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -30,6 +30,7 @@
>   #include <linux/idr.h>
>   #include <drm/drmP.h>
>   #include <drm/amdgpu_drm.h>
> +#include <linux/string_helpers.h>
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
>   #include "amdgpu_amdkfd.h"
> @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   			goto error_free_root;
>   
>   		vm->pasid = pasid;
> +
> +		amdgpu_vm_set_task_info(vm);
>   	}
>   
>   	vm->fault_hash = init_fault_hash(); @@ -3223,6 +3226,9 @@ void 
> amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
>   		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
>   		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
> +
> +		kfree(vm->task_info.cmd_line);
> +		vm->task_info.cmd_line = NULL;
>   	}
>   
>   	kfree(vm->fault_hash);
> @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>   			vm->task_info.tgid = current->group_leader->pid;
>   			get_task_comm(vm->task_info.process_name, current->group_leader);
>   		}
> +
> +		vm->task_info.cmd_line =
> +				kstrdup_quotable_cmdline(current, GFP_KERNEL);
> +		if (!vm->task_info.cmd_line)
> +			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index e8dcfd5..9fab787 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
>   struct amdgpu_task_info {
>   	char	process_name[TASK_COMM_LEN];
>   	char	task_name[TASK_COMM_LEN];
> +	char    *cmd_line;
>   	pid_t	pid;
>   	pid_t	tgid;
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index bacdaef..c3e3558 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>   
>   		dev_err(adev->dev,
> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
> -			entry->vmid_src ? "mmhub" : "gfxhub",
> -			entry->src_id, entry->ring_id, entry->vmid,
> -			entry->pasid, task_info.process_name, task_info.tgid,
> -			task_info.task_name, task_info.pid);
> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
> +			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
> +			entry->ring_id, entry->vmid, entry->pasid);
> +		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
> +			task_info.process_name, task_info.tgid,
> +			task_info.task_name, task_info.pid, task_info.cmd_line);
>   		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
>   			addr, entry->client_id);
>   		if (!amdgpu_sriov_vf(adev))

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
       [not found]         ` <BY2PR12MB0545A2EF343926B82173F6C9FEBC0-K//h7OWB4q7Qm/t26ohGYQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-12-17  7:50           ` Koenig, Christian
       [not found]             ` <36a14372-fd14-975b-c06e-1c1081f3c586-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Koenig, Christian @ 2018-12-17  7:50 UTC (permalink / raw)
  To: Huang, Trigger, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Grodzovsky, Andrey, Qu, Jim

Hi Trigger,

> Does this make sense?
Sorry I wasn't clear enough. The key point that we don't want/need the 
full commandline of the process here is that we can already get that 
from the information we have.

E.g. "ps -p 1 -o args" gives you the command line of the process 1.

The only case where directly printing this into the logs is useful is 
when we run into a total system crash and in this case the processed is 
only the trigger, but not the root cause.

Regards,
Christian.

Am 17.12.18 um 04:03 schrieb Huang, Trigger:
> Hi Christian,
>
> Many thanks for pointing out the mistakes
>
> I have some comments as below, would you help to check again?
>
> First of all you can't get the process info during VM creating since that can happen in X as well.
> [Trigger]: Ok, I will keep the original logic, which is that set the vm info in cs . I will still invoke kfree(cmd_line) in amdgpu_vm_fini to avoid memory leak.
>
> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
> [Trigger]: Ok, I will do it in job timeout like what VMC page fault's handler does
>
> And last we don't want to keep the full command line around.
> [Trigger]: well, actually, the detailed command line is just what we want.
> For example, there are thousands of sub-cases of one big test case, and for each sub-case, the arguments may also be different.
> In some corner case,  test machine is hung after running the big test case for several hours even several days, it is really painful to wait another several hours to reproduce it and debug.
> But if we know the last sub-case running on the test machine, then this issues *may* can be reproduced by only running the specific sub-case with specific arguments for several rounds,
> and in this situation, it will save us a lot time for reproducing and debugging.
> Does this make sense?  If not, how about we add a parameter, such as amdgpu_vm_debug_verbose, to turn on/off the cmd line dumping?
>
>
> Thanks,
> Trigger
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Saturday, December 15, 2018 8:23 PM
> To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim <Jim.Qu@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
>
> Am 15.12.18 um 09:56 schrieb Trigger Huang:
>> When debugging VMC page fault and ring hang issues, the detailed
>> process information is quite helpful, especially when the issue can
>> only be reproduced after a very long time running. With this
>> information, only run the specific sub-testcase may also will
>> reproduce the issue, which may save a lot of time for debugging.
>>
>> With this patch, the process information is similar as following.
>> 	When VMC page fault issue happened:
>> [  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0
>> ring:171 vmid:2 pasid:32769), [  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
>> [  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27
>>
>> 	When ring hang issue happened:
>> [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>> comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572 [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
>> [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA
>> pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare
>> --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor
>> --n-test-threads 4
>>
>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> Well NAK, we intentionally didn't do it this way.
>
> First of all you can't get the process info during VM creating since that can happen in X as well.
>
> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
>
> And last we don't want to keep the full command line around.
>
> The only valid addition I can see here is to print the thread info when the timeout happens.
>
> Regards,
> Christian.
>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
>>    5 files changed, 26 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 1c49b82..1a2d0c9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>    		p->job->uf_addr = uf_offset;
>>    	kfree(chunk_array);
>>    
>> -	/* Use this opportunity to fill in task info for the vm */
>> -	amdgpu_vm_set_task_info(vm);
>> -
>>    	return 0;
>>    
>>    free_all_kdata:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index e0af44f..c75ecb3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>    		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>    		  ring->fence_drv.sync_seq);
>>    
>> +	if (job->vm) {
>> +		struct amdgpu_task_info *ti = &(job->vm->task_info);
>> +
>> +		DRM_ERROR("The job's process information is as below:\n");
>> +		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
>> +			ti->process_name, ti->task_name, ti->cmd_line);
>> +	}
>> +
>>    	if (amdgpu_device_should_recover_gpu(ring->adev))
>>    		amdgpu_device_gpu_recover(ring->adev, job);
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index e73d152..24f3cbd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -30,6 +30,7 @@
>>    #include <linux/idr.h>
>>    #include <drm/drmP.h>
>>    #include <drm/amdgpu_drm.h>
>> +#include <linux/string_helpers.h>
>>    #include "amdgpu.h"
>>    #include "amdgpu_trace.h"
>>    #include "amdgpu_amdkfd.h"
>> @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>    			goto error_free_root;
>>    
>>    		vm->pasid = pasid;
>> +
>> +		amdgpu_vm_set_task_info(vm);
>>    	}
>>    
>>    	vm->fault_hash = init_fault_hash(); @@ -3223,6 +3226,9 @@ void
>> amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>    		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
>>    		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
>>    		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>> +
>> +		kfree(vm->task_info.cmd_line);
>> +		vm->task_info.cmd_line = NULL;
>>    	}
>>    
>>    	kfree(vm->fault_hash);
>> @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>    			vm->task_info.tgid = current->group_leader->pid;
>>    			get_task_comm(vm->task_info.process_name, current->group_leader);
>>    		}
>> +
>> +		vm->task_info.cmd_line =
>> +				kstrdup_quotable_cmdline(current, GFP_KERNEL);
>> +		if (!vm->task_info.cmd_line)
>> +			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
>>    	}
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index e8dcfd5..9fab787 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
>>    struct amdgpu_task_info {
>>    	char	process_name[TASK_COMM_LEN];
>>    	char	task_name[TASK_COMM_LEN];
>> +	char    *cmd_line;
>>    	pid_t	pid;
>>    	pid_t	tgid;
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index bacdaef..c3e3558 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>>    		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>    
>>    		dev_err(adev->dev,
>> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
>> -			entry->vmid_src ? "mmhub" : "gfxhub",
>> -			entry->src_id, entry->ring_id, entry->vmid,
>> -			entry->pasid, task_info.process_name, task_info.tgid,
>> -			task_info.task_name, task_info.pid);
>> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
>> +			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
>> +			entry->ring_id, entry->vmid, entry->pasid);
>> +		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
>> +			task_info.process_name, task_info.tgid,
>> +			task_info.task_name, task_info.pid, task_info.cmd_line);
>>    		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
>>    			addr, entry->client_id);
>>    		if (!amdgpu_sriov_vf(adev))

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Add more process info in VM for debug
       [not found]             ` <36a14372-fd14-975b-c06e-1c1081f3c586-5C7GfCeVMHo@public.gmane.org>
@ 2018-12-17  8:27               ` Huang, Trigger
       [not found]                 ` <BY2PR12MB05459446C83A07595D4C42E6FEBC0-K//h7OWB4q7Qm/t26ohGYQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Huang, Trigger @ 2018-12-17  8:27 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Grodzovsky, Andrey, Qu, Jim

Hi Christian,

Yes, if the test machine is still there for debugging, we can login it and check a lot of things, such as ' ps -p 1 -o args ' as you suggested.

But sometimes, the system is not alive anymore, and we only got some log files (such as kern.log ) from QA or customers. 
And at this time, the full command line information in dmesg is quite useful. Let's take the following message for example:
	
	[ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572
	[ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
	[ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4
When customer/QA reported that there is comp ring job timeout when running a big test case, vk-example, but the machine is not alive anymore, and can't  login for debugging. Then we need to re-run the whole vk-example to reproduce this issue, and this will waste a lot of time.
But if we get the specific sub-case in the kern.log file when the job timeout happened, then we can just try that specific one, here it is ' SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4'
Maybe several minutes later, this issue is reproduced.


Thanks,
Trigger.


-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Koenig, Christian
Sent: Monday, December 17, 2018 3:50 PM
To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim <Jim.Qu@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug

Hi Trigger,

> Does this make sense?
Sorry I wasn't clear enough. The key point that we don't want/need the full commandline of the process here is that we can already get that from the information we have.

E.g. "ps -p 1 -o args" gives you the command line of the process 1.

The only case where directly printing this into the logs is useful is when we run into a total system crash, and in this case the process is only the trigger, but not the root cause.

Regards,
Christian.

Am 17.12.18 um 04:03 schrieb Huang, Trigger:
> Hi Christian,
>
> Many thanks for pointing out the mistakes
>
> I have some comments as below, would you help to check again?
>
> First of all you can't get the process info during VM creating since that can happen in X as well.
> [Trigger]: Ok, I will keep the original logic, which is to set the VM info in CS. I will still invoke kfree(cmd_line) in amdgpu_vm_fini to avoid a memory leak.
>
> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
> [Trigger]: Ok, I will do it in job timeout like what VMC page fault's 
> handler does
>
> And last we don't want to keep the full command line around.
> [Trigger]: well, actually, the detailed command line is just what we want.
> For example, there are thousands of sub-cases of one big test case, and for each sub-case, the arguments may also be different.
> In some corner case,  test machine is hung after running the big test case for several hours even several days, it is really painful to wait another several hours to reproduce it and debug.
> But if we know the last sub-case running on the test machine, then 
> this issue *may* be reproduced by only running the specific sub-case with specific arguments for several rounds, and in this situation, it will save us a lot of time for reproducing and debugging.
> Does this make sense?  If not, how about we add a parameter, such as amdgpu_vm_debug_verbose, to turn on/off the cmd line dumping?
>
>
> Thanks,
> Trigger
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Saturday, December 15, 2018 8:23 PM
> To: Huang, Trigger <Trigger.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim 
> <Jim.Qu@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
>
> Am 15.12.18 um 09:56 schrieb Trigger Huang:
>> When debugging VMC page fault and ring hang issues, the detailed 
>> process information is quite helpful, especially when the issue can 
>> only be reproduced after a very long time running. With this 
>> information, only run the specific sub-testcase may also will 
>> reproduce the issue, which may save a lot of time for debugging.
>>
>> With this patch, the process information is similar as following.
>> 	When VMC page fault issue happened:
>> [  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0
>> ring:171 vmid:2 pasid:32769), [  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
>> [  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27
>>
>> 	When ring hang issue happened:
>> [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>> comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572 [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
>> [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>> SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode 
>> goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 
>> --frontend test_executor --n-test-threads 4
>>
>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> Well NAK, we intentionally didn't do it this way.
>
> First of all you can't get the process info during VM creating since that can happen in X as well.
>
> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
>
> And last we don't want to keep the full command line around.
>
> The only valid addition I can see here is to print the thread info when the timeout happens.
>
> Regards,
> Christian.
>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
>>    5 files changed, 26 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 1c49b82..1a2d0c9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>    		p->job->uf_addr = uf_offset;
>>    	kfree(chunk_array);
>>    
>> -	/* Use this opportunity to fill in task info for the vm */
>> -	amdgpu_vm_set_task_info(vm);
>> -
>>    	return 0;
>>    
>>    free_all_kdata:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index e0af44f..c75ecb3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>    		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>    		  ring->fence_drv.sync_seq);
>>    
>> +	if (job->vm) {
>> +		struct amdgpu_task_info *ti = &(job->vm->task_info);
>> +
>> +		DRM_ERROR("The job's process information is as below:\n");
>> +		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
>> +			ti->process_name, ti->task_name, ti->cmd_line);
>> +	}
>> +
>>    	if (amdgpu_device_should_recover_gpu(ring->adev))
>>    		amdgpu_device_gpu_recover(ring->adev, job);
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index e73d152..24f3cbd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -30,6 +30,7 @@
>>    #include <linux/idr.h>
>>    #include <drm/drmP.h>
>>    #include <drm/amdgpu_drm.h>
>> +#include <linux/string_helpers.h>
>>    #include "amdgpu.h"
>>    #include "amdgpu_trace.h"
>>    #include "amdgpu_amdkfd.h"
>> @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>    			goto error_free_root;
>>    
>>    		vm->pasid = pasid;
>> +
>> +		amdgpu_vm_set_task_info(vm);
>>    	}
>>    
>>    	vm->fault_hash = init_fault_hash(); @@ -3223,6 +3226,9 @@ void 
>> amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>    		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
>>    		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
>>    		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>> +
>> +		kfree(vm->task_info.cmd_line);
>> +		vm->task_info.cmd_line = NULL;
>>    	}
>>    
>>    	kfree(vm->fault_hash);
>> @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>    			vm->task_info.tgid = current->group_leader->pid;
>>    			get_task_comm(vm->task_info.process_name, current->group_leader);
>>    		}
>> +
>> +		vm->task_info.cmd_line =
>> +				kstrdup_quotable_cmdline(current, GFP_KERNEL);
>> +		if (!vm->task_info.cmd_line)
>> +			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
>>    	}
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index e8dcfd5..9fab787 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
>>    struct amdgpu_task_info {
>>    	char	process_name[TASK_COMM_LEN];
>>    	char	task_name[TASK_COMM_LEN];
>> +	char    *cmd_line;
>>    	pid_t	pid;
>>    	pid_t	tgid;
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index bacdaef..c3e3558 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>>    		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>    
>>    		dev_err(adev->dev,
>> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
>> -			entry->vmid_src ? "mmhub" : "gfxhub",
>> -			entry->src_id, entry->ring_id, entry->vmid,
>> -			entry->pasid, task_info.process_name, task_info.tgid,
>> -			task_info.task_name, task_info.pid);
>> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
>> +			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
>> +			entry->ring_id, entry->vmid, entry->pasid);
>> +		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
>> +			task_info.process_name, task_info.tgid,
>> +			task_info.task_name, task_info.pid, task_info.cmd_line);
>>    		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
>>    			addr, entry->client_id);
>>    		if (!amdgpu_sriov_vf(adev))

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Add more process info in VM for debug
       [not found]                 ` <BY2PR12MB05459446C83A07595D4C42E6FEBC0-K//h7OWB4q7Qm/t26ohGYQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-12-17  9:02                   ` Liu, Monk
       [not found]                     ` <CY4PR1201MB02452810FED0CD6283C9B18584BC0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Liu, Monk @ 2018-12-17  9:02 UTC (permalink / raw)
  To: Huang, Trigger, Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Grodzovsky, Andrey, Qu, Jim

Hi Christian,

I think for some SRIOV customers they need this rich information, 
Maybe we can use a kernel option to let the user select whether rich or simple information should be printed upon job TDR?
In SRIOV branch we can set it enable by default while set by default disable for drm-next 

/Monk

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Huang, Trigger
Sent: Monday, December 17, 2018 4:27 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim <Jim.Qu@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Add more process info in VM for debug

Hi Christian,

Yes, if the test machine is still there for debugging, we can login it and check a lot of things, such as ' ps -p 1 -o args ' as you suggested.

But sometimes, the system is not alive anymore, and we only got some log files (such as kern.log ) from QA or customers. 
And at this time, the full command line information in dmesg is quite useful. Let's take the following message for example:
	
	[ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572
	[ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
	[ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4 When customer/QA reported that there is comp ring job timeout when running a big test case, vk-example, but the machine is not alive anymore, and can't  login for debugging. Then we need to re-run the whole vk-example to reproduce this issue, and this will waste a lot of time.
But if we get the specific sub-case in the kern.log file when the job timeout happened, then we can just try that specific one, here it is ' SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4'
maybe several minutes later, this issues is reproduced.


Thanks,
Trigger.


-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Koenig, Christian
Sent: Monday, December 17, 2018 3:50 PM
To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim <Jim.Qu@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug

Hi Trigger,

> Does this make sense?
Sorry I wasn't clear enough. The key point that we don't want/need the full commandline of the process here is that we can already get that from the information we have.

E.g. "ps -p 1 -o args" gives you the command line of the process 1.

The only case where directly printing this into the logs is useful is when we run into a total system crash, and in this case the process is only the trigger, but not the root cause.

Regards,
Christian.

Am 17.12.18 um 04:03 schrieb Huang, Trigger:
> Hi Christian,
>
> Many thanks for pointing out the mistakes
>
> I have some comments as below, would you help to check again?
>
> First of all you can't get the process info during VM creating since that can happen in X as well.
> [Trigger]: Ok, I will keep the original logic, which is to set the VM info in CS. I will still invoke kfree(cmd_line) in amdgpu_vm_fini to avoid a memory leak.
>
> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
> [Trigger]: Ok, I will do it in job timeout like what VMC page fault's 
> handler does
>
> And last we don't want to keep the full command line around.
> [Trigger]: well, actually, the detailed command line is just what we want.
> For example, there are thousands of sub-cases of one big test case, and for each sub-case, the arguments may also be different.
> In some corner case,  test machine is hung after running the big test case for several hours even several days, it is really painful to wait another several hours to reproduce it and debug.
> But if we know the last sub-case running on the test machine, then 
> this issue *may* be reproduced by only running the specific sub-case with specific arguments for several rounds, and in this situation, it will save us a lot of time for reproducing and debugging.
> Does this make sense?  If not, how about we add a parameter, such as amdgpu_vm_debug_verbose, to turn on/off the cmd line dumping?
>
>
> Thanks,
> Trigger
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Saturday, December 15, 2018 8:23 PM
> To: Huang, Trigger <Trigger.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim 
> <Jim.Qu@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
>
> Am 15.12.18 um 09:56 schrieb Trigger Huang:
>> When debugging VMC page fault and ring hang issues, the detailed 
>> process information is quite helpful, especially when the issue can 
>> only be reproduced after a very long time running. With this 
>> information, only run the specific sub-testcase may also will 
>> reproduce the issue, which may save a lot of time for debugging.
>>
>> With this patch, the process information is similar as following.
>> 	When VMC page fault issue happened:
>> [  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0
>> ring:171 vmid:2 pasid:32769), [  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
>> [  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27
>>
>> 	When ring hang issue happened:
>> [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>> comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572 [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
>> [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>> SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode 
>> goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 
>> --frontend test_executor --n-test-threads 4
>>
>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> Well NAK, we intentionally didn't do it this way.
>
> First of all you can't get the process info during VM creating since that can happen in X as well.
>
> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
>
> And last we don't want to keep the full command line around.
>
> The only valid addition I can see here is to print the thread info when the timeout happens.
>
> Regards,
> Christian.
>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
>>    5 files changed, 26 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 1c49b82..1a2d0c9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>    		p->job->uf_addr = uf_offset;
>>    	kfree(chunk_array);
>>    
>> -	/* Use this opportunity to fill in task info for the vm */
>> -	amdgpu_vm_set_task_info(vm);
>> -
>>    	return 0;
>>    
>>    free_all_kdata:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index e0af44f..c75ecb3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>    		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>    		  ring->fence_drv.sync_seq);
>>    
>> +	if (job->vm) {
>> +		struct amdgpu_task_info *ti = &(job->vm->task_info);
>> +
>> +		DRM_ERROR("The job's process information is as below:\n");
>> +		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
>> +			ti->process_name, ti->task_name, ti->cmd_line);
>> +	}
>> +
>>    	if (amdgpu_device_should_recover_gpu(ring->adev))
>>    		amdgpu_device_gpu_recover(ring->adev, job);
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index e73d152..24f3cbd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -30,6 +30,7 @@
>>    #include <linux/idr.h>
>>    #include <drm/drmP.h>
>>    #include <drm/amdgpu_drm.h>
>> +#include <linux/string_helpers.h>
>>    #include "amdgpu.h"
>>    #include "amdgpu_trace.h"
>>    #include "amdgpu_amdkfd.h"
>> @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>    			goto error_free_root;
>>    
>>    		vm->pasid = pasid;
>> +
>> +		amdgpu_vm_set_task_info(vm);
>>    	}
>>    
>>    	vm->fault_hash = init_fault_hash(); @@ -3223,6 +3226,9 @@ void 
>> amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>    		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
>>    		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
>>    		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>> +
>> +		kfree(vm->task_info.cmd_line);
>> +		vm->task_info.cmd_line = NULL;
>>    	}
>>    
>>    	kfree(vm->fault_hash);
>> @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>    			vm->task_info.tgid = current->group_leader->pid;
>>    			get_task_comm(vm->task_info.process_name, current->group_leader);
>>    		}
>> +
>> +		vm->task_info.cmd_line =
>> +				kstrdup_quotable_cmdline(current, GFP_KERNEL);
>> +		if (!vm->task_info.cmd_line)
>> +			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
>>    	}
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index e8dcfd5..9fab787 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
>>    struct amdgpu_task_info {
>>    	char	process_name[TASK_COMM_LEN];
>>    	char	task_name[TASK_COMM_LEN];
>> +	char    *cmd_line;
>>    	pid_t	pid;
>>    	pid_t	tgid;
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index bacdaef..c3e3558 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>>    		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>    
>>    		dev_err(adev->dev,
>> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
>> -			entry->vmid_src ? "mmhub" : "gfxhub",
>> -			entry->src_id, entry->ring_id, entry->vmid,
>> -			entry->pasid, task_info.process_name, task_info.tgid,
>> -			task_info.task_name, task_info.pid);
>> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
>> +			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
>> +			entry->ring_id, entry->vmid, entry->pasid);
>> +		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
>> +			task_info.process_name, task_info.tgid,
>> +			task_info.task_name, task_info.pid, task_info.cmd_line);
>>    		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
>>    			addr, entry->client_id);
>>    		if (!amdgpu_sriov_vf(adev))

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
       [not found]                     ` <CY4PR1201MB02452810FED0CD6283C9B18584BC0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2018-12-17 13:38                       ` Koenig, Christian
       [not found]                         ` <6bf51fe6-76d1-73af-8e64-e58802519b3d-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Koenig, Christian @ 2018-12-17 13:38 UTC (permalink / raw)
  To: Liu, Monk, Huang, Trigger, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Grodzovsky, Andrey, Qu, Jim

Hi guys,

We could also print shaders, IBs or textures to figure out what's going 
wrong, but that would be overkill as well.

The log is to notice that something is wrong and not to be a detailed crash 
report.

The PID is perfectly sufficient to identify the process which triggered 
an issue and when the system goes down completely because of this we 
have done something wrong in the first place.

So that is still a NAK, CPU crash reports don't contain the 
command line either.

Regards,
Christian.

Am 17.12.18 um 10:02 schrieb Liu, Monk:
> Hi Christian,
>
> I think for some SRIOV customers they need this rich information,
> Maybe we can use a kernel option to let the user select whether rich or simple information should be printed upon job TDR ?
> In SRIOV branch we can set it enable by default while set by default disable for drm-next
>
> /Monk
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Huang, Trigger
> Sent: Monday, December 17, 2018 4:27 PM
> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim <Jim.Qu@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: Add more process info in VM for debug
>
> Hi Christian,
>
> Yes, if the test machine is still there for debugging, we can login it and check a lot of things, such as ' ps -p 1 -o args ' as you suggested.
>
> But sometimes, the system is not alive anymore, and we only got some log files (such as kern.log ) from QA or customers.
> And at this time, the full command line information in dmesg is quite useful. Let's take the following message for example:
> 	
> 	[ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572
> 	[ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
> 	[ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4 When customer/QA reported that there is comp ring job timeout when running a big test case, vk-example, but the machine is not alive anymore, and can't  login for debugging. Then we need to re-run the whole vk-example to reproduce this issue, and this will waste a lot of time.
> But if we get the specific sub-case in the kern.log file when job timeout happened, then we can only try the specific one, here it is ' SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4'
> maybe several minutes later, this issue is reproduced.
>
>
> Thanks,
> Trigger.
>
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Koenig, Christian
> Sent: Monday, December 17, 2018 3:50 PM
> To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim <Jim.Qu@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
>
> Hi Trigger,
>
>> Does this make sense?
> Sorry I wasn't clear enough. The key point that we don't want/need the full commandline of the process here is that we can already get that from the information we have.
>
> E.g. "ps -p 1 -o args" gives you the command line of the process 1.
>
> The only case where directly printing this into the logs is useful is when we run into a total system crash and in this case the processed is only the trigger, but not the root cause.
>
> Regards,
> Christian.
>
> Am 17.12.18 um 04:03 schrieb Huang, Trigger:
>> Hi Christian,
>>
>> Many thanks for pointing out the mistakes
>>
>> I have some comments as below, would you help to check again?
>>
>> First of all you can't get the process info during VM creating since that can happen in X as well.
>> [Trigger]: Ok, I will keep the original logic, which is that set the vm info in cs . I will still invoke kfree(cmd_line) in amdgpu_vm_fini to avoid memory leak.
>>
>> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
>> [Trigger]: Ok, I will do it in job timeout like what VMC page fault's
>> handler does
>>
>> And last we don't want to keep the full command line around.
>> [Trigger]: well, actually, the detailed command line is just what we want.
>> For example, there are thousands of sub-cases of one big test case, and for each sub-case, the arguments may also be different.
>> In some corner case,  test machine is hung after running the big test case for several hours even several days, it is really painful to wait another several hours to reproduce it and debug.
>> But if we know the last sub-case running on the test machine, then
>> this issue *may* be reproduced by only running the specific sub-case with specific arguments for several rounds, and in this situation, it will save us a lot of time for reproducing and debugging.
>> Does this make sense?  If not, how about we add a parameter, such as amdgpu_vm_debug_verbose, to turn on/off the cmd line dumping?
>>
>>
>> Thanks,
>> Trigger
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Saturday, December 15, 2018 8:23 PM
>> To: Huang, Trigger <Trigger.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim
>> <Jim.Qu@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
>>
>> Am 15.12.18 um 09:56 schrieb Trigger Huang:
>>> When debugging VMC page fault and ring hang issues, the detailed
>>> process information is quite helpful, especially when the issue can
>>> only be reproduced after a very long time running. With this
>>> information, only run the specific sub-testcase may also will
>>> reproduce the issue, which may save a lot of time for debugging.
>>>
>>> With this patch, the process information is similar as following.
>>> 	When VMC page fault issue happened:
>>> [  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0
>>> ring:171 vmid:2 pasid:32769), [  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
>>> [  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27
>>>
>>> 	When ring hang issue happened:
>>> [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>> comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572 [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
>>> [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process
>>> SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode
>>> goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0
>>> --frontend test_executor --n-test-threads 4
>>>
>>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
>> Well NAK, we intentionally didn't do it this way.
>>
>> First of all you can't get the process info during VM creating since that can happen in X as well.
>>
>> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
>>
>> And last we don't want to keep the full command line around.
>>
>> The only valid addition I can see here is to print the thread info when the timeout happens.
>>
>> Regards,
>> Christian.
>>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
>>>     5 files changed, 26 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 1c49b82..1a2d0c9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>     		p->job->uf_addr = uf_offset;
>>>     	kfree(chunk_array);
>>>     
>>> -	/* Use this opportunity to fill in task info for the vm */
>>> -	amdgpu_vm_set_task_info(vm);
>>> -
>>>     	return 0;
>>>     
>>>     free_all_kdata:
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index e0af44f..c75ecb3 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>     		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>>     		  ring->fence_drv.sync_seq);
>>>     
>>> +	if (job->vm) {
>>> +		struct amdgpu_task_info *ti = &(job->vm->task_info);
>>> +
>>> +		DRM_ERROR("The job's process information is as below:\n");
>>> +		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
>>> +			ti->process_name, ti->task_name, ti->cmd_line);
>>> +	}
>>> +
>>>     	if (amdgpu_device_should_recover_gpu(ring->adev))
>>>     		amdgpu_device_gpu_recover(ring->adev, job);
>>>     }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index e73d152..24f3cbd 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -30,6 +30,7 @@
>>>     #include <linux/idr.h>
>>>     #include <drm/drmP.h>
>>>     #include <drm/amdgpu_drm.h>
>>> +#include <linux/string_helpers.h>
>>>     #include "amdgpu.h"
>>>     #include "amdgpu_trace.h"
>>>     #include "amdgpu_amdkfd.h"
>>> @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>>     			goto error_free_root;
>>>     
>>>     		vm->pasid = pasid;
>>> +
>>> +		amdgpu_vm_set_task_info(vm);
>>>     	}
>>>     
>>>     	vm->fault_hash = init_fault_hash(); @@ -3223,6 +3226,9 @@ void
>>> amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>>     		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
>>>     		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
>>>     		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>> +
>>> +		kfree(vm->task_info.cmd_line);
>>> +		vm->task_info.cmd_line = NULL;
>>>     	}
>>>     
>>>     	kfree(vm->fault_hash);
>>> @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>     			vm->task_info.tgid = current->group_leader->pid;
>>>     			get_task_comm(vm->task_info.process_name, current->group_leader);
>>>     		}
>>> +
>>> +		vm->task_info.cmd_line =
>>> +				kstrdup_quotable_cmdline(current, GFP_KERNEL);
>>> +		if (!vm->task_info.cmd_line)
>>> +			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
>>>     	}
>>>     }
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> index e8dcfd5..9fab787 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
>>>     struct amdgpu_task_info {
>>>     	char	process_name[TASK_COMM_LEN];
>>>     	char	task_name[TASK_COMM_LEN];
>>> +	char    *cmd_line;
>>>     	pid_t	pid;
>>>     	pid_t	tgid;
>>>     };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index bacdaef..c3e3558 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>>>     		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>     
>>>     		dev_err(adev->dev,
>>> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
>>> -			entry->vmid_src ? "mmhub" : "gfxhub",
>>> -			entry->src_id, entry->ring_id, entry->vmid,
>>> -			entry->pasid, task_info.process_name, task_info.tgid,
>>> -			task_info.task_name, task_info.pid);
>>> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
>>> +			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
>>> +			entry->ring_id, entry->vmid, entry->pasid);
>>> +		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
>>> +			task_info.process_name, task_info.tgid,
>>> +			task_info.task_name, task_info.pid, task_info.cmd_line);
>>>     		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
>>>     			addr, entry->client_id);
>>>     		if (!amdgpu_sriov_vf(adev))
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Add more process info in VM for debug
       [not found]                         ` <6bf51fe6-76d1-73af-8e64-e58802519b3d-5C7GfCeVMHo@public.gmane.org>
@ 2018-12-18  1:46                           ` Huang, Trigger
  0 siblings, 0 replies; 8+ messages in thread
From: Huang, Trigger @ 2018-12-18  1:46 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Grodzovsky, Andrey, Qu, Jim

Yeah, make sense.
Let me give another patch: [PATCH] drm/amdgpu: print process info when job timeout

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Monday, December 17, 2018 9:39 PM
To: Liu, Monk <Monk.Liu@amd.com>; Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim <Jim.Qu@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug

Hi guys,

We could also print shaders, IBs or textures to figure out what's going wrong, but that would be overkill as well.

The log is to notice that something is wrong and not to be a detailed crash report.

The PID is perfectly sufficient to identify the process which triggered an issue and when the system goes down completely because of this we have done something wrong in the first place.

So that is still a NAK, CPU crash reports don't contain the command line either.

Regards,
Christian.

Am 17.12.18 um 10:02 schrieb Liu, Monk:
> Hi Christian,
>
> I think for some SRIOV customers they need this rich information, 
> Maybe we can use a kernel option to let the user select whether rich or simple information should be printed upon job TDR ?
> In SRIOV branch we can set it enable by default while set by default 
> disable for drm-next
>
> /Monk
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Huang, Trigger
> Sent: Monday, December 17, 2018 4:27 PM
> To: Koenig, Christian <Christian.Koenig@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim 
> <Jim.Qu@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: Add more process info in VM for debug
>
> Hi Christian,
>
> Yes, if the test machine is still there for debugging, we can login it and check a lot of things, such as ' ps -p 1 -o args ' as you suggested.
>
> But sometimes, the system is not alive anymore, and we only got some log files (such as kern.log ) from QA or customers.
> And at this time, the full command line information in dmesg is quite useful. Let's take the following message for example:
> 	
> 	[ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572
> 	[ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
> 	[ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4 When customer/QA reported that there is comp ring job timeout when running a big test case, vk-example, but the machine is not alive anymore, and can't  login for debugging. Then we need to re-run the whole vk-example to reproduce this issue, and this will waste a lot of time.
> But if we get the specific sub-case in the kern.log file when job timeout happened, then we can only try the specific one, here it is ' SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4'
> maybe several minutes later, this issue is reproduced.
>
>
> Thanks,
> Trigger.
>
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Koenig, Christian
> Sent: Monday, December 17, 2018 3:50 PM
> To: Huang, Trigger <Trigger.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim 
> <Jim.Qu@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for debug
>
> Hi Trigger,
>
>> Does this make sense?
> Sorry I wasn't clear enough. The key point that we don't want/need the full commandline of the process here is that we can already get that from the information we have.
>
> E.g. "ps -p 1 -o args" gives you the command line of the process 1.
>
> The only case where directly printing this into the logs is useful is when we run into a total system crash and in this case the processed is only the trigger, but not the root cause.
>
> Regards,
> Christian.
>
> Am 17.12.18 um 04:03 schrieb Huang, Trigger:
>> Hi Christian,
>>
>> Many thanks for pointing out the mistakes
>>
>> I have some comments as below, would you help to check again?
>>
>> First of all you can't get the process info during VM creating since that can happen in X as well.
>> [Trigger]: Ok, I will keep the original logic, which is that set the vm info in cs . I will still invoke kfree(cmd_line) in amdgpu_vm_fini to avoid memory leak.
>>
>> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
>> [Trigger]: Ok, I will do it in job timeout like what VMC page fault's 
>> handler does
>>
>> And last we don't want to keep the full command line around.
>> [Trigger]: well, actually, the detailed command line is just what we want.
>> For example, there are thousands of sub-cases of one big test case, and for each sub-case, the arguments may also be different.
>> In some corner case,  test machine is hung after running the big test case for several hours even several days, it is really painful to wait another several hours to reproduce it and debug.
>> But if we know the last sub-case running on the test machine, then 
>> this issue *may* be reproduced by only running the specific sub-case with specific arguments for several rounds, and in this situation, it will save us a lot of time for reproducing and debugging.
>> Does this make sense?  If not, how about we add a parameter, such as amdgpu_vm_debug_verbose, to turn on/off the cmd line dumping?
>>
>>
>> Thanks,
>> Trigger
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Saturday, December 15, 2018 8:23 PM
>> To: Huang, Trigger <Trigger.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Qu, Jim 
>> <Jim.Qu@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: Add more process info in VM for 
>> debug
>>
>> Am 15.12.18 um 09:56 schrieb Trigger Huang:
>>> When debugging VMC page fault and ring hang issues, the detailed 
>>> process information is quite helpful, especially when the issue can 
>>> only be reproduced after a very long time running. With this 
>>> information, only run the specific sub-testcase may also will 
>>> reproduce the issue, which may save a lot of time for debugging.
>>>
>>> With this patch, the process information is similar as following.
>>> 	When VMC page fault issue happened:
>>> [  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault 
>>> (src_id:0
>>> ring:171 vmid:2 pasid:32769), [  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
>>> [  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27
>>>
>>> 	When ring hang issue happened:
>>> [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>> comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572 [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
>>> [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>> SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode 
>>> goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 
>>> --frontend test_executor --n-test-threads 4
>>>
>>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
>> Well NAK, we intentionally didn't do it this way.
>>
>> First of all you can't get the process info during VM creating since that can happen in X as well.
>>
>> Second when a timeout happen the VM structure might already be released, so using job->vm is illegal here. What we could try is to get the VM using the PASID.
>>
>> And last we don't want to keep the full command line around.
>>
>> The only valid addition I can see here is to print the thread info when the timeout happens.
>>
>> Regards,
>> Christian.
>>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
>>>     5 files changed, 26 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 1c49b82..1a2d0c9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>     		p->job->uf_addr = uf_offset;
>>>     	kfree(chunk_array);
>>>     
>>> -	/* Use this opportunity to fill in task info for the vm */
>>> -	amdgpu_vm_set_task_info(vm);
>>> -
>>>     	return 0;
>>>     
>>>     free_all_kdata:
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index e0af44f..c75ecb3 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>     		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>>     		  ring->fence_drv.sync_seq);
>>>     
>>> +	if (job->vm) {
>>> +		struct amdgpu_task_info *ti = &(job->vm->task_info);
>>> +
>>> +		DRM_ERROR("The job's process information is as below:\n");
>>> +		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
>>> +			ti->process_name, ti->task_name, ti->cmd_line);
>>> +	}
>>> +
>>>     	if (amdgpu_device_should_recover_gpu(ring->adev))
>>>     		amdgpu_device_gpu_recover(ring->adev, job);
>>>     }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index e73d152..24f3cbd 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -30,6 +30,7 @@
>>>     #include <linux/idr.h>
>>>     #include <drm/drmP.h>
>>>     #include <drm/amdgpu_drm.h>
>>> +#include <linux/string_helpers.h>
>>>     #include "amdgpu.h"
>>>     #include "amdgpu_trace.h"
>>>     #include "amdgpu_amdkfd.h"
>>> @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>>     			goto error_free_root;
>>>     
>>>     		vm->pasid = pasid;
>>> +
>>> +		amdgpu_vm_set_task_info(vm);
>>>     	}
>>>     
>>>     	vm->fault_hash = init_fault_hash(); @@ -3223,6 +3226,9 @@ void 
>>> amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>>     		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
>>>     		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
>>>     		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>> +
>>> +		kfree(vm->task_info.cmd_line);
>>> +		vm->task_info.cmd_line = NULL;
>>>     	}
>>>     
>>>     	kfree(vm->fault_hash);
>>> @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>     			vm->task_info.tgid = current->group_leader->pid;
>>>     			get_task_comm(vm->task_info.process_name, current->group_leader);
>>>     		}
>>> +
>>> +		vm->task_info.cmd_line =
>>> +				kstrdup_quotable_cmdline(current, GFP_KERNEL);
>>> +		if (!vm->task_info.cmd_line)
>>> +			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
>>>     	}
>>>     }
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> index e8dcfd5..9fab787 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
>>>     struct amdgpu_task_info {
>>>     	char	process_name[TASK_COMM_LEN];
>>>     	char	task_name[TASK_COMM_LEN];
>>> +	char    *cmd_line;
>>>     	pid_t	pid;
>>>     	pid_t	tgid;
>>>     };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index bacdaef..c3e3558 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>>>     		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>     
>>>     		dev_err(adev->dev,
>>> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
>>> -			entry->vmid_src ? "mmhub" : "gfxhub",
>>> -			entry->src_id, entry->ring_id, entry->vmid,
>>> -			entry->pasid, task_info.process_name, task_info.tgid,
>>> -			task_info.task_name, task_info.pid);
>>> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
>>> +			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
>>> +			entry->ring_id, entry->vmid, entry->pasid);
>>> +		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
>>> +			task_info.process_name, task_info.tgid,
>>> +			task_info.task_name, task_info.pid, task_info.cmd_line);
>>>     		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
>>>     			addr, entry->client_id);
>>>     		if (!amdgpu_sriov_vf(adev))
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-12-18  1:46 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-15  8:56 [PATCH] drm/amdgpu: Add more process info in VM for debug Trigger Huang
     [not found] ` <1544864213-2352-1-git-send-email-Trigger.Huang-5C7GfCeVMHo@public.gmane.org>
2018-12-15 12:22   ` Christian König
     [not found]     ` <3411fb4a-119f-8531-4597-7eb185a63c59-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-12-17  3:03       ` Huang, Trigger
     [not found]         ` <BY2PR12MB0545A2EF343926B82173F6C9FEBC0-K//h7OWB4q7Qm/t26ohGYQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-12-17  7:50           ` Koenig, Christian
     [not found]             ` <36a14372-fd14-975b-c06e-1c1081f3c586-5C7GfCeVMHo@public.gmane.org>
2018-12-17  8:27               ` Huang, Trigger
     [not found]                 ` <BY2PR12MB05459446C83A07595D4C42E6FEBC0-K//h7OWB4q7Qm/t26ohGYQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-12-17  9:02                   ` Liu, Monk
     [not found]                     ` <CY4PR1201MB02452810FED0CD6283C9B18584BC0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2018-12-17 13:38                       ` Koenig, Christian
     [not found]                         ` <6bf51fe6-76d1-73af-8e64-e58802519b3d-5C7GfCeVMHo@public.gmane.org>
2018-12-18  1:46                           ` Huang, Trigger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.