[PATCH v3 1/2] drm/amdgpu: Add support for logging process info in amdgpu

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3 1/2] drm/amdgpu: Add support for logging process info in amdgpu_vm.
@ 2018-07-05 19:26 Andrey Grodzovsky
       [not found] ` <1530818820-28631-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Andrey Grodzovsky @ 2018-07-05 19:26 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Andrey Grodzovsky, zhoucm1-5C7GfCeVMHo, christian.koenig-5C7GfCeVMHo

Add process and thread names and pids and a function to extract
this info from relevant amdgpu_vm.

v2: Add documentation and fix identation.

v3: Add getter and setter functions for amdgpu_task_info.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 39 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 16 ++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 712af5c..d18f247 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2942,3 +2942,42 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 
 	return 0;
 }
+
+/**
+ * amdgpu_vm_get_task_info - Extracts task info for a PASID.
+ *
+ * @dev: drm device pointer
+ * @pasid: PASID identifier for VM
+ * @task_info: task_info to fill.
+ */
+void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
+			 struct amdgpu_task_info *task_info)
+{
+	struct amdgpu_vm *vm;
+
+	spin_lock(&adev->vm_manager.pasid_lock);
+
+	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
+	if (vm)
+		*task_info = vm->task_info;
+
+	spin_unlock(&adev->vm_manager.pasid_lock);
+}
+
+/**
+ * amdgpu_vm_set_task_info - Sets VMs task info.
+ *
+ * @vm: vm for which to set the info
+ */
+void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
+{
+	if (!vm->task_info.pid) {
+		vm->task_info.pid = current->pid;
+		get_task_comm(vm->task_info.task_name, current);
+
+		if (current->group_leader->mm == current->mm) {
+			vm->task_info.tgid = current->group_leader->pid;
+			get_task_comm(vm->task_info.process_name, current->group_leader);
+		}
+	}
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 061b99a..d416f89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -164,6 +164,14 @@ struct amdgpu_vm_pt {
 #define AMDGPU_VM_FAULT_PASID(fault) ((u64)(fault) >> 48)
 #define AMDGPU_VM_FAULT_ADDR(fault)  ((u64)(fault) & 0xfffffffff000ULL)
 
+
+struct amdgpu_task_info {
+	char	process_name[TASK_COMM_LEN];
+	char	task_name[TASK_COMM_LEN];
+	pid_t	pid;
+	pid_t	tgid;
+};
+
 struct amdgpu_vm {
 	/* tree of virtual addresses mapped */
 	struct rb_root_cached	va;
@@ -215,6 +223,9 @@ struct amdgpu_vm {
 
 	/* Valid while the PD is reserved or fenced */
 	uint64_t		pd_phys_addr;
+
+	/* Some basic info about the task */
+	struct amdgpu_task_info task_info;
 };
 
 struct amdgpu_vm_manager {
@@ -317,4 +328,9 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
 				  struct amdgpu_job *job);
 void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 
+void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
+			 struct amdgpu_task_info *task_info);
+
+void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
+
 #endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found] ` <1530818820-28631-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
@ 2018-07-05 19:27   ` Andrey Grodzovsky
       [not found]     ` <1530818820-28631-2-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Andrey Grodzovsky @ 2018-07-05 19:27 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Andrey Grodzovsky, zhoucm1-5C7GfCeVMHo, christian.koenig-5C7GfCeVMHo

Extract and present the reposnsible process and thread when
VM_FAULT happens.

v2: Use getter and setter functions.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 7a625f3..609c8f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
 	if (p->uf_entry.robj)
 		p->job->uf_addr = uf_offset;
 	kfree(chunk_array);
+
+	/* Use this opportunity to fill in task info for the vm */
+	amdgpu_vm_set_task_info(vm);
+
 	return 0;
 
 free_all_kdata:
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 08753e7..75f3ffb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -46,7 +46,6 @@
 
 #include "ivsrcid/ivsrcid_vislands30.h"
 
-
 static void gmc_v8_0_set_gmc_funcs(struct amdgpu_device *adev);
 static void gmc_v8_0_set_irq_funcs(struct amdgpu_device *adev);
 static int gmc_v8_0_wait_for_idle(void *handle);
@@ -1449,8 +1448,13 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 		gmc_v8_0_set_fault_enable_default(adev, false);
 
 	if (printk_ratelimit()) {
-		dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
-			entry->src_id, entry->src_data[0]);
+		struct amdgpu_task_info task_info = { 0 };
+
+		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+
+		dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
+			entry->src_id, entry->src_data[0], task_info.process_name,
+			task_info.tgid, task_info.task_name, task_info.pid);
 		dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x%08X\n",
 			addr);
 		dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 691a659..9df94b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -259,11 +259,16 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 	}
 
 	if (printk_ratelimit()) {
+		struct amdgpu_task_info task_info = { 0 };
+
+		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+
 		dev_err(adev->dev,
-			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
+			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d\n)\n",
 			entry->vmid_src ? "mmhub" : "gfxhub",
 			entry->src_id, entry->ring_id, entry->vmid,
-			entry->pasid);
+			entry->pasid, task_info.process_name, task_info.tgid,
+			task_info.task_name, task_info.pid);
 		dev_err(adev->dev, "  at page 0x%016llx from %d\n",
 			addr, entry->client_id);
 		if (!amdgpu_sriov_vf(adev))
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* 答复: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found]     ` <1530818820-28631-2-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
@ 2018-07-06  3:09       ` Qu, Jim
  2018-07-09  5:13       ` Zhang, Jerry (Junwei)
  1 sibling, 0 replies; 10+ messages in thread
From: Qu, Jim @ 2018-07-06  3:09 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Zhou, David(ChunMing), Koenig, Christian

Look good to me. there is a small typo error about title admgpu->amdgpu

Acked-by: Jim Qu <Jim.Qu@amd.com>

Thanks
JimQu

________________________________________
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Andrey Grodzovsky <andrey.grodzovsky@amd.com>
发送时间: 2018年7月6日 3:27:00
收件人: amd-gfx@lists.freedesktop.org
抄送: Grodzovsky, Andrey; Zhou, David(ChunMing); Koenig, Christian
主题: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.

Extract and present the reposnsible process and thread when
VM_FAULT happens.

v2: Use getter and setter functions.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 7a625f3..609c8f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
        if (p->uf_entry.robj)
                p->job->uf_addr = uf_offset;
        kfree(chunk_array);
+
+       /* Use this opportunity to fill in task info for the vm */
+       amdgpu_vm_set_task_info(vm);
+
        return 0;

 free_all_kdata:
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 08753e7..75f3ffb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -46,7 +46,6 @@

 #include "ivsrcid/ivsrcid_vislands30.h"

-
 static void gmc_v8_0_set_gmc_funcs(struct amdgpu_device *adev);
 static void gmc_v8_0_set_irq_funcs(struct amdgpu_device *adev);
 static int gmc_v8_0_wait_for_idle(void *handle);
@@ -1449,8 +1448,13 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
                gmc_v8_0_set_fault_enable_default(adev, false);

        if (printk_ratelimit()) {
-               dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
-                       entry->src_id, entry->src_data[0]);
+               struct amdgpu_task_info task_info = { 0 };
+
+               amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+
+               dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
+                       entry->src_id, entry->src_data[0], task_info.process_name,
+                       task_info.tgid, task_info.task_name, task_info.pid);
                dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x%08X\n",
                        addr);
                dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 691a659..9df94b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -259,11 +259,16 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
        }

        if (printk_ratelimit()) {
+               struct amdgpu_task_info task_info = { 0 };
+
+               amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+
                dev_err(adev->dev,
-                       "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
+                       "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d\n)\n",
                        entry->vmid_src ? "mmhub" : "gfxhub",
                        entry->src_id, entry->ring_id, entry->vmid,
-                       entry->pasid);
+                       entry->pasid, task_info.process_name, task_info.tgid,
+                       task_info.task_name, task_info.pid);
                dev_err(adev->dev, "  at page 0x%016llx from %d\n",
                        addr, entry->client_id);
                if (!amdgpu_sriov_vf(adev))
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found]     ` <1530818820-28631-2-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2018-07-06  3:09       ` 答复: " Qu, Jim
@ 2018-07-09  5:13       ` Zhang, Jerry (Junwei)
       [not found]         ` <5B42EEEE.30305-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 10+ messages in thread
From: Zhang, Jerry (Junwei) @ 2018-07-09  5:13 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: zhoucm1-5C7GfCeVMHo, christian.koenig-5C7GfCeVMHo

On 07/06/2018 03:27 AM, Andrey Grodzovsky wrote:
> Extract and present the reposnsible process and thread when
> VM_FAULT happens.
>
> v2: Use getter and setter functions.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
>   3 files changed, 18 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 7a625f3..609c8f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>   	if (p->uf_entry.robj)
>   		p->job->uf_addr = uf_offset;
>   	kfree(chunk_array);
> +
> +	/* Use this opportunity to fill in task info for the vm */
> +	amdgpu_vm_set_task_info(vm);
> +

Shall we set the task info when vm init?


>   	return 0;
>
>   free_all_kdata:
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index 08753e7..75f3ffb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -46,7 +46,6 @@
>
>   #include "ivsrcid/ivsrcid_vislands30.h"
>
> -
>   static void gmc_v8_0_set_gmc_funcs(struct amdgpu_device *adev);
>   static void gmc_v8_0_set_irq_funcs(struct amdgpu_device *adev);
>   static int gmc_v8_0_wait_for_idle(void *handle);
> @@ -1449,8 +1448,13 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>   		gmc_v8_0_set_fault_enable_default(adev, false);
>
>   	if (printk_ratelimit()) {
> -		dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
> -			entry->src_id, entry->src_data[0]);
> +		struct amdgpu_task_info task_info = { 0 };
> +
> +		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);

Shall we find vm and get the vm->task_info directly instead of filling local variable task_info?
(current style is also OK for me, just for more info)

And we may also check the return value for "NULL" case, otherwise it may cause access errorin dev_err(),
if failed to find vm (although, it's most unlikely to happen).

Jerry

> +
> +		dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
> +			entry->src_id, entry->src_data[0], task_info.process_name,
> +			task_info.tgid, task_info.task_name, task_info.pid);
>   		dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x%08X\n",
>   			addr);
>   		dev_err(adev->dev, "  VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 691a659..9df94b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -259,11 +259,16 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   	}
>
>   	if (printk_ratelimit()) {
> +		struct amdgpu_task_info task_info = { 0 };
> +
> +		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> +
>   		dev_err(adev->dev,
> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d\n)\n",
>   			entry->vmid_src ? "mmhub" : "gfxhub",
>   			entry->src_id, entry->ring_id, entry->vmid,
> -			entry->pasid);
> +			entry->pasid, task_info.process_name, task_info.tgid,
> +			task_info.task_name, task_info.pid);
>   		dev_err(adev->dev, "  at page 0x%016llx from %d\n",
>   			addr, entry->client_id);
>   		if (!amdgpu_sriov_vf(adev))
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found]         ` <5B42EEEE.30305-5C7GfCeVMHo@public.gmane.org>
@ 2018-07-09  7:04           ` Christian König
       [not found]             ` <d87f8526-e528-2c00-bfa7-e433c490c033-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2018-07-09  7:04 UTC (permalink / raw)
  To: Zhang, Jerry (Junwei),
	Andrey Grodzovsky, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: zhoucm1-5C7GfCeVMHo

Am 09.07.2018 um 07:13 schrieb Zhang, Jerry (Junwei):
> On 07/06/2018 03:27 AM, Andrey Grodzovsky wrote:
>> Extract and present the reposnsible process and thread when
>> VM_FAULT happens.
>>
>> v2: Use getter and setter functions.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
>>   3 files changed, 18 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 7a625f3..609c8f5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct 
>> amdgpu_cs_parser *p, void *data)
>>       if (p->uf_entry.robj)
>>           p->job->uf_addr = uf_offset;
>>       kfree(chunk_array);
>> +
>> +    /* Use this opportunity to fill in task info for the vm */
>> +    amdgpu_vm_set_task_info(vm);
>> +
>
> Shall we set the task info when vm init?

No, vm_init() is called from a completely different process which is 
later on user of the VM.

>
>
>>       return 0;
>>
>>   free_all_kdata:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> index 08753e7..75f3ffb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> @@ -46,7 +46,6 @@
>>
>>   #include "ivsrcid/ivsrcid_vislands30.h"
>>
>> -
>>   static void gmc_v8_0_set_gmc_funcs(struct amdgpu_device *adev);
>>   static void gmc_v8_0_set_irq_funcs(struct amdgpu_device *adev);
>>   static int gmc_v8_0_wait_for_idle(void *handle);
>> @@ -1449,8 +1448,13 @@ static int gmc_v8_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>           gmc_v8_0_set_fault_enable_default(adev, false);
>>
>>       if (printk_ratelimit()) {
>> -        dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
>> -            entry->src_id, entry->src_data[0]);
>> +        struct amdgpu_task_info task_info = { 0 };
>> +
>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>
> Shall we find vm and get the vm->task_info directly instead of filling 
> local variable task_info?
> (current style is also OK for me, just for more info)

No, we can't guarantee that the task_info pointer in the VM won't be 
freed after dropping the spinlock. So returning a copy here is the 
better approach.

>
> And we may also check the return value for "NULL" case, otherwise it 
> may cause access errorin dev_err(),
> if failed to find vm (although, it's most unlikely to happen).

Since we use a copy of the task info we should never get a NULL pointer. 
The string should already be zero terminated with the "{ 0 }" 
initialization above.

Christian.

>
> Jerry
>
>> +
>> +        dev_err(adev->dev, "GPU fault detected: %d 0x%08x for 
>> process %s pid %d thread %s pid %d\n",
>> +            entry->src_id, entry->src_data[0], task_info.process_name,
>> +            task_info.tgid, task_info.task_name, task_info.pid);
>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR   
>> 0x%08X\n",
>>               addr);
>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 
>> 0x%08X\n",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index 691a659..9df94b4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -259,11 +259,16 @@ static int gmc_v9_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>       }
>>
>>       if (printk_ratelimit()) {
>> +        struct amdgpu_task_info task_info = { 0 };
>> +
>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> +
>>           dev_err(adev->dev,
>> -            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
>> pasid:%u)\n",
>> +            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
>> pasid:%u, for process %s pid %d thread %s pid %d\n)\n",
>>               entry->vmid_src ? "mmhub" : "gfxhub",
>>               entry->src_id, entry->ring_id, entry->vmid,
>> -            entry->pasid);
>> +            entry->pasid, task_info.process_name, task_info.tgid,
>> +            task_info.task_name, task_info.pid);
>>           dev_err(adev->dev, "  at page 0x%016llx from %d\n",
>>               addr, entry->client_id);
>>           if (!amdgpu_sriov_vf(adev))
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found]             ` <d87f8526-e528-2c00-bfa7-e433c490c033-5C7GfCeVMHo@public.gmane.org>
@ 2018-07-09  7:48               ` Zhang, Jerry (Junwei)
       [not found]                 ` <5B43133A.2020300-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Zhang, Jerry (Junwei) @ 2018-07-09  7:48 UTC (permalink / raw)
  To: Christian König, Andrey Grodzovsky,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: zhoucm1-5C7GfCeVMHo

On 07/09/2018 03:04 PM, Christian König wrote:
> Am 09.07.2018 um 07:13 schrieb Zhang, Jerry (Junwei):
>> On 07/06/2018 03:27 AM, Andrey Grodzovsky wrote:
>>> Extract and present the reposnsible process and thread when
>>> VM_FAULT happens.
>>>
>>> v2: Use getter and setter functions.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
>>>   3 files changed, 18 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 7a625f3..609c8f5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>>>       if (p->uf_entry.robj)
>>>           p->job->uf_addr = uf_offset;
>>>       kfree(chunk_array);
>>> +
>>> +    /* Use this opportunity to fill in task info for the vm */
>>> +    amdgpu_vm_set_task_info(vm);
>>> +
>>
>> Shall we set the task info when vm init?
>
> No, vm_init() is called from a completely different process which is later on user of the VM.

Originally I thought UMD opened DRI node and create a VM by vm_init(), then every command submission
would be passed in the same VM initialized by vm_init().

So that's different process?


>
>>
>>
>>>       return 0;
>>>
>>>   free_all_kdata:
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> index 08753e7..75f3ffb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> @@ -46,7 +46,6 @@
>>>
>>>   #include "ivsrcid/ivsrcid_vislands30.h"
>>>
>>> -
>>>   static void gmc_v8_0_set_gmc_funcs(struct amdgpu_device *adev);
>>>   static void gmc_v8_0_set_irq_funcs(struct amdgpu_device *adev);
>>>   static int gmc_v8_0_wait_for_idle(void *handle);
>>> @@ -1449,8 +1448,13 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>>>           gmc_v8_0_set_fault_enable_default(adev, false);
>>>
>>>       if (printk_ratelimit()) {
>>> -        dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
>>> -            entry->src_id, entry->src_data[0]);
>>> +        struct amdgpu_task_info task_info = { 0 };
>>> +
>>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>
>> Shall we find vm and get the vm->task_info directly instead of filling local variable task_info?
>> (current style is also OK for me, just for more info)
>
> No, we can't guarantee that the task_info pointer in the VM won't be freed after dropping the spinlock. So returning a copy here is the better approach.
>
>>
>> And we may also check the return value for "NULL" case, otherwise it may cause access errorin dev_err(),
>> if failed to find vm (although, it's most unlikely to happen).
>
> Since we use a copy of the task info we should never get a NULL pointer. The string should already be zero terminated with the "{ 0 }" initialization above.

Thanks to explain that more.
Got it, fine for me.

Jerry

>
> Christian.
>
>>
>> Jerry
>>
>>> +
>>> +        dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
>>> +            entry->src_id, entry->src_data[0], task_info.process_name,
>>> +            task_info.tgid, task_info.task_name, task_info.pid);
>>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
>>>               addr);
>>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 691a659..9df94b4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -259,11 +259,16 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>>>       }
>>>
>>>       if (printk_ratelimit()) {
>>> +        struct amdgpu_task_info task_info = { 0 };
>>> +
>>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>> +
>>>           dev_err(adev->dev,
>>> -            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>>> +            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d\n)\n",
>>>               entry->vmid_src ? "mmhub" : "gfxhub",
>>>               entry->src_id, entry->ring_id, entry->vmid,
>>> -            entry->pasid);
>>> +            entry->pasid, task_info.process_name, task_info.tgid,
>>> +            task_info.task_name, task_info.pid);
>>>           dev_err(adev->dev, "  at page 0x%016llx from %d\n",
>>>               addr, entry->client_id);
>>>           if (!amdgpu_sriov_vf(adev))
>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found]                 ` <5B43133A.2020300-5C7GfCeVMHo@public.gmane.org>
@ 2018-07-09  8:55                   ` Christian König
       [not found]                     ` <6a051779-2711-adba-aa88-e10d3ff20b6f-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2018-07-09  8:55 UTC (permalink / raw)
  To: Zhang, Jerry (Junwei),
	Andrey Grodzovsky, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: zhoucm1-5C7GfCeVMHo

Am 09.07.2018 um 09:48 schrieb Zhang, Jerry (Junwei):
> On 07/09/2018 03:04 PM, Christian König wrote:
>> Am 09.07.2018 um 07:13 schrieb Zhang, Jerry (Junwei):
>>> On 07/06/2018 03:27 AM, Andrey Grodzovsky wrote:
>>>> Extract and present the reposnsible process and thread when
>>>> VM_FAULT happens.
>>>>
>>>> v2: Use getter and setter functions.
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
>>>>   3 files changed, 18 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> index 7a625f3..609c8f5 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> @@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct 
>>>> amdgpu_cs_parser *p, void *data)
>>>>       if (p->uf_entry.robj)
>>>>           p->job->uf_addr = uf_offset;
>>>>       kfree(chunk_array);
>>>> +
>>>> +    /* Use this opportunity to fill in task info for the vm */
>>>> +    amdgpu_vm_set_task_info(vm);
>>>> +
>>>
>>> Shall we set the task info when vm init?
>>
>> No, vm_init() is called from a completely different process which is 
>> later on user of the VM.
>
> Originally I thought UMD opened DRI node and create a VM by vm_init(), 
> then every command submission
> would be passed in the same VM initialized by vm_init().
>
> So that's different process?

The display server, e.g. X or Wayland.

See with DRI3 the process of opening a connection to the hardware is 
that the display server open the file descriptor and with that calls 
vm_init.

And then this file descriptor is passed to the client processes through IPC.

Regards,
Christian.

>
>
>>
>>>
>>>
>>>>       return 0;
>>>>
>>>>   free_all_kdata:
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> index 08753e7..75f3ffb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> @@ -46,7 +46,6 @@
>>>>
>>>>   #include "ivsrcid/ivsrcid_vislands30.h"
>>>>
>>>> -
>>>>   static void gmc_v8_0_set_gmc_funcs(struct amdgpu_device *adev);
>>>>   static void gmc_v8_0_set_irq_funcs(struct amdgpu_device *adev);
>>>>   static int gmc_v8_0_wait_for_idle(void *handle);
>>>> @@ -1449,8 +1448,13 @@ static int gmc_v8_0_process_interrupt(struct 
>>>> amdgpu_device *adev,
>>>>           gmc_v8_0_set_fault_enable_default(adev, false);
>>>>
>>>>       if (printk_ratelimit()) {
>>>> -        dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
>>>> -            entry->src_id, entry->src_data[0]);
>>>> +        struct amdgpu_task_info task_info = { 0 };
>>>> +
>>>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>
>>> Shall we find vm and get the vm->task_info directly instead of 
>>> filling local variable task_info?
>>> (current style is also OK for me, just for more info)
>>
>> No, we can't guarantee that the task_info pointer in the VM won't be 
>> freed after dropping the spinlock. So returning a copy here is the 
>> better approach.
>>
>>>
>>> And we may also check the return value for "NULL" case, otherwise it 
>>> may cause access errorin dev_err(),
>>> if failed to find vm (although, it's most unlikely to happen).
>>
>> Since we use a copy of the task info we should never get a NULL 
>> pointer. The string should already be zero terminated with the "{ 0 
>> }" initialization above.
>
> Thanks to explain that more.
> Got it, fine for me.
>
> Jerry
>
>>
>> Christian.
>>
>>>
>>> Jerry
>>>
>>>> +
>>>> +        dev_err(adev->dev, "GPU fault detected: %d 0x%08x for 
>>>> process %s pid %d thread %s pid %d\n",
>>>> +            entry->src_id, entry->src_data[0], 
>>>> task_info.process_name,
>>>> +            task_info.tgid, task_info.task_name, task_info.pid);
>>>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 
>>>> 0x%08X\n",
>>>>               addr);
>>>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 
>>>> 0x%08X\n",
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index 691a659..9df94b4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -259,11 +259,16 @@ static int gmc_v9_0_process_interrupt(struct 
>>>> amdgpu_device *adev,
>>>>       }
>>>>
>>>>       if (printk_ratelimit()) {
>>>> +        struct amdgpu_task_info task_info = { 0 };
>>>> +
>>>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>> +
>>>>           dev_err(adev->dev,
>>>> -            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
>>>> pasid:%u)\n",
>>>> +            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
>>>> pasid:%u, for process %s pid %d thread %s pid %d\n)\n",
>>>>               entry->vmid_src ? "mmhub" : "gfxhub",
>>>>               entry->src_id, entry->ring_id, entry->vmid,
>>>> -            entry->pasid);
>>>> +            entry->pasid, task_info.process_name, task_info.tgid,
>>>> +            task_info.task_name, task_info.pid);
>>>>           dev_err(adev->dev, "  at page 0x%016llx from %d\n",
>>>>               addr, entry->client_id);
>>>>           if (!amdgpu_sriov_vf(adev))
>>>>
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found]                     ` <6a051779-2711-adba-aa88-e10d3ff20b6f-5C7GfCeVMHo@public.gmane.org>
@ 2018-07-09 10:14                       ` Zhang, Jerry (Junwei)
       [not found]                         ` <5B433598.8000308-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Zhang, Jerry (Junwei) @ 2018-07-09 10:14 UTC (permalink / raw)
  To: Christian König, Andrey Grodzovsky,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: zhoucm1-5C7GfCeVMHo

On 07/09/2018 04:55 PM, Christian König wrote:
> Am 09.07.2018 um 09:48 schrieb Zhang, Jerry (Junwei):
>> On 07/09/2018 03:04 PM, Christian König wrote:
>>> Am 09.07.2018 um 07:13 schrieb Zhang, Jerry (Junwei):
>>>> On 07/06/2018 03:27 AM, Andrey Grodzovsky wrote:
>>>>> Extract and present the reposnsible process and thread when
>>>>> VM_FAULT happens.
>>>>>
>>>>> v2: Use getter and setter functions.
>>>>>
>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
>>>>>   3 files changed, 18 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> index 7a625f3..609c8f5 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> @@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>>>>>       if (p->uf_entry.robj)
>>>>>           p->job->uf_addr = uf_offset;
>>>>>       kfree(chunk_array);
>>>>> +
>>>>> +    /* Use this opportunity to fill in task info for the vm */
>>>>> +    amdgpu_vm_set_task_info(vm);
>>>>> +
>>>>
>>>> Shall we set the task info when vm init?
>>>
>>> No, vm_init() is called from a completely different process which is later on user of the VM.
>>
>> Originally I thought UMD opened DRI node and create a VM by vm_init(), then every command submission
>> would be passed in the same VM initialized by vm_init().
>>
>> So that's different process?
>
> The display server, e.g. X or Wayland.
>
> See with DRI3 the process of opening a connection to the hardware is that the display server open the file descriptor and with that calls vm_init.
>
> And then this file descriptor is passed to the client processes through IPC.

Thanks to reply.
yes, it's likely to open amdgpu node in ddx when driver probe and pass it to other client.

While that looks like just in the process of initialization that X server loads ddx driver.
But when I start 2 glxgears, that kms_open()->vm_init() will be called twice, which looks related to App as well.
(anyway, I will check it more)

Even so, it sounds vm_init() should be created firstly, then we use that VM for process on every command submission.
So I thought to set the task info at the first time vm_init() and use that info on VM fault process func.

Jerry

>
> Regards,
> Christian.
>
>>
>>
>>>
>>>>
>>>>
>>>>>       return 0;
>>>>>
>>>>>   free_all_kdata:
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> index 08753e7..75f3ffb 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> @@ -46,7 +46,6 @@
>>>>>
>>>>>   #include "ivsrcid/ivsrcid_vislands30.h"
>>>>>
>>>>> -
>>>>>   static void gmc_v8_0_set_gmc_funcs(struct amdgpu_device *adev);
>>>>>   static void gmc_v8_0_set_irq_funcs(struct amdgpu_device *adev);
>>>>>   static int gmc_v8_0_wait_for_idle(void *handle);
>>>>> @@ -1449,8 +1448,13 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>>>>>           gmc_v8_0_set_fault_enable_default(adev, false);
>>>>>
>>>>>       if (printk_ratelimit()) {
>>>>> -        dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
>>>>> -            entry->src_id, entry->src_data[0]);
>>>>> +        struct amdgpu_task_info task_info = { 0 };
>>>>> +
>>>>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>>
>>>> Shall we find vm and get the vm->task_info directly instead of filling local variable task_info?
>>>> (current style is also OK for me, just for more info)
>>>
>>> No, we can't guarantee that the task_info pointer in the VM won't be freed after dropping the spinlock. So returning a copy here is the better approach.
>>>
>>>>
>>>> And we may also check the return value for "NULL" case, otherwise it may cause access errorin dev_err(),
>>>> if failed to find vm (although, it's most unlikely to happen).
>>>
>>> Since we use a copy of the task info we should never get a NULL pointer. The string should already be zero terminated with the "{ 0 }" initialization above.
>>
>> Thanks to explain that more.
>> Got it, fine for me.
>>
>> Jerry
>>
>>>
>>> Christian.
>>>
>>>>
>>>> Jerry
>>>>
>>>>> +
>>>>> +        dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
>>>>> +            entry->src_id, entry->src_data[0], task_info.process_name,
>>>>> +            task_info.tgid, task_info.task_name, task_info.pid);
>>>>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
>>>>>               addr);
>>>>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index 691a659..9df94b4 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -259,11 +259,16 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>>>>>       }
>>>>>
>>>>>       if (printk_ratelimit()) {
>>>>> +        struct amdgpu_task_info task_info = { 0 };
>>>>> +
>>>>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>>> +
>>>>>           dev_err(adev->dev,
>>>>> -            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>>>>> +            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d\n)\n",
>>>>>               entry->vmid_src ? "mmhub" : "gfxhub",
>>>>>               entry->src_id, entry->ring_id, entry->vmid,
>>>>> -            entry->pasid);
>>>>> +            entry->pasid, task_info.process_name, task_info.tgid,
>>>>> +            task_info.task_name, task_info.pid);
>>>>>           dev_err(adev->dev, "  at page 0x%016llx from %d\n",
>>>>>               addr, entry->client_id);
>>>>>           if (!amdgpu_sriov_vf(adev))
>>>>>
>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found]                         ` <5B433598.8000308-5C7GfCeVMHo@public.gmane.org>
@ 2018-07-09 11:29                           ` Christian König
       [not found]                             ` <3f49bd26-26e6-459e-ac76-284fae1efdbb-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2018-07-09 11:29 UTC (permalink / raw)
  To: Zhang, Jerry (Junwei),
	Andrey Grodzovsky, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: zhoucm1-5C7GfCeVMHo

Am 09.07.2018 um 12:14 schrieb Zhang, Jerry (Junwei):
> On 07/09/2018 04:55 PM, Christian König wrote:
>> Am 09.07.2018 um 09:48 schrieb Zhang, Jerry (Junwei):
>>> On 07/09/2018 03:04 PM, Christian König wrote:
>>>> Am 09.07.2018 um 07:13 schrieb Zhang, Jerry (Junwei):
>>>>> On 07/06/2018 03:27 AM, Andrey Grodzovsky wrote:
>>>>>> Extract and present the reposnsible process and thread when
>>>>>> VM_FAULT happens.
>>>>>>
>>>>>> v2: Use getter and setter functions.
>>>>>>
>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
>>>>>>   3 files changed, 18 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>> index 7a625f3..609c8f5 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>> @@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct 
>>>>>> amdgpu_cs_parser *p, void *data)
>>>>>>       if (p->uf_entry.robj)
>>>>>>           p->job->uf_addr = uf_offset;
>>>>>>       kfree(chunk_array);
>>>>>> +
>>>>>> +    /* Use this opportunity to fill in task info for the vm */
>>>>>> +    amdgpu_vm_set_task_info(vm);
>>>>>> +
>>>>>
>>>>> Shall we set the task info when vm init?
>>>>
>>>> No, vm_init() is called from a completely different process which 
>>>> is later on user of the VM.
>>>
>>> Originally I thought UMD opened DRI node and create a VM by 
>>> vm_init(), then every command submission
>>> would be passed in the same VM initialized by vm_init().
>>>
>>> So that's different process?
>>
>> The display server, e.g. X or Wayland.
>>
>> See with DRI3 the process of opening a connection to the hardware is 
>> that the display server open the file descriptor and with that calls 
>> vm_init.
>>
>> And then this file descriptor is passed to the client processes 
>> through IPC.
>
> Thanks to reply.
> yes, it's likely to open amdgpu node in ddx when driver probe and pass 
> it to other client.
>
> While that looks like just in the process of initialization that X 
> server loads ddx driver.

No, that happens with each client.

> But when I start 2 glxgears, that kms_open()->vm_init() will be called 
> twice, which looks related to App as well.
> (anyway, I will check it more)
>
> Even so, it sounds vm_init() should be created firstly, then we use 
> that VM for process on every command submission.
> So I thought to set the task info at the first time vm_init() and use 
> that info on VM fault process func.

No, that would certainly be not correct.

As I explained vm_init() is not necessarily called by the process which 
is then going to use the VM.

Christian.

>
> Jerry

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS.
       [not found]                             ` <3f49bd26-26e6-459e-ac76-284fae1efdbb-5C7GfCeVMHo@public.gmane.org>
@ 2018-07-10  3:14                               ` Zhang, Jerry (Junwei)
  0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Jerry (Junwei) @ 2018-07-10  3:14 UTC (permalink / raw)
  To: Christian König, Andrey Grodzovsky,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: zhoucm1-5C7GfCeVMHo

On 07/09/2018 07:29 PM, Christian König wrote:
> Am 09.07.2018 um 12:14 schrieb Zhang, Jerry (Junwei):
>> On 07/09/2018 04:55 PM, Christian König wrote:
>>> Am 09.07.2018 um 09:48 schrieb Zhang, Jerry (Junwei):
>>>> On 07/09/2018 03:04 PM, Christian König wrote:
>>>>> Am 09.07.2018 um 07:13 schrieb Zhang, Jerry (Junwei):
>>>>>> On 07/06/2018 03:27 AM, Andrey Grodzovsky wrote:
>>>>>>> Extract and present the reposnsible process and thread when
>>>>>>> VM_FAULT happens.
>>>>>>>
>>>>>>> v2: Use getter and setter functions.
>>>>>>>
>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++++
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++++++---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 +++++++--
>>>>>>>   3 files changed, 18 insertions(+), 5 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>>> index 7a625f3..609c8f5 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>>> @@ -187,6 +187,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>>>>>>>       if (p->uf_entry.robj)
>>>>>>>           p->job->uf_addr = uf_offset;
>>>>>>>       kfree(chunk_array);
>>>>>>> +
>>>>>>> +    /* Use this opportunity to fill in task info for the vm */
>>>>>>> +    amdgpu_vm_set_task_info(vm);
>>>>>>> +
>>>>>>
>>>>>> Shall we set the task info when vm init?
>>>>>
>>>>> No, vm_init() is called from a completely different process which is later on user of the VM.
>>>>
>>>> Originally I thought UMD opened DRI node and create a VM by vm_init(), then every command submission
>>>> would be passed in the same VM initialized by vm_init().
>>>>
>>>> So that's different process?
>>>
>>> The display server, e.g. X or Wayland.
>>>
>>> See with DRI3 the process of opening a connection to the hardware is that the display server open the file descriptor and with that calls vm_init.
>>>
>>> And then this file descriptor is passed to the client processes through IPC.
>>
>> Thanks to reply.
>> yes, it's likely to open amdgpu node in ddx when driver probe and pass it to other client.
>>
>> While that looks like just in the process of initialization that X server loads ddx driver.
>
> No, that happens with each client.
>
>> But when I start 2 glxgears, that kms_open()->vm_init() will be called twice, which looks related to App as well.
>> (anyway, I will check it more)
>>
>> Even so, it sounds vm_init() should be created firstly, then we use that VM for process on every command submission.
>> So I thought to set the task info at the first time vm_init() and use that info on VM fault process func.
>
> No, that would certainly be not correct.
>
> As I explained vm_init() is not necessarily called by the process which is then going to use the VM.

I ran to an unexpected direction, in reality we'd like to get the process task_info for the client.
And vm_init() do happen in any place(xserver/ddx/client) but be used in client finally, so not good to get the task info in vm_init()

Thanks for your patient explanation.

the series of patch is

Reviewed-by: Junwei Zhang <Jerry.Zhang@amd.com>

Jerry


>
> Christian.
>
>>
>> Jerry
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2018-07-10  3:14 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-05 19:26 [PATCH v3 1/2] drm/amdgpu: Add support for logging process info in amdgpu_vm Andrey Grodzovsky
     [not found] ` <1530818820-28631-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
2018-07-05 19:27   ` [PATCH v3 2/2] drm/admgpu: Present amdgpu_task_info in VM_FAULTS Andrey Grodzovsky
     [not found]     ` <1530818820-28631-2-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
2018-07-06  3:09       ` 答复: " Qu, Jim
2018-07-09  5:13       ` Zhang, Jerry (Junwei)
     [not found]         ` <5B42EEEE.30305-5C7GfCeVMHo@public.gmane.org>
2018-07-09  7:04           ` Christian König
     [not found]             ` <d87f8526-e528-2c00-bfa7-e433c490c033-5C7GfCeVMHo@public.gmane.org>
2018-07-09  7:48               ` Zhang, Jerry (Junwei)
     [not found]                 ` <5B43133A.2020300-5C7GfCeVMHo@public.gmane.org>
2018-07-09  8:55                   ` Christian König
     [not found]                     ` <6a051779-2711-adba-aa88-e10d3ff20b6f-5C7GfCeVMHo@public.gmane.org>
2018-07-09 10:14                       ` Zhang, Jerry (Junwei)
     [not found]                         ` <5B433598.8000308-5C7GfCeVMHo@public.gmane.org>
2018-07-09 11:29                           ` Christian König
     [not found]                             ` <3f49bd26-26e6-459e-ac76-284fae1efdbb-5C7GfCeVMHo@public.gmane.org>
2018-07-10  3:14                               ` Zhang, Jerry (Junwei)

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.