amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Do not move root PT bo to relocated list
@ 2020-02-09  2:52 Pan, Xinhui
  2020-02-09  8:21 ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Pan, Xinhui @ 2020-02-09  2:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Deucher, Alexander, Koenig, Christian

We hit a kernel panic (NULL pointer dereference) when updating the page tables.

<1>[  122.103290] BUG: kernel NULL pointer dereference, address: 0000000000000008
<1>[  122.103348] #PF: supervisor read access in kernel mode
<1>[  122.103376] #PF: error_code(0x0000) - not-present page
<6>[  122.103403] PGD 0 P4D 0 
<4>[  122.103421] Oops: 0000 [#1] SMP PTI
<4>[  122.103442] CPU: 13 PID: 2133 Comm: kfdtest Tainted: G           OE     5.4.0-rc7+ #7
<4>[  122.103480] Hardware name: Supermicro SYS-7048GR-TR/X10DRG-Q, BIOS 3.0b 03/09/2018
<4>[  122.103657] RIP: 0010:amdgpu_vm_update_pdes+0x140/0x330 [amdgpu]
<4>[  122.103689] Code: 03 4c 89 73 08 49 89 9d c8 00 00 00 48 8b 7b f0 c6 43 10 00 45 31 c0 48 8b 87 28 04 00 00 48 85 c0 74 07 4c 8b 80 20 04 00 00 <4d> 8b 70 08 31 f6 49 8b 86 28 04 00 00 48 85 c0 74 0f 48 8b 80 28
<4>[  122.103769] RSP: 0018:ffffb49a0a6a3a98 EFLAGS: 00010246
<4>[  122.103797] RAX: 0000000000000000 RBX: ffff9020f823c148 RCX: dead000000000122
<4>[  122.103831] RDX: ffff9020ece70018 RSI: ffff9020f823c0c8 RDI: ffff9010ca31c800
<4>[  122.103865] RBP: ffffb49a0a6a3b38 R08: 0000000000000000 R09: 0000000000000001
<4>[  122.103899] R10: 000000006044f994 R11: 00000000df57fb58 R12: ffff9020f823c000
<4>[  122.103933] R13: ffff9020f823c000 R14: ffff9020f823c0c8 R15: ffff9010d5d20000
<4>[  122.103968] FS:  00007f32c83dc780(0000) GS:ffff9020ff380000(0000) knlGS:0000000000000000
<4>[  122.104006] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
<4>[  122.104035] CR2: 0000000000000008 CR3: 0000002036bba005 CR4: 00000000003606e0
<4>[  122.104069] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
<4>[  122.104103] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
<4>[  122.104137] Call Trace:
<4>[  122.104241]  vm_update_pds+0x31/0x50 [amdgpu]
<4>[  122.104347]  amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x2ef/0x690 [amdgpu]
<4>[  122.104466]  kfd_process_alloc_gpuvm+0x98/0x190 [amdgpu]
<4>[  122.104576]  kfd_process_device_init_vm.part.8+0xf3/0x1f0 [amdgpu]
<4>[  122.104688]  kfd_process_device_init_vm+0x24/0x30 [amdgpu]
<4>[  122.104794]  kfd_ioctl_acquire_vm+0xa4/0xc0 [amdgpu]
<4>[  122.104900]  kfd_ioctl+0x277/0x500 [amdgpu]
<4>[  122.105001]  ? kfd_ioctl_free_memory_of_gpu+0xc0/0xc0 [amdgpu]
<4>[  122.105039]  ? rcu_read_lock_sched_held+0x4f/0x80
<4>[  122.105068]  ? kmem_cache_free+0x2ba/0x300
<4>[  122.105093]  ? vm_area_free+0x18/0x20
<4>[  122.105117]  ? find_held_lock+0x35/0xa0
<4>[  122.105143]  do_vfs_ioctl+0xa9/0x6f0
<4>[  122.106001]  ksys_ioctl+0x75/0x80
<4>[  122.106802]  ? do_syscall_64+0x17/0x230
<4>[  122.107605]  __x64_sys_ioctl+0x1a/0x20
<4>[  122.108378]  do_syscall_64+0x5f/0x230
<4>[  122.109118]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
<4>[  122.109842] RIP: 0033:0x7f32c6b495d7

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3195bc90985a..3c388fdf335c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2619,7 +2619,7 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
 			continue;
 		bo_base->moved = true;
 
-		if (bo->tbo.type == ttm_bo_type_kernel)
+		if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
 			amdgpu_vm_bo_relocated(bo_base);
 		else if (bo->tbo.base.resv == vm->root.base.bo->tbo.base.resv)
 			amdgpu_vm_bo_moved(bo_base);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Do not move root PT bo to relocated list
  2020-02-09  2:52 [PATCH] drm/amdgpu: Do not move root PT bo to relocated list Pan, Xinhui
@ 2020-02-09  8:21 ` Christian König
  2020-02-10  0:59   ` Pan, Xinhui
  0 siblings, 1 reply; 6+ messages in thread
From: Christian König @ 2020-02-09  8:21 UTC (permalink / raw)
  To: Pan, Xinhui, amd-gfx; +Cc: Deucher, Alexander, Koenig, Christian

Am 09.02.20 um 03:52 schrieb Pan, Xinhui:
> hit panic when we update the page tables.
>
> <1>[  122.103290] BUG: kernel NULL pointer dereference, address: 0000000000000008
> <1>[  122.103348] #PF: supervisor read access in kernel mode
> <1>[  122.103376] #PF: error_code(0x0000) - not-present page
> <6>[  122.103403] PGD 0 P4D 0
> <4>[  122.103421] Oops: 0000 [#1] SMP PTI
> <4>[  122.103442] CPU: 13 PID: 2133 Comm: kfdtest Tainted: G           OE     5.4.0-rc7+ #7
> <4>[  122.103480] Hardware name: Supermicro SYS-7048GR-TR/X10DRG-Q, BIOS 3.0b 03/09/2018
> <4>[  122.103657] RIP: 0010:amdgpu_vm_update_pdes+0x140/0x330 [amdgpu]
> <4>[  122.103689] Code: 03 4c 89 73 08 49 89 9d c8 00 00 00 48 8b 7b f0 c6 43 10 00 45 31 c0 48 8b 87 28 04 00 00 48 85 c0 74 07 4c 8b 80 20 04 00 00 <4d> 8b 70 08 31 f6 49 8b 86 28 04 00 00 48 85 c0 74 0f 48 8b 80 28
> <4>[  122.103769] RSP: 0018:ffffb49a0a6a3a98 EFLAGS: 00010246
> <4>[  122.103797] RAX: 0000000000000000 RBX: ffff9020f823c148 RCX: dead000000000122
> <4>[  122.103831] RDX: ffff9020ece70018 RSI: ffff9020f823c0c8 RDI: ffff9010ca31c800
> <4>[  122.103865] RBP: ffffb49a0a6a3b38 R08: 0000000000000000 R09: 0000000000000001
> <4>[  122.103899] R10: 000000006044f994 R11: 00000000df57fb58 R12: ffff9020f823c000
> <4>[  122.103933] R13: ffff9020f823c000 R14: ffff9020f823c0c8 R15: ffff9010d5d20000
> <4>[  122.103968] FS:  00007f32c83dc780(0000) GS:ffff9020ff380000(0000) knlGS:0000000000000000
> <4>[  122.104006] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> <4>[  122.104035] CR2: 0000000000000008 CR3: 0000002036bba005 CR4: 00000000003606e0
> <4>[  122.104069] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> <4>[  122.104103] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> <4>[  122.104137] Call Trace:
> <4>[  122.104241]  vm_update_pds+0x31/0x50 [amdgpu]
> <4>[  122.104347]  amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x2ef/0x690 [amdgpu]
> <4>[  122.104466]  kfd_process_alloc_gpuvm+0x98/0x190 [amdgpu]
> <4>[  122.104576]  kfd_process_device_init_vm.part.8+0xf3/0x1f0 [amdgpu]
> <4>[  122.104688]  kfd_process_device_init_vm+0x24/0x30 [amdgpu]
> <4>[  122.104794]  kfd_ioctl_acquire_vm+0xa4/0xc0 [amdgpu]
> <4>[  122.104900]  kfd_ioctl+0x277/0x500 [amdgpu]
> <4>[  122.105001]  ? kfd_ioctl_free_memory_of_gpu+0xc0/0xc0 [amdgpu]
> <4>[  122.105039]  ? rcu_read_lock_sched_held+0x4f/0x80
> <4>[  122.105068]  ? kmem_cache_free+0x2ba/0x300
> <4>[  122.105093]  ? vm_area_free+0x18/0x20
> <4>[  122.105117]  ? find_held_lock+0x35/0xa0
> <4>[  122.105143]  do_vfs_ioctl+0xa9/0x6f0
> <4>[  122.106001]  ksys_ioctl+0x75/0x80
> <4>[  122.106802]  ? do_syscall_64+0x17/0x230
> <4>[  122.107605]  __x64_sys_ioctl+0x1a/0x20
> <4>[  122.108378]  do_syscall_64+0x5f/0x230
> <4>[  122.109118]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> <4>[  122.109842] RIP: 0033:0x7f32c6b495d7
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3195bc90985a..3c388fdf335c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2619,7 +2619,7 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>   			continue;
>   		bo_base->moved = true;
>   
> -		if (bo->tbo.type == ttm_bo_type_kernel)
> +		if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)

Good catch, but that would mean that we move the root PD to the moved 
state which in turn is illegal as well.

Maybe it would be better to adjust amdgpu_vm_bo_relocated() to move the
root PD to the idle state instead.

Christian.


>   			amdgpu_vm_bo_relocated(bo_base);
>   		else if (bo->tbo.base.resv == vm->root.base.bo->tbo.base.resv)
>   			amdgpu_vm_bo_moved(bo_base);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Do not move root PT bo to relocated list
  2020-02-09  8:21 ` Christian König
@ 2020-02-10  0:59   ` Pan, Xinhui
  2020-02-10  9:31     ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Pan, Xinhui @ 2020-02-10  0:59 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx; +Cc: Deucher, Alexander, Koenig, Christian


[-- Attachment #1.1: Type: text/plain, Size: 8987 bytes --]

[AMD Official Use Only - Internal Distribution Only]

If so, the function name does not match its functionality.

________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Sunday, February 9, 2020 4:21:13 PM
To: Pan, Xinhui <Xinhui.Pan@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Do not move root PT bo to relocated list

Am 09.02.20 um 03:52 schrieb Pan, Xinhui:
> hit panic when we update the page tables.
>
> <1>[  122.103290] BUG: kernel NULL pointer dereference, address: 0000000000000008
> <1>[  122.103348] #PF: supervisor read access in kernel mode
> <1>[  122.103376] #PF: error_code(0x0000) - not-present page
> <6>[  122.103403] PGD 0 P4D 0
> <4>[  122.103421] Oops: 0000 [#1] SMP PTI
> <4>[  122.103442] CPU: 13 PID: 2133 Comm: kfdtest Tainted: G           OE     5.4.0-rc7+ #7
> <4>[  122.103480] Hardware name: Supermicro SYS-7048GR-TR/X10DRG-Q, BIOS 3.0b 03/09/2018
> <4>[  122.103657] RIP: 0010:amdgpu_vm_update_pdes+0x140/0x330 [amdgpu]
> <4>[  122.103689] Code: 03 4c 89 73 08 49 89 9d c8 00 00 00 48 8b 7b f0 c6 43 10 00 45 31 c0 48 8b 87 28 04 00 00 48 85 c0 74 07 4c 8b 80 20 04 00 00 <4d> 8b 70 08 31 f6 49 8b 86 28 04 00 00 48 85 c0 74 0f 48 8b 80 28
> <4>[  122.103769] RSP: 0018:ffffb49a0a6a3a98 EFLAGS: 00010246
> <4>[  122.103797] RAX: 0000000000000000 RBX: ffff9020f823c148 RCX: dead000000000122
> <4>[  122.103831] RDX: ffff9020ece70018 RSI: ffff9020f823c0c8 RDI: ffff9010ca31c800
> <4>[  122.103865] RBP: ffffb49a0a6a3b38 R08: 0000000000000000 R09: 0000000000000001
> <4>[  122.103899] R10: 000000006044f994 R11: 00000000df57fb58 R12: ffff9020f823c000
> <4>[  122.103933] R13: ffff9020f823c000 R14: ffff9020f823c0c8 R15: ffff9010d5d20000
> <4>[  122.103968] FS:  00007f32c83dc780(0000) GS:ffff9020ff380000(0000) knlGS:0000000000000000
> <4>[  122.104006] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> <4>[  122.104035] CR2: 0000000000000008 CR3: 0000002036bba005 CR4: 00000000003606e0
> <4>[  122.104069] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> <4>[  122.104103] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> <4>[  122.104137] Call Trace:
> <4>[  122.104241]  vm_update_pds+0x31/0x50 [amdgpu]
> <4>[  122.104347]  amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x2ef/0x690 [amdgpu]
> <4>[  122.104466]  kfd_process_alloc_gpuvm+0x98/0x190 [amdgpu]
> <4>[  122.104576]  kfd_process_device_init_vm.part.8+0xf3/0x1f0 [amdgpu]
> <4>[  122.104688]  kfd_process_device_init_vm+0x24/0x30 [amdgpu]
> <4>[  122.104794]  kfd_ioctl_acquire_vm+0xa4/0xc0 [amdgpu]
> <4>[  122.104900]  kfd_ioctl+0x277/0x500 [amdgpu]
> <4>[  122.105001]  ? kfd_ioctl_free_memory_of_gpu+0xc0/0xc0 [amdgpu]
> <4>[  122.105039]  ? rcu_read_lock_sched_held+0x4f/0x80
> <4>[  122.105068]  ? kmem_cache_free+0x2ba/0x300
> <4>[  122.105093]  ? vm_area_free+0x18/0x20
> <4>[  122.105117]  ? find_held_lock+0x35/0xa0
> <4>[  122.105143]  do_vfs_ioctl+0xa9/0x6f0
> <4>[  122.106001]  ksys_ioctl+0x75/0x80
> <4>[  122.106802]  ? do_syscall_64+0x17/0x230
> <4>[  122.107605]  __x64_sys_ioctl+0x1a/0x20
> <4>[  122.108378]  do_syscall_64+0x5f/0x230
> <4>[  122.109118]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> <4>[  122.109842] RIP: 0033:0x7f32c6b495d7
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3195bc90985a..3c388fdf335c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2619,7 +2619,7 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>                        continue;
>                bo_base->moved = true;
>
> -             if (bo->tbo.type == ttm_bo_type_kernel)
> +             if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)

Good catch, but that would mean that we move the root PD to the moved
state which in turn is illegal as well.

Maybe better adjust amdgpu_vm_bo_relocated() to move the root PD to the
idle state instead.

Christian.


>                        amdgpu_vm_bo_relocated(bo_base);
>                else if (bo->tbo.base.resv == vm->root.base.bo->tbo.base.resv)
>                        amdgpu_vm_bo_moved(bo_base);

________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Sunday, February 9, 2020 4:21:13 PM
To: Pan, Xinhui <Xinhui.Pan@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Do not move root PT bo to relocated list

Am 09.02.20 um 03:52 schrieb Pan, Xinhui:
> hit panic when we update the page tables.
>
> <1>[  122.103290] BUG: kernel NULL pointer dereference, address: 0000000000000008
> <1>[  122.103348] #PF: supervisor read access in kernel mode
> <1>[  122.103376] #PF: error_code(0x0000) - not-present page
> <6>[  122.103403] PGD 0 P4D 0
> <4>[  122.103421] Oops: 0000 [#1] SMP PTI
> <4>[  122.103442] CPU: 13 PID: 2133 Comm: kfdtest Tainted: G           OE     5.4.0-rc7+ #7
> <4>[  122.103480] Hardware name: Supermicro SYS-7048GR-TR/X10DRG-Q, BIOS 3.0b 03/09/2018
> <4>[  122.103657] RIP: 0010:amdgpu_vm_update_pdes+0x140/0x330 [amdgpu]
> <4>[  122.103689] Code: 03 4c 89 73 08 49 89 9d c8 00 00 00 48 8b 7b f0 c6 43 10 00 45 31 c0 48 8b 87 28 04 00 00 48 85 c0 74 07 4c 8b 80 20 04 00 00 <4d> 8b 70 08 31 f6 49 8b 86 28 04 00 00 48 85 c0 74 0f 48 8b 80 28
> <4>[  122.103769] RSP: 0018:ffffb49a0a6a3a98 EFLAGS: 00010246
> <4>[  122.103797] RAX: 0000000000000000 RBX: ffff9020f823c148 RCX: dead000000000122
> <4>[  122.103831] RDX: ffff9020ece70018 RSI: ffff9020f823c0c8 RDI: ffff9010ca31c800
> <4>[  122.103865] RBP: ffffb49a0a6a3b38 R08: 0000000000000000 R09: 0000000000000001
> <4>[  122.103899] R10: 000000006044f994 R11: 00000000df57fb58 R12: ffff9020f823c000
> <4>[  122.103933] R13: ffff9020f823c000 R14: ffff9020f823c0c8 R15: ffff9010d5d20000
> <4>[  122.103968] FS:  00007f32c83dc780(0000) GS:ffff9020ff380000(0000) knlGS:0000000000000000
> <4>[  122.104006] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> <4>[  122.104035] CR2: 0000000000000008 CR3: 0000002036bba005 CR4: 00000000003606e0
> <4>[  122.104069] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> <4>[  122.104103] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> <4>[  122.104137] Call Trace:
> <4>[  122.104241]  vm_update_pds+0x31/0x50 [amdgpu]
> <4>[  122.104347]  amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x2ef/0x690 [amdgpu]
> <4>[  122.104466]  kfd_process_alloc_gpuvm+0x98/0x190 [amdgpu]
> <4>[  122.104576]  kfd_process_device_init_vm.part.8+0xf3/0x1f0 [amdgpu]
> <4>[  122.104688]  kfd_process_device_init_vm+0x24/0x30 [amdgpu]
> <4>[  122.104794]  kfd_ioctl_acquire_vm+0xa4/0xc0 [amdgpu]
> <4>[  122.104900]  kfd_ioctl+0x277/0x500 [amdgpu]
> <4>[  122.105001]  ? kfd_ioctl_free_memory_of_gpu+0xc0/0xc0 [amdgpu]
> <4>[  122.105039]  ? rcu_read_lock_sched_held+0x4f/0x80
> <4>[  122.105068]  ? kmem_cache_free+0x2ba/0x300
> <4>[  122.105093]  ? vm_area_free+0x18/0x20
> <4>[  122.105117]  ? find_held_lock+0x35/0xa0
> <4>[  122.105143]  do_vfs_ioctl+0xa9/0x6f0
> <4>[  122.106001]  ksys_ioctl+0x75/0x80
> <4>[  122.106802]  ? do_syscall_64+0x17/0x230
> <4>[  122.107605]  __x64_sys_ioctl+0x1a/0x20
> <4>[  122.108378]  do_syscall_64+0x5f/0x230
> <4>[  122.109118]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> <4>[  122.109842] RIP: 0033:0x7f32c6b495d7
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3195bc90985a..3c388fdf335c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2619,7 +2619,7 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>                        continue;
>                bo_base->moved = true;
>
> -             if (bo->tbo.type == ttm_bo_type_kernel)
> +             if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)

Good catch, but that would mean that we move the root PD to the moved
state which in turn is illegal as well.

Maybe better adjust amdgpu_vm_bo_relocated() to move the root PD to the
idle state instead.

Christian.


>                        amdgpu_vm_bo_relocated(bo_base);
>                else if (bo->tbo.base.resv == vm->root.base.bo->tbo.base.resv)
>                        amdgpu_vm_bo_moved(bo_base);


[-- Attachment #1.2: Type: text/html, Size: 14024 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Do not move root PT bo to relocated list
  2020-02-10  0:59   ` Pan, Xinhui
@ 2020-02-10  9:31     ` Christian König
  0 siblings, 0 replies; 6+ messages in thread
From: Christian König @ 2020-02-10  9:31 UTC (permalink / raw)
  To: Pan, Xinhui, amd-gfx; +Cc: Deucher, Alexander


[-- Attachment #1.1: Type: text/plain, Size: 10311 bytes --]

Yeah, that is indeed partially true.

But we already have the same logic in amdgpu_vm_bo_base_init() and 
amdgpu_vm_validate_pt_bos(). Just in the functions 
amdgpu_vm_invalidate_pds() and amdgpu_vm_bo_invalidate() the handling 
seems to be incorrect.

Still sounds like a good idea to me to have this logic in a common place 
and not duplicated multiple times.

And the function name is still correct if we think about it as a state
of the bo_va instead of a helper to put things on a list. It's just that
for the root PD we can skip this state change and go directly to the
idle state.

Regards,
Christian.

Am 10.02.20 um 01:59 schrieb Pan, Xinhui:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> If so the function name does not match its functionality.
>
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
> *Sent:* Sunday, February 9, 2020 4:21:13 PM
> *To:* Pan, Xinhui <Xinhui.Pan@amd.com>; amd-gfx@lists.freedesktop.org 
> <amd-gfx@lists.freedesktop.org>
> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, 
> Christian <Christian.Koenig@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu: Do not move root PT bo to relocated 
> list
> Am 09.02.20 um 03:52 schrieb Pan, Xinhui:
> > hit panic when we update the page tables.
> >
> > <1>[  122.103290] BUG: kernel NULL pointer dereference, address: 
> 0000000000000008
> > <1>[  122.103348] #PF: supervisor read access in kernel mode
> > <1>[  122.103376] #PF: error_code(0x0000) - not-present page
> > <6>[  122.103403] PGD 0 P4D 0
> > <4>[  122.103421] Oops: 0000 [#1] SMP PTI
> > <4>[  122.103442] CPU: 13 PID: 2133 Comm: kfdtest Tainted: 
> G           OE     5.4.0-rc7+ #7
> > <4>[  122.103480] Hardware name: Supermicro SYS-7048GR-TR/X10DRG-Q, 
> BIOS 3.0b 03/09/2018
> > <4>[  122.103657] RIP: 0010:amdgpu_vm_update_pdes+0x140/0x330 [amdgpu]
> > <4>[  122.103689] Code: 03 4c 89 73 08 49 89 9d c8 00 00 00 48 8b 7b 
> f0 c6 43 10 00 45 31 c0 48 8b 87 28 04 00 00 48 85 c0 74 07 4c 8b 80 
> 20 04 00 00 <4d> 8b 70 08 31 f6 49 8b 86 28 04 00 00 48 85 c0 74 0f 48 
> 8b 80 28
> > <4>[  122.103769] RSP: 0018:ffffb49a0a6a3a98 EFLAGS: 00010246
> > <4>[  122.103797] RAX: 0000000000000000 RBX: ffff9020f823c148 RCX: 
> dead000000000122
> > <4>[  122.103831] RDX: ffff9020ece70018 RSI: ffff9020f823c0c8 RDI: 
> ffff9010ca31c800
> > <4>[  122.103865] RBP: ffffb49a0a6a3b38 R08: 0000000000000000 R09: 
> 0000000000000001
> > <4>[  122.103899] R10: 000000006044f994 R11: 00000000df57fb58 R12: 
> ffff9020f823c000
> > <4>[  122.103933] R13: ffff9020f823c000 R14: ffff9020f823c0c8 R15: 
> ffff9010d5d20000
> > <4>[  122.103968] FS:  00007f32c83dc780(0000) 
> GS:ffff9020ff380000(0000) knlGS:0000000000000000
> > <4>[  122.104006] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > <4>[  122.104035] CR2: 0000000000000008 CR3: 0000002036bba005 CR4: 
> 00000000003606e0
> > <4>[  122.104069] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> > <4>[  122.104103] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
> 0000000000000400
> > <4>[  122.104137] Call Trace:
> > <4>[  122.104241]  vm_update_pds+0x31/0x50 [amdgpu]
> > <4>[  122.104347] amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x2ef/0x690 
> [amdgpu]
> > <4>[  122.104466] kfd_process_alloc_gpuvm+0x98/0x190 [amdgpu]
> > <4>[  122.104576] kfd_process_device_init_vm.part.8+0xf3/0x1f0 [amdgpu]
> > <4>[  122.104688] kfd_process_device_init_vm+0x24/0x30 [amdgpu]
> > <4>[  122.104794] kfd_ioctl_acquire_vm+0xa4/0xc0 [amdgpu]
> > <4>[  122.104900]  kfd_ioctl+0x277/0x500 [amdgpu]
> > <4>[  122.105001]  ? kfd_ioctl_free_memory_of_gpu+0xc0/0xc0 [amdgpu]
> > <4>[  122.105039]  ? rcu_read_lock_sched_held+0x4f/0x80
> > <4>[  122.105068]  ? kmem_cache_free+0x2ba/0x300
> > <4>[  122.105093]  ? vm_area_free+0x18/0x20
> > <4>[  122.105117]  ? find_held_lock+0x35/0xa0
> > <4>[  122.105143]  do_vfs_ioctl+0xa9/0x6f0
> > <4>[  122.106001]  ksys_ioctl+0x75/0x80
> > <4>[  122.106802]  ? do_syscall_64+0x17/0x230
> > <4>[  122.107605]  __x64_sys_ioctl+0x1a/0x20
> > <4>[  122.108378]  do_syscall_64+0x5f/0x230
> > <4>[  122.109118] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> > <4>[  122.109842] RIP: 0033:0x7f32c6b495d7
> >
> > Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 3195bc90985a..3c388fdf335c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -2619,7 +2619,7 @@ void amdgpu_vm_bo_invalidate(struct 
> amdgpu_device *adev,
> >                        continue;
> >                bo_base->moved = true;
> >
> > -             if (bo->tbo.type == ttm_bo_type_kernel)
> > +             if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>
> Good catch, but that would mean that we move the root PD to the moved
> state which in turn is illegal as well.
>
> Maybe better adjust amdgpu_vm_bo_relocated() to move the root PD to the
> idle state instead.
>
> Christian.
>
>
> > amdgpu_vm_bo_relocated(bo_base);
> >                else if (bo->tbo.base.resv == 
> vm->root.base.bo->tbo.base.resv)
> >                        amdgpu_vm_bo_moved(bo_base);
>
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
> *Sent:* Sunday, February 9, 2020 4:21:13 PM
> *To:* Pan, Xinhui <Xinhui.Pan@amd.com>; amd-gfx@lists.freedesktop.org 
> <amd-gfx@lists.freedesktop.org>
> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, 
> Christian <Christian.Koenig@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu: Do not move root PT bo to relocated 
> list
> Am 09.02.20 um 03:52 schrieb Pan, Xinhui:
> > hit panic when we update the page tables.
> >
> > <1>[  122.103290] BUG: kernel NULL pointer dereference, address: 
> 0000000000000008
> > <1>[  122.103348] #PF: supervisor read access in kernel mode
> > <1>[  122.103376] #PF: error_code(0x0000) - not-present page
> > <6>[  122.103403] PGD 0 P4D 0
> > <4>[  122.103421] Oops: 0000 [#1] SMP PTI
> > <4>[  122.103442] CPU: 13 PID: 2133 Comm: kfdtest Tainted: 
> G           OE     5.4.0-rc7+ #7
> > <4>[  122.103480] Hardware name: Supermicro SYS-7048GR-TR/X10DRG-Q, 
> BIOS 3.0b 03/09/2018
> > <4>[  122.103657] RIP: 0010:amdgpu_vm_update_pdes+0x140/0x330 [amdgpu]
> > <4>[  122.103689] Code: 03 4c 89 73 08 49 89 9d c8 00 00 00 48 8b 7b 
> f0 c6 43 10 00 45 31 c0 48 8b 87 28 04 00 00 48 85 c0 74 07 4c 8b 80 
> 20 04 00 00 <4d> 8b 70 08 31 f6 49 8b 86 28 04 00 00 48 85 c0 74 0f 48 
> 8b 80 28
> > <4>[  122.103769] RSP: 0018:ffffb49a0a6a3a98 EFLAGS: 00010246
> > <4>[  122.103797] RAX: 0000000000000000 RBX: ffff9020f823c148 RCX: 
> dead000000000122
> > <4>[  122.103831] RDX: ffff9020ece70018 RSI: ffff9020f823c0c8 RDI: 
> ffff9010ca31c800
> > <4>[  122.103865] RBP: ffffb49a0a6a3b38 R08: 0000000000000000 R09: 
> 0000000000000001
> > <4>[  122.103899] R10: 000000006044f994 R11: 00000000df57fb58 R12: 
> ffff9020f823c000
> > <4>[  122.103933] R13: ffff9020f823c000 R14: ffff9020f823c0c8 R15: 
> ffff9010d5d20000
> > <4>[  122.103968] FS:  00007f32c83dc780(0000) 
> GS:ffff9020ff380000(0000) knlGS:0000000000000000
> > <4>[  122.104006] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > <4>[  122.104035] CR2: 0000000000000008 CR3: 0000002036bba005 CR4: 
> 00000000003606e0
> > <4>[  122.104069] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> > <4>[  122.104103] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
> 0000000000000400
> > <4>[  122.104137] Call Trace:
> > <4>[  122.104241]  vm_update_pds+0x31/0x50 [amdgpu]
> > <4>[  122.104347] amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x2ef/0x690 
> [amdgpu]
> > <4>[  122.104466] kfd_process_alloc_gpuvm+0x98/0x190 [amdgpu]
> > <4>[  122.104576] kfd_process_device_init_vm.part.8+0xf3/0x1f0 [amdgpu]
> > <4>[  122.104688] kfd_process_device_init_vm+0x24/0x30 [amdgpu]
> > <4>[  122.104794] kfd_ioctl_acquire_vm+0xa4/0xc0 [amdgpu]
> > <4>[  122.104900]  kfd_ioctl+0x277/0x500 [amdgpu]
> > <4>[  122.105001]  ? kfd_ioctl_free_memory_of_gpu+0xc0/0xc0 [amdgpu]
> > <4>[  122.105039]  ? rcu_read_lock_sched_held+0x4f/0x80
> > <4>[  122.105068]  ? kmem_cache_free+0x2ba/0x300
> > <4>[  122.105093]  ? vm_area_free+0x18/0x20
> > <4>[  122.105117]  ? find_held_lock+0x35/0xa0
> > <4>[  122.105143]  do_vfs_ioctl+0xa9/0x6f0
> > <4>[  122.106001]  ksys_ioctl+0x75/0x80
> > <4>[  122.106802]  ? do_syscall_64+0x17/0x230
> > <4>[  122.107605]  __x64_sys_ioctl+0x1a/0x20
> > <4>[  122.108378]  do_syscall_64+0x5f/0x230
> > <4>[  122.109118] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> > <4>[  122.109842] RIP: 0033:0x7f32c6b495d7
> >
> > Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 3195bc90985a..3c388fdf335c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -2619,7 +2619,7 @@ void amdgpu_vm_bo_invalidate(struct 
> amdgpu_device *adev,
> >                        continue;
> >                bo_base->moved = true;
> >
> > -             if (bo->tbo.type == ttm_bo_type_kernel)
> > +             if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>
> Good catch, but that would mean that we move the root PD to the moved
> state which in turn is illegal as well.
>
> Maybe better adjust amdgpu_vm_bo_relocated() to move the root PD to the
> idle state instead.
>
> Christian.
>
>
> > amdgpu_vm_bo_relocated(bo_base);
> >                else if (bo->tbo.base.resv == 
> vm->root.base.bo->tbo.base.resv)
> >                        amdgpu_vm_bo_moved(bo_base);
>


[-- Attachment #1.2: Type: text/html, Size: 20166 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Do not move root PT bo to relocated list
  2020-02-11  2:32 xinhui pan
@ 2020-02-11  9:46 ` Christian König
  0 siblings, 0 replies; 6+ messages in thread
From: Christian König @ 2020-02-11  9:46 UTC (permalink / raw)
  To: xinhui pan, amd-gfx; +Cc: Christian König

Am 11.02.20 um 03:32 schrieb xinhui pan:
> As root PD has no parent, we just need move its status to idle.
>
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> CC: Christian König <christian.koenig@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 34 +++++++++++++-------------
>   1 file changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index cc56eaba1911..0be293eb2773 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -229,19 +229,6 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
>   	else
>   		list_move_tail(&vm_bo->vm_status, &vm->evicted);
>   }
> -
> -/**
> - * amdgpu_vm_bo_relocated - vm_bo is reloacted
> - *
> - * @vm_bo: vm_bo which is relocated
> - *
> - * State for PDs/PTs which needs to update their parent PD.
> - */
> -static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
> -{
> -	list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
> -}
> -
>   /**
>    * amdgpu_vm_bo_moved - vm_bo is moved
>    *
> @@ -284,6 +271,22 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo)
>   	spin_unlock(&vm_bo->vm->invalidated_lock);
>   }
>   
> +/**
> + * amdgpu_vm_bo_relocated - vm_bo is reloacted
> + *
> + * @vm_bo: vm_bo which is relocated
> + *
> + * State for PDs/PTs which needs to update their parent PD.
> + * For the root PD, just move to idle state.
> + */
> +static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
> +{
> +	if (vm_bo->bo->parent)
> +		list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
> +	else
> +		amdgpu_vm_bo_idle(vm_bo);
> +}
> +
>   /**
>    * amdgpu_vm_bo_done - vm_bo is done
>    *
> @@ -691,10 +694,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   			amdgpu_vm_bo_moved(bo_base);
>   		} else {
>   			vm->update_funcs->map_table(bo);
> -			if (bo->parent)
> -				amdgpu_vm_bo_relocated(bo_base);
> -			else
> -				amdgpu_vm_bo_idle(bo_base);
> +			amdgpu_vm_bo_relocated(bo_base);
>   		}
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] drm/amdgpu: Do not move root PT bo to relocated list
@ 2020-02-11  2:32 xinhui pan
  2020-02-11  9:46 ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: xinhui pan @ 2020-02-11  2:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: xinhui pan, Christian König

As the root PD has no parent, we just need to move its status to idle.

Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: xinhui pan <xinhui.pan@amd.com>
CC: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 34 +++++++++++++-------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index cc56eaba1911..0be293eb2773 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -229,19 +229,6 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
 	else
 		list_move_tail(&vm_bo->vm_status, &vm->evicted);
 }
-
-/**
- * amdgpu_vm_bo_relocated - vm_bo is reloacted
- *
- * @vm_bo: vm_bo which is relocated
- *
- * State for PDs/PTs which needs to update their parent PD.
- */
-static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
-{
-	list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
-}
-
 /**
  * amdgpu_vm_bo_moved - vm_bo is moved
  *
@@ -284,6 +271,22 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo)
 	spin_unlock(&vm_bo->vm->invalidated_lock);
 }
 
+/**
+ * amdgpu_vm_bo_relocated - vm_bo is relocated
+ *
+ * @vm_bo: vm_bo which is relocated
+ *
+ * State for PDs/PTs which need to update their parent PD.
+ * For the root PD, just move to idle state.
+ */
+static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
+{
+	if (vm_bo->bo->parent)
+		list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
+	else
+		amdgpu_vm_bo_idle(vm_bo);
+}
+
 /**
  * amdgpu_vm_bo_done - vm_bo is done
  *
@@ -691,10 +694,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 			amdgpu_vm_bo_moved(bo_base);
 		} else {
 			vm->update_funcs->map_table(bo);
-			if (bo->parent)
-				amdgpu_vm_bo_relocated(bo_base);
-			else
-				amdgpu_vm_bo_idle(bo_base);
+			amdgpu_vm_bo_relocated(bo_base);
 		}
 	}
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-02-11  9:46 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-09  2:52 [PATCH] drm/amdgpu: Do not move root PT bo to relocated list Pan, Xinhui
2020-02-09  8:21 ` Christian König
2020-02-10  0:59   ` Pan, Xinhui
2020-02-10  9:31     ` Christian König
2020-02-11  2:32 xinhui pan
2020-02-11  9:46 ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).