All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
@ 2023-09-23 11:16 Yifan Zhang
  2023-09-25  6:45 ` Christian König
  2023-09-25 11:30 ` Zhang, Hawking
  0 siblings, 2 replies; 7+ messages in thread
From: Yifan Zhang @ 2023-09-23 11:16 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Yifan Zhang, christian.koenig

gmc11 hw_init depends on adev->gfxhub.funcs->init now, move
it to gmc11 sw_init. This patch fixes below driver load error on
GC 11.0.4:

[   43.928149] [drm] amdgpu: 512M of VRAM memory ready
[   43.928151] [drm] amdgpu: 7641M of GTT memory ready.
[   43.928168] [drm] GART: num cpu pages 131072, num gpu pages 131072
[   43.928182] BUG: kernel NULL pointer dereference, address: 0000000000000008
[   43.928185] #PF: supervisor read access in kernel mode
[   43.928187] #PF: error_code(0x0000) - not-present page
[   43.928188] PGD 0 P4D 0
[   43.928191] Oops: 0000 [#1] PREEMPT SMP NOPTI
[   43.928193] CPU: 7 PID: 1278 Comm: modprobe Tainted: G           OE      6.3.7+ #73
[   43.928198] RIP: 0010:gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
[   43.928406] Code: 8b 4d d4 48 89 4d c8 83 f9 0d 0f 87 7c 04 00 00 48 8b 45 c8 44 89 ee 44 89 e7 48 8d 04 80 48 c1 e0 04 49 8b 84 07 90 eb 00 00 <48> 8b 40 08 ff d0 0f 1f 00 44 8b 45 d4 41 89 c5 45 85 c0 0f 85 c0
[   43.928409] RSP: 0018:ffffae24c28277f0 EFLAGS: 00010256
[   43.928411] RAX: 0000000000000000 RBX: ffff95bf0ed80000 RCX: 0000000000000000
[   43.928413] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[   43.928414] RBP: ffffae24c2827848 R08: 0003000000000077 R09: 0000000000000401
[   43.928416] R10: 0000000000001000 R11: 0000000000001000 R12: 0000000000000000
[   43.928417] R13: 0000000000000000 R14: 0000000000000000 R15: ffff95bf0ed80000
[   43.928418] FS:  00007f2703c83000(0000) GS:ffff95c26e7c0000(0000) knlGS:0000000000000000
[   43.928421] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   43.928422] CR2: 0000000000000008 CR3: 000000012c300000 CR4: 0000000000750ee0
[   43.928424] PKRU: 55555554
[   43.928425] Call Trace:
[   43.928426]  <TASK>
[   43.928428]  ? show_regs+0x6e/0x80
[   43.928434]  ? __die+0x29/0x70
[   43.928436]  ? page_fault_oops+0x154/0x4a0
[   43.928440]  ? amdgpu_bo_move+0x25b/0x790 [amdgpu]
[   43.928612]  ? do_user_addr_fault+0x325/0x710
[   43.928615]  ? exc_page_fault+0x7a/0x180
[   43.928619]  ? asm_exc_page_fault+0x2b/0x30
[   43.928623]  ? gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
[   43.928797]  ? amdgpu_gart_bind+0x64/0xc0 [amdgpu]
[   43.928974]  amdgpu_gmc_flush_gpu_tlb+0x258/0x2e0 [amdgpu]
[   43.929159]  amdgpu_gart_invalidate_tlb+0x66/0xa0 [amdgpu]
[   43.929332]  amdgpu_gtt_mgr_recover+0x67/0x80 [amdgpu]
[   43.929504]  gmc_v11_0_hw_init+0x4c/0x130 [amdgpu]
[   43.929690]  amdgpu_device_init+0x1766/0x2d60 [amdgpu]
[   43.929858]  ? pci_bus_read_config_word+0x4b/0x80
[   43.929862]  ? do_pci_enable_device+0xdb/0x110
[   43.929866]  amdgpu_driver_load_kms+0x1e/0x1a0 [amdgpu]
[   43.930032]  amdgpu_pci_probe+0x16b/0x4e0 [amdgpu]
[   43.930201]  local_pci_probe+0x4c/0xb0
[   43.930204]  pci_device_probe+0xcc/0x280
[   43.930207]  really_probe+0x1c4/0x430
[   43.930210]  __driver_probe_device+0x8a/0x180
[   43.930212]  driver_probe_device+0x23/0xc0
[   43.930214]  __driver_attach+0xfb/0x200
[   43.930215]  ? __pfx___driver_attach+0x10/0x10
[   43.930217]  bus_for_each_dev+0x7e/0xd0
[   43.930221]  driver_attach+0x22/0x30
[   43.930222]  bus_add_driver+0x120/0x230
[   43.930224]  driver_register+0x68/0x130
[   43.930226]  ? __pfx_init_module+0x10/0x10 [amdgpu]
[   43.930394]  __pci_register_driver+0x6c/0x80
[   43.930396]  amdgpu_init+0x67/0xff0 [amdgpu]
[   43.930549]  do_one_initcall+0x4a/0x240
[   43.930554]  ? kmalloc_trace+0x2e/0xa0
[   43.930559]  do_init_module+0x52/0x230
[   43.930563]  load_module+0x2386/0x2880
[   43.930566]  ? security_kernel_post_read_file+0x60/0x70
[   43.930571]  __do_sys_finit_module+0xc8/0x140
[   43.930573]  ? __do_sys_finit_module+0xc8/0x140
[   43.930576]  __x64_sys_finit_module+0x1c/0x30
[   43.930578]  do_syscall_64+0x5d/0x90
[   43.930581]  ? syscall_exit_to_user_mode+0x2a/0x50
[   43.930583]  ? __x64_sys_mmap+0x37/0x50
[   43.930586]  ? do_syscall_64+0x6d/0x90
[   43.930588]  entry_SYSCALL_64_after_hwframe+0x72/0xdc

Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
Fixes: c6f52f1ffd91 (drm/amdgpu: cleanup gmc_v11_0_flush_gpu_tlb)
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 --
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 42fc0cc13fdd..9c4562bda8cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1329,8 +1329,6 @@ static int gfx_v11_0_sw_init(void *handle)
 	struct amdgpu_kiq *kiq;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	adev->gfxhub.funcs->init(adev);
-
 	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
 	case IP_VERSION(11, 0, 0):
 	case IP_VERSION(11, 0, 2):
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 7d61f66625fa..d0e3583a3cac 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -720,6 +720,8 @@ static int gmc_v11_0_sw_init(void *handle)
 
 	adev->mmhub.funcs->init(adev);
 
+	adev->gfxhub.funcs->init(adev);
+
 	spin_lock_init(&adev->gmc.invalidate_lock);
 
 	r = amdgpu_atomfirmware_get_vram_info(adev,
-- 
2.37.3


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
  2023-09-23 11:16 [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init Yifan Zhang
@ 2023-09-25  6:45 ` Christian König
  2023-09-25  8:45   ` Zhang, Yifan
  2023-09-25 11:30 ` Zhang, Hawking
  1 sibling, 1 reply; 7+ messages in thread
From: Christian König @ 2023-09-25  6:45 UTC (permalink / raw)
  To: Yifan Zhang, amd-gfx; +Cc: Alexander.Deucher

Am 23.09.23 um 13:16 schrieb Yifan Zhang:
> gmc11 hw_init depends on adev->gfxhub.funcs->init now, move
> it to gmc11 sw_init. This patch fixes below driver load error on
> GC 11.0.4:
>
> [   43.928149] [drm] amdgpu: 512M of VRAM memory ready
> [   43.928151] [drm] amdgpu: 7641M of GTT memory ready.
> [   43.928168] [drm] GART: num cpu pages 131072, num gpu pages 131072
> [   43.928182] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [   43.928185] #PF: supervisor read access in kernel mode
> [   43.928187] #PF: error_code(0x0000) - not-present page
> [   43.928188] PGD 0 P4D 0
> [   43.928191] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [   43.928193] CPU: 7 PID: 1278 Comm: modprobe Tainted: G           OE      6.3.7+ #73
> [   43.928198] RIP: 0010:gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
> [   43.928406] Code: 8b 4d d4 48 89 4d c8 83 f9 0d 0f 87 7c 04 00 00 48 8b 45 c8 44 89 ee 44 89 e7 48 8d 04 80 48 c1 e0 04 49 8b 84 07 90 eb 00 00 <48> 8b 40 08 ff d0 0f 1f 00 44 8b 45 d4 41 89 c5 45 85 c0 0f 85 c0
> [   43.928409] RSP: 0018:ffffae24c28277f0 EFLAGS: 00010256
> [   43.928411] RAX: 0000000000000000 RBX: ffff95bf0ed80000 RCX: 0000000000000000
> [   43.928413] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> [   43.928414] RBP: ffffae24c2827848 R08: 0003000000000077 R09: 0000000000000401
> [   43.928416] R10: 0000000000001000 R11: 0000000000001000 R12: 0000000000000000
> [   43.928417] R13: 0000000000000000 R14: 0000000000000000 R15: ffff95bf0ed80000
> [   43.928418] FS:  00007f2703c83000(0000) GS:ffff95c26e7c0000(0000) knlGS:0000000000000000
> [   43.928421] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   43.928422] CR2: 0000000000000008 CR3: 000000012c300000 CR4: 0000000000750ee0
> [   43.928424] PKRU: 55555554
> [   43.928425] Call Trace:
> [   43.928426]  <TASK>
> [   43.928428]  ? show_regs+0x6e/0x80
> [   43.928434]  ? __die+0x29/0x70
> [   43.928436]  ? page_fault_oops+0x154/0x4a0
> [   43.928440]  ? amdgpu_bo_move+0x25b/0x790 [amdgpu]
> [   43.928612]  ? do_user_addr_fault+0x325/0x710
> [   43.928615]  ? exc_page_fault+0x7a/0x180
> [   43.928619]  ? asm_exc_page_fault+0x2b/0x30
> [   43.928623]  ? gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
> [   43.928797]  ? amdgpu_gart_bind+0x64/0xc0 [amdgpu]
> [   43.928974]  amdgpu_gmc_flush_gpu_tlb+0x258/0x2e0 [amdgpu]
> [   43.929159]  amdgpu_gart_invalidate_tlb+0x66/0xa0 [amdgpu]
> [   43.929332]  amdgpu_gtt_mgr_recover+0x67/0x80 [amdgpu]
> [   43.929504]  gmc_v11_0_hw_init+0x4c/0x130 [amdgpu]
> [   43.929690]  amdgpu_device_init+0x1766/0x2d60 [amdgpu]
> [   43.929858]  ? pci_bus_read_config_word+0x4b/0x80
> [   43.929862]  ? do_pci_enable_device+0xdb/0x110
> [   43.929866]  amdgpu_driver_load_kms+0x1e/0x1a0 [amdgpu]
> [   43.930032]  amdgpu_pci_probe+0x16b/0x4e0 [amdgpu]
> [   43.930201]  local_pci_probe+0x4c/0xb0
> [   43.930204]  pci_device_probe+0xcc/0x280
> [   43.930207]  really_probe+0x1c4/0x430
> [   43.930210]  __driver_probe_device+0x8a/0x180
> [   43.930212]  driver_probe_device+0x23/0xc0
> [   43.930214]  __driver_attach+0xfb/0x200
> [   43.930215]  ? __pfx___driver_attach+0x10/0x10
> [   43.930217]  bus_for_each_dev+0x7e/0xd0
> [   43.930221]  driver_attach+0x22/0x30
> [   43.930222]  bus_add_driver+0x120/0x230
> [   43.930224]  driver_register+0x68/0x130
> [   43.930226]  ? __pfx_init_module+0x10/0x10 [amdgpu]
> [   43.930394]  __pci_register_driver+0x6c/0x80
> [   43.930396]  amdgpu_init+0x67/0xff0 [amdgpu]
> [   43.930549]  do_one_initcall+0x4a/0x240
> [   43.930554]  ? kmalloc_trace+0x2e/0xa0
> [   43.930559]  do_init_module+0x52/0x230
> [   43.930563]  load_module+0x2386/0x2880
> [   43.930566]  ? security_kernel_post_read_file+0x60/0x70
> [   43.930571]  __do_sys_finit_module+0xc8/0x140
> [   43.930573]  ? __do_sys_finit_module+0xc8/0x140
> [   43.930576]  __x64_sys_finit_module+0x1c/0x30
> [   43.930578]  do_syscall_64+0x5d/0x90
> [   43.930581]  ? syscall_exit_to_user_mode+0x2a/0x50
> [   43.930583]  ? __x64_sys_mmap+0x37/0x50
> [   43.930586]  ? do_syscall_64+0x6d/0x90
> [   43.930588]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
> Fixes: c6f52f1ffd91 (drm/amdgpu: cleanup gmc_v11_0_flush_gpu_tlb)

Reviewed-by: Christian König <christian.koenig@amd.com>

Any idea why that was that in the gfx init in the first place? For all 
other generations and the mmhub it's in the GMC init.

Thanks,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 --
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 ++
>   2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 42fc0cc13fdd..9c4562bda8cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1329,8 +1329,6 @@ static int gfx_v11_0_sw_init(void *handle)
>   	struct amdgpu_kiq *kiq;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> -	adev->gfxhub.funcs->init(adev);
> -
>   	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>   	case IP_VERSION(11, 0, 0):
>   	case IP_VERSION(11, 0, 2):
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 7d61f66625fa..d0e3583a3cac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -720,6 +720,8 @@ static int gmc_v11_0_sw_init(void *handle)
>   
>   	adev->mmhub.funcs->init(adev);
>   
> +	adev->gfxhub.funcs->init(adev);
> +
>   	spin_lock_init(&adev->gmc.invalidate_lock);
>   
>   	r = amdgpu_atomfirmware_get_vram_info(adev,


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
  2023-09-25  6:45 ` Christian König
@ 2023-09-25  8:45   ` Zhang, Yifan
  0 siblings, 0 replies; 7+ messages in thread
From: Zhang, Yifan @ 2023-09-25  8:45 UTC (permalink / raw)
  To: Christian König, amd-gfx, Zhang, Hawking, Yin, Tianci (Rico)
  Cc: Deucher, Alexander

[AMD Official Use Only - General]

I'm not sure, mmhub and gfxhub init are separated since the first version of gfx11 code.

Hi Hawking/Tianci, do you know the reason ? Thanks !

Best Regards,
Yifan

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Monday, September 25, 2023 2:45 PM
To: Zhang, Yifan <Yifan1.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init

Am 23.09.23 um 13:16 schrieb Yifan Zhang:
> gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to
> gmc11 sw_init. This patch fixes below driver load error on GC 11.0.4:
>
> [   43.928149] [drm] amdgpu: 512M of VRAM memory ready
> [   43.928151] [drm] amdgpu: 7641M of GTT memory ready.
> [   43.928168] [drm] GART: num cpu pages 131072, num gpu pages 131072
> [   43.928182] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [   43.928185] #PF: supervisor read access in kernel mode
> [   43.928187] #PF: error_code(0x0000) - not-present page
> [   43.928188] PGD 0 P4D 0
> [   43.928191] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [   43.928193] CPU: 7 PID: 1278 Comm: modprobe Tainted: G           OE      6.3.7+ #73
> [   43.928198] RIP: 0010:gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
> [   43.928406] Code: 8b 4d d4 48 89 4d c8 83 f9 0d 0f 87 7c 04 00 00 48 8b 45 c8 44 89 ee 44 89 e7 48 8d 04 80 48 c1 e0 04 49 8b 84 07 90 eb 00 00 <48> 8b 40 08 ff d0 0f 1f 00 44 8b 45 d4 41 89 c5 45 85 c0 0f 85 c0
> [   43.928409] RSP: 0018:ffffae24c28277f0 EFLAGS: 00010256
> [   43.928411] RAX: 0000000000000000 RBX: ffff95bf0ed80000 RCX: 0000000000000000
> [   43.928413] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> [   43.928414] RBP: ffffae24c2827848 R08: 0003000000000077 R09: 0000000000000401
> [   43.928416] R10: 0000000000001000 R11: 0000000000001000 R12: 0000000000000000
> [   43.928417] R13: 0000000000000000 R14: 0000000000000000 R15: ffff95bf0ed80000
> [   43.928418] FS:  00007f2703c83000(0000) GS:ffff95c26e7c0000(0000) knlGS:0000000000000000
> [   43.928421] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   43.928422] CR2: 0000000000000008 CR3: 000000012c300000 CR4: 0000000000750ee0
> [   43.928424] PKRU: 55555554
> [   43.928425] Call Trace:
> [   43.928426]  <TASK>
> [   43.928428]  ? show_regs+0x6e/0x80
> [   43.928434]  ? __die+0x29/0x70
> [   43.928436]  ? page_fault_oops+0x154/0x4a0
> [   43.928440]  ? amdgpu_bo_move+0x25b/0x790 [amdgpu]
> [   43.928612]  ? do_user_addr_fault+0x325/0x710
> [   43.928615]  ? exc_page_fault+0x7a/0x180
> [   43.928619]  ? asm_exc_page_fault+0x2b/0x30
> [   43.928623]  ? gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
> [   43.928797]  ? amdgpu_gart_bind+0x64/0xc0 [amdgpu]
> [   43.928974]  amdgpu_gmc_flush_gpu_tlb+0x258/0x2e0 [amdgpu]
> [   43.929159]  amdgpu_gart_invalidate_tlb+0x66/0xa0 [amdgpu]
> [   43.929332]  amdgpu_gtt_mgr_recover+0x67/0x80 [amdgpu]
> [   43.929504]  gmc_v11_0_hw_init+0x4c/0x130 [amdgpu]
> [   43.929690]  amdgpu_device_init+0x1766/0x2d60 [amdgpu]
> [   43.929858]  ? pci_bus_read_config_word+0x4b/0x80
> [   43.929862]  ? do_pci_enable_device+0xdb/0x110
> [   43.929866]  amdgpu_driver_load_kms+0x1e/0x1a0 [amdgpu]
> [   43.930032]  amdgpu_pci_probe+0x16b/0x4e0 [amdgpu]
> [   43.930201]  local_pci_probe+0x4c/0xb0
> [   43.930204]  pci_device_probe+0xcc/0x280
> [   43.930207]  really_probe+0x1c4/0x430
> [   43.930210]  __driver_probe_device+0x8a/0x180
> [   43.930212]  driver_probe_device+0x23/0xc0
> [   43.930214]  __driver_attach+0xfb/0x200
> [   43.930215]  ? __pfx___driver_attach+0x10/0x10
> [   43.930217]  bus_for_each_dev+0x7e/0xd0
> [   43.930221]  driver_attach+0x22/0x30
> [   43.930222]  bus_add_driver+0x120/0x230
> [   43.930224]  driver_register+0x68/0x130
> [   43.930226]  ? __pfx_init_module+0x10/0x10 [amdgpu]
> [   43.930394]  __pci_register_driver+0x6c/0x80
> [   43.930396]  amdgpu_init+0x67/0xff0 [amdgpu]
> [   43.930549]  do_one_initcall+0x4a/0x240
> [   43.930554]  ? kmalloc_trace+0x2e/0xa0
> [   43.930559]  do_init_module+0x52/0x230
> [   43.930563]  load_module+0x2386/0x2880
> [   43.930566]  ? security_kernel_post_read_file+0x60/0x70
> [   43.930571]  __do_sys_finit_module+0xc8/0x140
> [   43.930573]  ? __do_sys_finit_module+0xc8/0x140
> [   43.930576]  __x64_sys_finit_module+0x1c/0x30
> [   43.930578]  do_syscall_64+0x5d/0x90
> [   43.930581]  ? syscall_exit_to_user_mode+0x2a/0x50
> [   43.930583]  ? __x64_sys_mmap+0x37/0x50
> [   43.930586]  ? do_syscall_64+0x6d/0x90
> [   43.930588]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
> Fixes: c6f52f1ffd91 (drm/amdgpu: cleanup gmc_v11_0_flush_gpu_tlb)

Reviewed-by: Christian König <christian.koenig@amd.com>

Any idea why that was that in the gfx init in the first place? For all other generations and the mmhub it's in the GMC init.

Thanks,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 --
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 ++
>   2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 42fc0cc13fdd..9c4562bda8cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1329,8 +1329,6 @@ static int gfx_v11_0_sw_init(void *handle)
>       struct amdgpu_kiq *kiq;
>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
> -     adev->gfxhub.funcs->init(adev);
> -
>       switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>       case IP_VERSION(11, 0, 0):
>       case IP_VERSION(11, 0, 2):
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 7d61f66625fa..d0e3583a3cac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -720,6 +720,8 @@ static int gmc_v11_0_sw_init(void *handle)
>
>       adev->mmhub.funcs->init(adev);
>
> +     adev->gfxhub.funcs->init(adev);
> +
>       spin_lock_init(&adev->gmc.invalidate_lock);
>
>       r = amdgpu_atomfirmware_get_vram_info(adev,


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
  2023-09-23 11:16 [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init Yifan Zhang
  2023-09-25  6:45 ` Christian König
@ 2023-09-25 11:30 ` Zhang, Hawking
  2023-09-25 11:32   ` Zhang, Hawking
  1 sibling, 1 reply; 7+ messages in thread
From: Zhang, Hawking @ 2023-09-25 11:30 UTC (permalink / raw)
  To: Zhang, Yifan, amd-gfx; +Cc: Deucher, Alexander, Zhang, Yifan, Koenig, Christian

[AMD Official Use Only - General]

RE - gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to gmc11 sw_init

isn't any driver loading failure observed? Since from gfx11, GFX won't be powered on until smu hw_init phase. Any programming that touch gfx core is invalid before it is powered on.

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Yifan Zhang
Sent: Saturday, September 23, 2023 19:17
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init

gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to gmc11 sw_init. This patch fixes below driver load error on GC 11.0.4:

[   43.928149] [drm] amdgpu: 512M of VRAM memory ready
[   43.928151] [drm] amdgpu: 7641M of GTT memory ready.
[   43.928168] [drm] GART: num cpu pages 131072, num gpu pages 131072
[   43.928182] BUG: kernel NULL pointer dereference, address: 0000000000000008
[   43.928185] #PF: supervisor read access in kernel mode
[   43.928187] #PF: error_code(0x0000) - not-present page
[   43.928188] PGD 0 P4D 0
[   43.928191] Oops: 0000 [#1] PREEMPT SMP NOPTI
[   43.928193] CPU: 7 PID: 1278 Comm: modprobe Tainted: G           OE      6.3.7+ #73
[   43.928198] RIP: 0010:gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
[   43.928406] Code: 8b 4d d4 48 89 4d c8 83 f9 0d 0f 87 7c 04 00 00 48 8b 45 c8 44 89 ee 44 89 e7 48 8d 04 80 48 c1 e0 04 49 8b 84 07 90 eb 00 00 <48> 8b 40 08 ff d0 0f 1f 00 44 8b 45 d4 41 89 c5 45 85 c0 0f 85 c0
[   43.928409] RSP: 0018:ffffae24c28277f0 EFLAGS: 00010256
[   43.928411] RAX: 0000000000000000 RBX: ffff95bf0ed80000 RCX: 0000000000000000
[   43.928413] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[   43.928414] RBP: ffffae24c2827848 R08: 0003000000000077 R09: 0000000000000401
[   43.928416] R10: 0000000000001000 R11: 0000000000001000 R12: 0000000000000000
[   43.928417] R13: 0000000000000000 R14: 0000000000000000 R15: ffff95bf0ed80000
[   43.928418] FS:  00007f2703c83000(0000) GS:ffff95c26e7c0000(0000) knlGS:0000000000000000
[   43.928421] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   43.928422] CR2: 0000000000000008 CR3: 000000012c300000 CR4: 0000000000750ee0
[   43.928424] PKRU: 55555554
[   43.928425] Call Trace:
[   43.928426]  <TASK>
[   43.928428]  ? show_regs+0x6e/0x80
[   43.928434]  ? __die+0x29/0x70
[   43.928436]  ? page_fault_oops+0x154/0x4a0
[   43.928440]  ? amdgpu_bo_move+0x25b/0x790 [amdgpu]
[   43.928612]  ? do_user_addr_fault+0x325/0x710
[   43.928615]  ? exc_page_fault+0x7a/0x180
[   43.928619]  ? asm_exc_page_fault+0x2b/0x30
[   43.928623]  ? gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
[   43.928797]  ? amdgpu_gart_bind+0x64/0xc0 [amdgpu]
[   43.928974]  amdgpu_gmc_flush_gpu_tlb+0x258/0x2e0 [amdgpu]
[   43.929159]  amdgpu_gart_invalidate_tlb+0x66/0xa0 [amdgpu]
[   43.929332]  amdgpu_gtt_mgr_recover+0x67/0x80 [amdgpu]
[   43.929504]  gmc_v11_0_hw_init+0x4c/0x130 [amdgpu]
[   43.929690]  amdgpu_device_init+0x1766/0x2d60 [amdgpu]
[   43.929858]  ? pci_bus_read_config_word+0x4b/0x80
[   43.929862]  ? do_pci_enable_device+0xdb/0x110
[   43.929866]  amdgpu_driver_load_kms+0x1e/0x1a0 [amdgpu]
[   43.930032]  amdgpu_pci_probe+0x16b/0x4e0 [amdgpu]
[   43.930201]  local_pci_probe+0x4c/0xb0
[   43.930204]  pci_device_probe+0xcc/0x280
[   43.930207]  really_probe+0x1c4/0x430
[   43.930210]  __driver_probe_device+0x8a/0x180
[   43.930212]  driver_probe_device+0x23/0xc0
[   43.930214]  __driver_attach+0xfb/0x200
[   43.930215]  ? __pfx___driver_attach+0x10/0x10
[   43.930217]  bus_for_each_dev+0x7e/0xd0
[   43.930221]  driver_attach+0x22/0x30
[   43.930222]  bus_add_driver+0x120/0x230
[   43.930224]  driver_register+0x68/0x130
[   43.930226]  ? __pfx_init_module+0x10/0x10 [amdgpu]
[   43.930394]  __pci_register_driver+0x6c/0x80
[   43.930396]  amdgpu_init+0x67/0xff0 [amdgpu]
[   43.930549]  do_one_initcall+0x4a/0x240
[   43.930554]  ? kmalloc_trace+0x2e/0xa0
[   43.930559]  do_init_module+0x52/0x230
[   43.930563]  load_module+0x2386/0x2880
[   43.930566]  ? security_kernel_post_read_file+0x60/0x70
[   43.930571]  __do_sys_finit_module+0xc8/0x140
[   43.930573]  ? __do_sys_finit_module+0xc8/0x140
[   43.930576]  __x64_sys_finit_module+0x1c/0x30
[   43.930578]  do_syscall_64+0x5d/0x90
[   43.930581]  ? syscall_exit_to_user_mode+0x2a/0x50
[   43.930583]  ? __x64_sys_mmap+0x37/0x50
[   43.930586]  ? do_syscall_64+0x6d/0x90
[   43.930588]  entry_SYSCALL_64_after_hwframe+0x72/0xdc

Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
Fixes: c6f52f1ffd91 (drm/amdgpu: cleanup gmc_v11_0_flush_gpu_tlb)
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 --  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 42fc0cc13fdd..9c4562bda8cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1329,8 +1329,6 @@ static int gfx_v11_0_sw_init(void *handle)
        struct amdgpu_kiq *kiq;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

-       adev->gfxhub.funcs->init(adev);
-
        switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
        case IP_VERSION(11, 0, 0):
        case IP_VERSION(11, 0, 2):
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 7d61f66625fa..d0e3583a3cac 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -720,6 +720,8 @@ static int gmc_v11_0_sw_init(void *handle)

        adev->mmhub.funcs->init(adev);

+       adev->gfxhub.funcs->init(adev);
+
        spin_lock_init(&adev->gmc.invalidate_lock);

        r = amdgpu_atomfirmware_get_vram_info(adev,
--
2.37.3


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
  2023-09-25 11:30 ` Zhang, Hawking
@ 2023-09-25 11:32   ` Zhang, Hawking
  2023-09-25 13:59     ` Zhang, Yifan
  0 siblings, 1 reply; 7+ messages in thread
From: Zhang, Hawking @ 2023-09-25 11:32 UTC (permalink / raw)
  To: Zhang, Hawking, Zhang, Yifan, amd-gfx
  Cc: Deucher, Alexander, Zhang, Yifan, Koenig, Christian

[AMD Official Use Only - General]

For sw_init, it's fine to move to gmc sw_init phase, but any hardware programming should be done after smu hw_init.

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Zhang, Hawking
Sent: Monday, September 25, 2023 19:30
To: Zhang, Yifan <Yifan1.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init

[AMD Official Use Only - General]

[AMD Official Use Only - General]

RE - gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to gmc11 sw_init

isn't any driver loading failure observed? Since from gfx11, GFX won't be powered on until smu hw_init phase. Any programming that touch gfx core is invalid before it is powered on.

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Yifan Zhang
Sent: Saturday, September 23, 2023 19:17
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init

gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to gmc11 sw_init. This patch fixes below driver load error on GC 11.0.4:

[   43.928149] [drm] amdgpu: 512M of VRAM memory ready
[   43.928151] [drm] amdgpu: 7641M of GTT memory ready.
[   43.928168] [drm] GART: num cpu pages 131072, num gpu pages 131072
[   43.928182] BUG: kernel NULL pointer dereference, address: 0000000000000008
[   43.928185] #PF: supervisor read access in kernel mode
[   43.928187] #PF: error_code(0x0000) - not-present page
[   43.928188] PGD 0 P4D 0
[   43.928191] Oops: 0000 [#1] PREEMPT SMP NOPTI
[   43.928193] CPU: 7 PID: 1278 Comm: modprobe Tainted: G           OE      6.3.7+ #73
[   43.928198] RIP: 0010:gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
[   43.928406] Code: 8b 4d d4 48 89 4d c8 83 f9 0d 0f 87 7c 04 00 00 48 8b 45 c8 44 89 ee 44 89 e7 48 8d 04 80 48 c1 e0 04 49 8b 84 07 90 eb 00 00 <48> 8b 40 08 ff d0 0f 1f 00 44 8b 45 d4 41 89 c5 45 85 c0 0f 85 c0
[   43.928409] RSP: 0018:ffffae24c28277f0 EFLAGS: 00010256
[   43.928411] RAX: 0000000000000000 RBX: ffff95bf0ed80000 RCX: 0000000000000000
[   43.928413] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[   43.928414] RBP: ffffae24c2827848 R08: 0003000000000077 R09: 0000000000000401
[   43.928416] R10: 0000000000001000 R11: 0000000000001000 R12: 0000000000000000
[   43.928417] R13: 0000000000000000 R14: 0000000000000000 R15: ffff95bf0ed80000
[   43.928418] FS:  00007f2703c83000(0000) GS:ffff95c26e7c0000(0000) knlGS:0000000000000000
[   43.928421] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   43.928422] CR2: 0000000000000008 CR3: 000000012c300000 CR4: 0000000000750ee0
[   43.928424] PKRU: 55555554
[   43.928425] Call Trace:
[   43.928426]  <TASK>
[   43.928428]  ? show_regs+0x6e/0x80
[   43.928434]  ? __die+0x29/0x70
[   43.928436]  ? page_fault_oops+0x154/0x4a0
[   43.928440]  ? amdgpu_bo_move+0x25b/0x790 [amdgpu]
[   43.928612]  ? do_user_addr_fault+0x325/0x710
[   43.928615]  ? exc_page_fault+0x7a/0x180
[   43.928619]  ? asm_exc_page_fault+0x2b/0x30
[   43.928623]  ? gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
[   43.928797]  ? amdgpu_gart_bind+0x64/0xc0 [amdgpu]
[   43.928974]  amdgpu_gmc_flush_gpu_tlb+0x258/0x2e0 [amdgpu]
[   43.929159]  amdgpu_gart_invalidate_tlb+0x66/0xa0 [amdgpu]
[   43.929332]  amdgpu_gtt_mgr_recover+0x67/0x80 [amdgpu]
[   43.929504]  gmc_v11_0_hw_init+0x4c/0x130 [amdgpu]
[   43.929690]  amdgpu_device_init+0x1766/0x2d60 [amdgpu]
[   43.929858]  ? pci_bus_read_config_word+0x4b/0x80
[   43.929862]  ? do_pci_enable_device+0xdb/0x110
[   43.929866]  amdgpu_driver_load_kms+0x1e/0x1a0 [amdgpu]
[   43.930032]  amdgpu_pci_probe+0x16b/0x4e0 [amdgpu]
[   43.930201]  local_pci_probe+0x4c/0xb0
[   43.930204]  pci_device_probe+0xcc/0x280
[   43.930207]  really_probe+0x1c4/0x430
[   43.930210]  __driver_probe_device+0x8a/0x180
[   43.930212]  driver_probe_device+0x23/0xc0
[   43.930214]  __driver_attach+0xfb/0x200
[   43.930215]  ? __pfx___driver_attach+0x10/0x10
[   43.930217]  bus_for_each_dev+0x7e/0xd0
[   43.930221]  driver_attach+0x22/0x30
[   43.930222]  bus_add_driver+0x120/0x230
[   43.930224]  driver_register+0x68/0x130
[   43.930226]  ? __pfx_init_module+0x10/0x10 [amdgpu]
[   43.930394]  __pci_register_driver+0x6c/0x80
[   43.930396]  amdgpu_init+0x67/0xff0 [amdgpu]
[   43.930549]  do_one_initcall+0x4a/0x240
[   43.930554]  ? kmalloc_trace+0x2e/0xa0
[   43.930559]  do_init_module+0x52/0x230
[   43.930563]  load_module+0x2386/0x2880
[   43.930566]  ? security_kernel_post_read_file+0x60/0x70
[   43.930571]  __do_sys_finit_module+0xc8/0x140
[   43.930573]  ? __do_sys_finit_module+0xc8/0x140
[   43.930576]  __x64_sys_finit_module+0x1c/0x30
[   43.930578]  do_syscall_64+0x5d/0x90
[   43.930581]  ? syscall_exit_to_user_mode+0x2a/0x50
[   43.930583]  ? __x64_sys_mmap+0x37/0x50
[   43.930586]  ? do_syscall_64+0x6d/0x90
[   43.930588]  entry_SYSCALL_64_after_hwframe+0x72/0xdc

Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
Fixes: c6f52f1ffd91 (drm/amdgpu: cleanup gmc_v11_0_flush_gpu_tlb)
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 --  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 42fc0cc13fdd..9c4562bda8cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1329,8 +1329,6 @@ static int gfx_v11_0_sw_init(void *handle)
        struct amdgpu_kiq *kiq;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

-       adev->gfxhub.funcs->init(adev);
-
        switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
        case IP_VERSION(11, 0, 0):
        case IP_VERSION(11, 0, 2):
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 7d61f66625fa..d0e3583a3cac 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -720,6 +720,8 @@ static int gmc_v11_0_sw_init(void *handle)

        adev->mmhub.funcs->init(adev);

+       adev->gfxhub.funcs->init(adev);
+
        spin_lock_init(&adev->gmc.invalidate_lock);

        r = amdgpu_atomfirmware_get_vram_info(adev,
--
2.37.3


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
  2023-09-25 11:32   ` Zhang, Hawking
@ 2023-09-25 13:59     ` Zhang, Yifan
  2023-09-25 18:08       ` Alex Deucher
  0 siblings, 1 reply; 7+ messages in thread
From: Zhang, Yifan @ 2023-09-25 13:59 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx; +Cc: Deucher, Alexander, Koenig, Christian

[AMD Official Use Only - General]

Hi Hawking,

No, no gfx HW programming before gfx power on, gmc_v11_0_flush_gpu_tlb returns before real tbl flush happens.  The dependence is a pure SW one as below:

static void gmc_v11_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
                     uint32_t vmhub, uint32_t flush_type)
 {
     bool use_semaphore = gmc_v11_0_use_invalidate_semaphore(adev, vmhub);
     struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
     u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type);  ---> get_invalidate_req  is NULL if gfxhub.funcs->init not called.
......
     if ((vmhub == AMDGPU_GFXHUB(0)) && !adev->gfx.is_poweron)
         return;
}

We can either move vmhub == AMDGPU_GFXHUB(0) before get_invalidate_req or move gfxhub.funcs->init to gmc sw_init as the patch does.  To align w/ other generations, I choose the later approach.
Driver loading failure was observed on GC 11.0.4, but it should be universal to all gfx11 ASICs.

BRs,
Yifan.



-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@amd.com>
Sent: Monday, September 25, 2023 7:33 PM
To: Zhang, Hawking <Hawking.Zhang@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init

[AMD Official Use Only - General]

For sw_init, it's fine to move to gmc sw_init phase, but any hardware programming should be done after smu hw_init.

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Zhang, Hawking
Sent: Monday, September 25, 2023 19:30
To: Zhang, Yifan <Yifan1.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init

[AMD Official Use Only - General]

[AMD Official Use Only - General]

RE - gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to gmc11 sw_init

isn't any driver loading failure observed? Since from gfx11, GFX won't be powered on until smu hw_init phase. Any programming that touch gfx core is invalid before it is powered on.

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Yifan Zhang
Sent: Saturday, September 23, 2023 19:17
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init

gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to gmc11 sw_init. This patch fixes below driver load error on GC 11.0.4:

[   43.928149] [drm] amdgpu: 512M of VRAM memory ready
[   43.928151] [drm] amdgpu: 7641M of GTT memory ready.
[   43.928168] [drm] GART: num cpu pages 131072, num gpu pages 131072
[   43.928182] BUG: kernel NULL pointer dereference, address: 0000000000000008
[   43.928185] #PF: supervisor read access in kernel mode
[   43.928187] #PF: error_code(0x0000) - not-present page
[   43.928188] PGD 0 P4D 0
[   43.928191] Oops: 0000 [#1] PREEMPT SMP NOPTI
[   43.928193] CPU: 7 PID: 1278 Comm: modprobe Tainted: G           OE      6.3.7+ #73
[   43.928198] RIP: 0010:gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
[   43.928406] Code: 8b 4d d4 48 89 4d c8 83 f9 0d 0f 87 7c 04 00 00 48 8b 45 c8 44 89 ee 44 89 e7 48 8d 04 80 48 c1 e0 04 49 8b 84 07 90 eb 00 00 <48> 8b 40 08 ff d0 0f 1f 00 44 8b 45 d4 41 89 c5 45 85 c0 0f 85 c0
[   43.928409] RSP: 0018:ffffae24c28277f0 EFLAGS: 00010256
[   43.928411] RAX: 0000000000000000 RBX: ffff95bf0ed80000 RCX: 0000000000000000
[   43.928413] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[   43.928414] RBP: ffffae24c2827848 R08: 0003000000000077 R09: 0000000000000401
[   43.928416] R10: 0000000000001000 R11: 0000000000001000 R12: 0000000000000000
[   43.928417] R13: 0000000000000000 R14: 0000000000000000 R15: ffff95bf0ed80000
[   43.928418] FS:  00007f2703c83000(0000) GS:ffff95c26e7c0000(0000) knlGS:0000000000000000
[   43.928421] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   43.928422] CR2: 0000000000000008 CR3: 000000012c300000 CR4: 0000000000750ee0
[   43.928424] PKRU: 55555554
[   43.928425] Call Trace:
[   43.928426]  <TASK>
[   43.928428]  ? show_regs+0x6e/0x80
[   43.928434]  ? __die+0x29/0x70
[   43.928436]  ? page_fault_oops+0x154/0x4a0
[   43.928440]  ? amdgpu_bo_move+0x25b/0x790 [amdgpu]
[   43.928612]  ? do_user_addr_fault+0x325/0x710
[   43.928615]  ? exc_page_fault+0x7a/0x180
[   43.928619]  ? asm_exc_page_fault+0x2b/0x30
[   43.928623]  ? gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
[   43.928797]  ? amdgpu_gart_bind+0x64/0xc0 [amdgpu]
[   43.928974]  amdgpu_gmc_flush_gpu_tlb+0x258/0x2e0 [amdgpu]
[   43.929159]  amdgpu_gart_invalidate_tlb+0x66/0xa0 [amdgpu]
[   43.929332]  amdgpu_gtt_mgr_recover+0x67/0x80 [amdgpu]
[   43.929504]  gmc_v11_0_hw_init+0x4c/0x130 [amdgpu]
[   43.929690]  amdgpu_device_init+0x1766/0x2d60 [amdgpu]
[   43.929858]  ? pci_bus_read_config_word+0x4b/0x80
[   43.929862]  ? do_pci_enable_device+0xdb/0x110
[   43.929866]  amdgpu_driver_load_kms+0x1e/0x1a0 [amdgpu]
[   43.930032]  amdgpu_pci_probe+0x16b/0x4e0 [amdgpu]
[   43.930201]  local_pci_probe+0x4c/0xb0
[   43.930204]  pci_device_probe+0xcc/0x280
[   43.930207]  really_probe+0x1c4/0x430
[   43.930210]  __driver_probe_device+0x8a/0x180
[   43.930212]  driver_probe_device+0x23/0xc0
[   43.930214]  __driver_attach+0xfb/0x200
[   43.930215]  ? __pfx___driver_attach+0x10/0x10
[   43.930217]  bus_for_each_dev+0x7e/0xd0
[   43.930221]  driver_attach+0x22/0x30
[   43.930222]  bus_add_driver+0x120/0x230
[   43.930224]  driver_register+0x68/0x130
[   43.930226]  ? __pfx_init_module+0x10/0x10 [amdgpu]
[   43.930394]  __pci_register_driver+0x6c/0x80
[   43.930396]  amdgpu_init+0x67/0xff0 [amdgpu]
[   43.930549]  do_one_initcall+0x4a/0x240
[   43.930554]  ? kmalloc_trace+0x2e/0xa0
[   43.930559]  do_init_module+0x52/0x230
[   43.930563]  load_module+0x2386/0x2880
[   43.930566]  ? security_kernel_post_read_file+0x60/0x70
[   43.930571]  __do_sys_finit_module+0xc8/0x140
[   43.930573]  ? __do_sys_finit_module+0xc8/0x140
[   43.930576]  __x64_sys_finit_module+0x1c/0x30
[   43.930578]  do_syscall_64+0x5d/0x90
[   43.930581]  ? syscall_exit_to_user_mode+0x2a/0x50
[   43.930583]  ? __x64_sys_mmap+0x37/0x50
[   43.930586]  ? do_syscall_64+0x6d/0x90
[   43.930588]  entry_SYSCALL_64_after_hwframe+0x72/0xdc

Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
Fixes: c6f52f1ffd91 (drm/amdgpu: cleanup gmc_v11_0_flush_gpu_tlb)
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 --  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 42fc0cc13fdd..9c4562bda8cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1329,8 +1329,6 @@ static int gfx_v11_0_sw_init(void *handle)
        struct amdgpu_kiq *kiq;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

-       adev->gfxhub.funcs->init(adev);
-
        switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
        case IP_VERSION(11, 0, 0):
        case IP_VERSION(11, 0, 2):
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 7d61f66625fa..d0e3583a3cac 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -720,6 +720,8 @@ static int gmc_v11_0_sw_init(void *handle)

        adev->mmhub.funcs->init(adev);

+       adev->gfxhub.funcs->init(adev);
+
        spin_lock_init(&adev->gmc.invalidate_lock);

        r = amdgpu_atomfirmware_get_vram_info(adev,
--
2.37.3



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
  2023-09-25 13:59     ` Zhang, Yifan
@ 2023-09-25 18:08       ` Alex Deucher
  0 siblings, 0 replies; 7+ messages in thread
From: Alex Deucher @ 2023-09-25 18:08 UTC (permalink / raw)
  To: Zhang, Yifan
  Cc: Deucher, Alexander, Koenig, Christian, amd-gfx, Zhang, Hawking

On Mon, Sep 25, 2023 at 10:16 AM Zhang, Yifan <Yifan1.Zhang@amd.com> wrote:
>
> [AMD Official Use Only - General]
>
> Hi Hawking,
>
> No, no gfx HW programming before gfx power on, gmc_v11_0_flush_gpu_tlb returns before real tbl flush happens.  The dependence is a pure SW one as below:
>
> static void gmc_v11_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>                      uint32_t vmhub, uint32_t flush_type)
>  {
>      bool use_semaphore = gmc_v11_0_use_invalidate_semaphore(adev, vmhub);
>      struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>      u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type);  ---> get_invalidate_req  is NULL if gfxhub.funcs->init not called.
> ......
>      if ((vmhub == AMDGPU_GFXHUB(0)) && !adev->gfx.is_poweron)
>          return;
> }
>
> We can either move vmhub == AMDGPU_GFXHUB(0) before get_invalidate_req or move gfxhub.funcs->init to gmc sw_init as the patch does.  To align w/ other generations, I choose the later approach.
> Driver loading failure was observed on GC 11.0.4, but it should be universal to all gfx11 ASICs.

This should be fine.  the hub->init() functions don't touch hardware.
Maybe we could even rename them sw_init().  They only need to be
called once at driver load time.

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

Alex


>
> BRs,
> Yifan.
>
>
>
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@amd.com>
> Sent: Monday, September 25, 2023 7:33 PM
> To: Zhang, Hawking <Hawking.Zhang@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
>
> [AMD Official Use Only - General]
>
> For sw_init, it's fine to move to gmc sw_init phase, but any hardware programming should be done after smu hw_init.
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Zhang, Hawking
> Sent: Monday, September 25, 2023 19:30
> To: Zhang, Yifan <Yifan1.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
>
> [AMD Official Use Only - General]
>
> [AMD Official Use Only - General]
>
> RE - gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to gmc11 sw_init
>
> isn't any driver loading failure observed? Since from gfx11, GFX won't be powered on until smu hw_init phase. Any programming that touch gfx core is invalid before it is powered on.
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Yifan Zhang
> Sent: Saturday, September 23, 2023 19:17
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Yifan <Yifan1.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init
>
> gmc11 hw_init depends on adev->gfxhub.funcs->init now, move it to gmc11 sw_init. This patch fixes below driver load error on GC 11.0.4:
>
> [   43.928149] [drm] amdgpu: 512M of VRAM memory ready
> [   43.928151] [drm] amdgpu: 7641M of GTT memory ready.
> [   43.928168] [drm] GART: num cpu pages 131072, num gpu pages 131072
> [   43.928182] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [   43.928185] #PF: supervisor read access in kernel mode
> [   43.928187] #PF: error_code(0x0000) - not-present page
> [   43.928188] PGD 0 P4D 0
> [   43.928191] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [   43.928193] CPU: 7 PID: 1278 Comm: modprobe Tainted: G           OE      6.3.7+ #73
> [   43.928198] RIP: 0010:gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
> [   43.928406] Code: 8b 4d d4 48 89 4d c8 83 f9 0d 0f 87 7c 04 00 00 48 8b 45 c8 44 89 ee 44 89 e7 48 8d 04 80 48 c1 e0 04 49 8b 84 07 90 eb 00 00 <48> 8b 40 08 ff d0 0f 1f 00 44 8b 45 d4 41 89 c5 45 85 c0 0f 85 c0
> [   43.928409] RSP: 0018:ffffae24c28277f0 EFLAGS: 00010256
> [   43.928411] RAX: 0000000000000000 RBX: ffff95bf0ed80000 RCX: 0000000000000000
> [   43.928413] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> [   43.928414] RBP: ffffae24c2827848 R08: 0003000000000077 R09: 0000000000000401
> [   43.928416] R10: 0000000000001000 R11: 0000000000001000 R12: 0000000000000000
> [   43.928417] R13: 0000000000000000 R14: 0000000000000000 R15: ffff95bf0ed80000
> [   43.928418] FS:  00007f2703c83000(0000) GS:ffff95c26e7c0000(0000) knlGS:0000000000000000
> [   43.928421] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   43.928422] CR2: 0000000000000008 CR3: 000000012c300000 CR4: 0000000000750ee0
> [   43.928424] PKRU: 55555554
> [   43.928425] Call Trace:
> [   43.928426]  <TASK>
> [   43.928428]  ? show_regs+0x6e/0x80
> [   43.928434]  ? __die+0x29/0x70
> [   43.928436]  ? page_fault_oops+0x154/0x4a0
> [   43.928440]  ? amdgpu_bo_move+0x25b/0x790 [amdgpu]
> [   43.928612]  ? do_user_addr_fault+0x325/0x710
> [   43.928615]  ? exc_page_fault+0x7a/0x180
> [   43.928619]  ? asm_exc_page_fault+0x2b/0x30
> [   43.928623]  ? gmc_v11_0_flush_gpu_tlb+0x59/0x4d0 [amdgpu]
> [   43.928797]  ? amdgpu_gart_bind+0x64/0xc0 [amdgpu]
> [   43.928974]  amdgpu_gmc_flush_gpu_tlb+0x258/0x2e0 [amdgpu]
> [   43.929159]  amdgpu_gart_invalidate_tlb+0x66/0xa0 [amdgpu]
> [   43.929332]  amdgpu_gtt_mgr_recover+0x67/0x80 [amdgpu]
> [   43.929504]  gmc_v11_0_hw_init+0x4c/0x130 [amdgpu]
> [   43.929690]  amdgpu_device_init+0x1766/0x2d60 [amdgpu]
> [   43.929858]  ? pci_bus_read_config_word+0x4b/0x80
> [   43.929862]  ? do_pci_enable_device+0xdb/0x110
> [   43.929866]  amdgpu_driver_load_kms+0x1e/0x1a0 [amdgpu]
> [   43.930032]  amdgpu_pci_probe+0x16b/0x4e0 [amdgpu]
> [   43.930201]  local_pci_probe+0x4c/0xb0
> [   43.930204]  pci_device_probe+0xcc/0x280
> [   43.930207]  really_probe+0x1c4/0x430
> [   43.930210]  __driver_probe_device+0x8a/0x180
> [   43.930212]  driver_probe_device+0x23/0xc0
> [   43.930214]  __driver_attach+0xfb/0x200
> [   43.930215]  ? __pfx___driver_attach+0x10/0x10
> [   43.930217]  bus_for_each_dev+0x7e/0xd0
> [   43.930221]  driver_attach+0x22/0x30
> [   43.930222]  bus_add_driver+0x120/0x230
> [   43.930224]  driver_register+0x68/0x130
> [   43.930226]  ? __pfx_init_module+0x10/0x10 [amdgpu]
> [   43.930394]  __pci_register_driver+0x6c/0x80
> [   43.930396]  amdgpu_init+0x67/0xff0 [amdgpu]
> [   43.930549]  do_one_initcall+0x4a/0x240
> [   43.930554]  ? kmalloc_trace+0x2e/0xa0
> [   43.930559]  do_init_module+0x52/0x230
> [   43.930563]  load_module+0x2386/0x2880
> [   43.930566]  ? security_kernel_post_read_file+0x60/0x70
> [   43.930571]  __do_sys_finit_module+0xc8/0x140
> [   43.930573]  ? __do_sys_finit_module+0xc8/0x140
> [   43.930576]  __x64_sys_finit_module+0x1c/0x30
> [   43.930578]  do_syscall_64+0x5d/0x90
> [   43.930581]  ? syscall_exit_to_user_mode+0x2a/0x50
> [   43.930583]  ? __x64_sys_mmap+0x37/0x50
> [   43.930586]  ? do_syscall_64+0x6d/0x90
> [   43.930588]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
> Fixes: c6f52f1ffd91 (drm/amdgpu: cleanup gmc_v11_0_flush_gpu_tlb)
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 --  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 ++
>  2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 42fc0cc13fdd..9c4562bda8cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1329,8 +1329,6 @@ static int gfx_v11_0_sw_init(void *handle)
>         struct amdgpu_kiq *kiq;
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
> -       adev->gfxhub.funcs->init(adev);
> -
>         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>         case IP_VERSION(11, 0, 0):
>         case IP_VERSION(11, 0, 2):
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 7d61f66625fa..d0e3583a3cac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -720,6 +720,8 @@ static int gmc_v11_0_sw_init(void *handle)
>
>         adev->mmhub.funcs->init(adev);
>
> +       adev->gfxhub.funcs->init(adev);
> +
>         spin_lock_init(&adev->gmc.invalidate_lock);
>
>         r = amdgpu_atomfirmware_get_vram_info(adev,
> --
> 2.37.3
>
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-09-25 18:09 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-23 11:16 [PATCH] drm/amdgpu: move gfxhub.funcs->init to gmc sw_init Yifan Zhang
2023-09-25  6:45 ` Christian König
2023-09-25  8:45   ` Zhang, Yifan
2023-09-25 11:30 ` Zhang, Hawking
2023-09-25 11:32   ` Zhang, Hawking
2023-09-25 13:59     ` Zhang, Yifan
2023-09-25 18:08       ` Alex Deucher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.