All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Fix a redundant kfree
@ 2020-09-01  7:45 Pan, Xinhui
  2020-09-01 14:12 ` Alex Deucher
  0 siblings, 1 reply; 7+ messages in thread
From: Pan, Xinhui @ 2020-09-01  7:45 UTC (permalink / raw)
  To: amd-gfx; +Cc: Deucher, Alexander, Tuikov, Luben

[AMD Official Use Only - Internal Distribution Only]

drm_dev_alloc() alloc *dev* and set managed.final_kfree to dev to free
itself.
Now from commit 5cdd68498918("drm/amdgpu: Embed drm_device into
amdgpu_device (v3)") we alloc *adev* and ddev is just a member of it.
So drm_dev_release try to free a wrong pointer then.

Also driver's release trys to free adev, but drm_dev_release will
access dev after call drvier's release.

To fix it, remove driver's release and set managed.final_kfree to adev.

[   36.269348] BUG: unable to handle page fault for address: ffffa0c279940028
[   36.276841] #PF: supervisor read access in kernel mode
[   36.282434] #PF: error_code(0x0000) - not-present page
[   36.288053] PGD 676601067 P4D 676601067 PUD 86a414067 PMD 86a247067 PTE 800ffff8066bf060
[   36.296868] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC NOPTI
[   36.302409] CPU: 4 PID: 1375 Comm: bash Tainted: G           O      5.9.0-rc2+ #46
[   36.310670] Hardware name: System manufacturer System Product Name/PRIME Z390-A, BIOS 1401 11/26/2019
[   36.320725] RIP: 0010:drm_managed_release+0x25/0x110 [drm]
[   36.326741] Code: 80 00 00 00 00 0f 1f 44 00 00 55 48 c7 c2 5a 9f 41 c0 be 00 02 00 00 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 ec 08 <48> 8b 7f 18 e8 c2 10 ff ff 4d 8b 74 24 20 49 8d 44 24 5
[   36.347217] RSP: 0018:ffffb9424141fce0 EFLAGS: 00010282
[   36.352931] RAX: 0000000000000006 RBX: ffffa0c279940010 RCX: 0000000000000006
[   36.360718] RDX: ffffffffc0419f5a RSI: 0000000000000200 RDI: ffffa0c279940010
[   36.368503] RBP: ffffb9424141fd10 R08: 0000000000000001 R09: 0000000000000001
[   36.376304] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0c279940010
[   36.384070] R13: ffffffffc0e2a000 R14: ffffa0c26924e220 R15: fffffffffffffff2
[   36.391845] FS:  00007fc4a277b740(0000) GS:ffffa0c288e00000(0000) knlGS:0000000000000000
[   36.400669] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   36.406937] CR2: ffffa0c279940028 CR3: 0000000792304006 CR4: 00000000003706e0
[   36.414732] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   36.422550] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   36.430354] Call Trace:
[   36.433044]  drm_dev_put.part.0+0x40/0x60 [drm]
[   36.438017]  drm_dev_put+0x13/0x20 [drm]
[   36.442398]  amdgpu_pci_remove+0x56/0x60 [amdgpu]
[   36.447528]  pci_device_remove+0x3e/0xb0
[   36.451807]  device_release_driver_internal+0xff/0x1d0
[   36.457416]  device_release_driver+0x12/0x20
[   36.462094]  pci_stop_bus_device+0x70/0xa0
[   36.466588]  pci_stop_and_remove_bus_device_locked+0x1b/0x30
[   36.472786]  remove_store+0x7b/0x90
[   36.476614]  dev_attr_store+0x17/0x30
[   36.480646]  sysfs_kf_write+0x4b/0x60
[   36.484655]  kernfs_fop_write+0xe8/0x1d0
[   36.488952]  vfs_write+0xf5/0x230
[   36.492562]  ksys_write+0x70/0xf0
[   36.496206]  __x64_sys_write+0x1a/0x20
[   36.500292]  do_syscall_64+0x38/0x90
[   36.504219]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c12e9acd421d..52fc0c6c8538 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1165,7 +1165,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
 if (ret)
 goto err_free;

-drmm_add_final_kfree(ddev, ddev);
+drmm_add_final_kfree(ddev, adev);

 if (!supports_atomic)
 ddev->driver_features &= ~DRIVER_ATOMIC;
@@ -1221,13 +1221,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
 drm_dev_put(dev);
 }

-static void amdgpu_driver_release(struct drm_device *ddev)
-{
-struct amdgpu_device *adev = drm_to_adev(ddev);
-
-kfree(adev);
-}
-
 static void
 amdgpu_pci_shutdown(struct pci_dev *pdev)
 {
@@ -1522,7 +1515,6 @@ static struct drm_driver kms_driver = {
 .open = amdgpu_driver_open_kms,
 .postclose = amdgpu_driver_postclose_kms,
 .lastclose = amdgpu_driver_lastclose_kms,
-.release   = amdgpu_driver_release,
 .irq_handler = amdgpu_irq_handler,
 .ioctls = amdgpu_ioctls_kms,
 .gem_free_object_unlocked = amdgpu_gem_object_free,
--
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix a redundant kfree
  2020-09-01  7:45 [PATCH] drm/amdgpu: Fix a redundant kfree Pan, Xinhui
@ 2020-09-01 14:12 ` Alex Deucher
  2020-09-01 20:35   ` Luben Tuikov
  0 siblings, 1 reply; 7+ messages in thread
From: Alex Deucher @ 2020-09-01 14:12 UTC (permalink / raw)
  To: Pan, Xinhui; +Cc: Deucher, Alexander, Tuikov, Luben, amd-gfx

On Tue, Sep 1, 2020 at 3:46 AM Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> drm_dev_alloc() alloc *dev* and set managed.final_kfree to dev to free
> itself.
> Now from commit 5cdd68498918("drm/amdgpu: Embed drm_device into
> amdgpu_device (v3)") we alloc *adev* and ddev is just a member of it.
> So drm_dev_release try to free a wrong pointer then.
>
> Also driver's release trys to free adev, but drm_dev_release will
> access dev after call drvier's release.
>
> To fix it, remove driver's release and set managed.final_kfree to adev.

I've got to admit, the documentation around drm_dev_init is hard to
follow.  We aren't really using the drmm stuff, but you have to use
drmm_add_final_kfree to avoid a warning.  The logic seems to make
sense though.
Acked-by: Alex Deucher <alexancer.deucher@amd.com>

>
> [   36.269348] BUG: unable to handle page fault for address: ffffa0c279940028
> [   36.276841] #PF: supervisor read access in kernel mode
> [   36.282434] #PF: error_code(0x0000) - not-present page
> [   36.288053] PGD 676601067 P4D 676601067 PUD 86a414067 PMD 86a247067 PTE 800ffff8066bf060
> [   36.296868] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC NOPTI
> [   36.302409] CPU: 4 PID: 1375 Comm: bash Tainted: G           O      5.9.0-rc2+ #46
> [   36.310670] Hardware name: System manufacturer System Product Name/PRIME Z390-A, BIOS 1401 11/26/2019
> [   36.320725] RIP: 0010:drm_managed_release+0x25/0x110 [drm]
> [   36.326741] Code: 80 00 00 00 00 0f 1f 44 00 00 55 48 c7 c2 5a 9f 41 c0 be 00 02 00 00 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 ec 08 <48> 8b 7f 18 e8 c2 10 ff ff 4d 8b 74 24 20 49 8d 44 24 5
> [   36.347217] RSP: 0018:ffffb9424141fce0 EFLAGS: 00010282
> [   36.352931] RAX: 0000000000000006 RBX: ffffa0c279940010 RCX: 0000000000000006
> [   36.360718] RDX: ffffffffc0419f5a RSI: 0000000000000200 RDI: ffffa0c279940010
> [   36.368503] RBP: ffffb9424141fd10 R08: 0000000000000001 R09: 0000000000000001
> [   36.376304] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0c279940010
> [   36.384070] R13: ffffffffc0e2a000 R14: ffffa0c26924e220 R15: fffffffffffffff2
> [   36.391845] FS:  00007fc4a277b740(0000) GS:ffffa0c288e00000(0000) knlGS:0000000000000000
> [   36.400669] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   36.406937] CR2: ffffa0c279940028 CR3: 0000000792304006 CR4: 00000000003706e0
> [   36.414732] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   36.422550] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [   36.430354] Call Trace:
> [   36.433044]  drm_dev_put.part.0+0x40/0x60 [drm]
> [   36.438017]  drm_dev_put+0x13/0x20 [drm]
> [   36.442398]  amdgpu_pci_remove+0x56/0x60 [amdgpu]
> [   36.447528]  pci_device_remove+0x3e/0xb0
> [   36.451807]  device_release_driver_internal+0xff/0x1d0
> [   36.457416]  device_release_driver+0x12/0x20
> [   36.462094]  pci_stop_bus_device+0x70/0xa0
> [   36.466588]  pci_stop_and_remove_bus_device_locked+0x1b/0x30
> [   36.472786]  remove_store+0x7b/0x90
> [   36.476614]  dev_attr_store+0x17/0x30
> [   36.480646]  sysfs_kf_write+0x4b/0x60
> [   36.484655]  kernfs_fop_write+0xe8/0x1d0
> [   36.488952]  vfs_write+0xf5/0x230
> [   36.492562]  ksys_write+0x70/0xf0
> [   36.496206]  __x64_sys_write+0x1a/0x20
> [   36.500292]  do_syscall_64+0x38/0x90
> [   36.504219]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +---------
>  1 file changed, 1 insertion(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index c12e9acd421d..52fc0c6c8538 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -1165,7 +1165,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
>  if (ret)
>  goto err_free;
>
> -drmm_add_final_kfree(ddev, ddev);
> +drmm_add_final_kfree(ddev, adev);
>
>  if (!supports_atomic)
>  ddev->driver_features &= ~DRIVER_ATOMIC;
> @@ -1221,13 +1221,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
>  drm_dev_put(dev);
>  }
>
> -static void amdgpu_driver_release(struct drm_device *ddev)
> -{
> -struct amdgpu_device *adev = drm_to_adev(ddev);
> -
> -kfree(adev);
> -}
> -
>  static void
>  amdgpu_pci_shutdown(struct pci_dev *pdev)
>  {
> @@ -1522,7 +1515,6 @@ static struct drm_driver kms_driver = {
>  .open = amdgpu_driver_open_kms,
>  .postclose = amdgpu_driver_postclose_kms,
>  .lastclose = amdgpu_driver_lastclose_kms,
> -.release   = amdgpu_driver_release,
>  .irq_handler = amdgpu_irq_handler,
>  .ioctls = amdgpu_ioctls_kms,
>  .gem_free_object_unlocked = amdgpu_gem_object_free,
> --
> 2.25.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix a redundant kfree
  2020-09-01 14:12 ` Alex Deucher
@ 2020-09-01 20:35   ` Luben Tuikov
  2020-09-01 21:58     ` Pan, Xinhui
  2020-09-02 11:27     ` Daniel Vetter
  0 siblings, 2 replies; 7+ messages in thread
From: Luben Tuikov @ 2020-09-01 20:35 UTC (permalink / raw)
  To: Alex Deucher, Pan, Xinhui, Daniel Vetter; +Cc: Deucher, Alexander, amd-gfx

On 2020-09-01 10:12 a.m., Alex Deucher wrote:
> On Tue, Sep 1, 2020 at 3:46 AM Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>>
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> drm_dev_alloc() alloc *dev* and set managed.final_kfree to dev to free
>> itself.
>> Now from commit 5cdd68498918("drm/amdgpu: Embed drm_device into
>> amdgpu_device (v3)") we alloc *adev* and ddev is just a member of it.
>> So drm_dev_release try to free a wrong pointer then.
>>
>> Also driver's release trys to free adev, but drm_dev_release will
>> access dev after call drvier's release.
>>
>> To fix it, remove driver's release and set managed.final_kfree to adev.
> 
> I've got to admit, the documentation around drm_dev_init is hard to
> follow.  We aren't really using the drmm stuff, but you have to use
> drmm_add_final_kfree to avoid a warning.  The logic seems to make
> sense though.
> Acked-by: Alex Deucher <alexancer.deucher@amd.com>

The logic in patch 3/3 uses the kref infrastructure
as described in drm_drv.c's comment on what the DRM
usage is, i.e. taking advantage of the kref infrastructure.

In amdgpu_pci_probe() we call drm_dev_init() which takes
a ref of 1 on the kref in the DRM device structure,
and from then on, only when the kref transitions
from non-zero to 0, do we free the container of
DRM device, and this is beautifully shown in the
kernel stack below (please take a look at the kernel
stack below).

Using a kref is very powerful as it is implicit:
when the kref transitions from non-zero to 0,
then call the release method.

Furthermore, we own the release method, and we
like that, as it is pure, as well as,
there may be more things we'd like to do in the future
before we free the amdgpu device: maybe free memory we're
using specifically for that PCI device, maybe write
some registers, maybe notify someone or something, etc.

On another note, adding "drmm_add_final_kfree()" in the middle
of amdgpu_pci_probe() seems hackish--it's neither part
of drm_dev_init() nor of drm_dev_register(). We really
don't need it, since we rely on the kref infrastructure
to tell us when to free the device, and if you'd look
at the beautiful stack below, it knows exactly when that is,
i.e. when to free it.

The correct thing to do this is to
_leave the amdgpu_driver_release()_ alone,
remove "drmm_add_final_kfree()" and qualify
the WARN_ON() in drm_dev_register() by
the existence of drm_driver.release() (i.e. non-NULL).

I'll submit a sequence of patches to fix this right.

Regards,
Luben

> 
>>
>> [   36.269348] BUG: unable to handle page fault for address: ffffa0c279940028
>> [   36.276841] #PF: supervisor read access in kernel mode
>> [   36.282434] #PF: error_code(0x0000) - not-present page
>> [   36.288053] PGD 676601067 P4D 676601067 PUD 86a414067 PMD 86a247067 PTE 800ffff8066bf060
>> [   36.296868] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC NOPTI
>> [   36.302409] CPU: 4 PID: 1375 Comm: bash Tainted: G           O      5.9.0-rc2+ #46
>> [   36.310670] Hardware name: System manufacturer System Product Name/PRIME Z390-A, BIOS 1401 11/26/2019
>> [   36.320725] RIP: 0010:drm_managed_release+0x25/0x110 [drm]
>> [   36.326741] Code: 80 00 00 00 00 0f 1f 44 00 00 55 48 c7 c2 5a 9f 41 c0 be 00 02 00 00 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 ec 08 <48> 8b 7f 18 e8 c2 10 ff ff 4d 8b 74 24 20 49 8d 44 24 5
>> [   36.347217] RSP: 0018:ffffb9424141fce0 EFLAGS: 00010282
>> [   36.352931] RAX: 0000000000000006 RBX: ffffa0c279940010 RCX: 0000000000000006
>> [   36.360718] RDX: ffffffffc0419f5a RSI: 0000000000000200 RDI: ffffa0c279940010
>> [   36.368503] RBP: ffffb9424141fd10 R08: 0000000000000001 R09: 0000000000000001
>> [   36.376304] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0c279940010
>> [   36.384070] R13: ffffffffc0e2a000 R14: ffffa0c26924e220 R15: fffffffffffffff2
>> [   36.391845] FS:  00007fc4a277b740(0000) GS:ffffa0c288e00000(0000) knlGS:0000000000000000
>> [   36.400669] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [   36.406937] CR2: ffffa0c279940028 CR3: 0000000792304006 CR4: 00000000003706e0
>> [   36.414732] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> [   36.422550] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> [   36.430354] Call Trace:
>> [   36.433044]  drm_dev_put.part.0+0x40/0x60 [drm]
>> [   36.438017]  drm_dev_put+0x13/0x20 [drm]
>> [   36.442398]  amdgpu_pci_remove+0x56/0x60 [amdgpu]
>> [   36.447528]  pci_device_remove+0x3e/0xb0
>> [   36.451807]  device_release_driver_internal+0xff/0x1d0
>> [   36.457416]  device_release_driver+0x12/0x20
>> [   36.462094]  pci_stop_bus_device+0x70/0xa0
>> [   36.466588]  pci_stop_and_remove_bus_device_locked+0x1b/0x30
>> [   36.472786]  remove_store+0x7b/0x90
>> [   36.476614]  dev_attr_store+0x17/0x30
>> [   36.480646]  sysfs_kf_write+0x4b/0x60
>> [   36.484655]  kernfs_fop_write+0xe8/0x1d0
>> [   36.488952]  vfs_write+0xf5/0x230
>> [   36.492562]  ksys_write+0x70/0xf0
>> [   36.496206]  __x64_sys_write+0x1a/0x20
>> [   36.500292]  do_syscall_64+0x38/0x90
>> [   36.504219]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +---------
>>  1 file changed, 1 insertion(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index c12e9acd421d..52fc0c6c8538 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -1165,7 +1165,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
>>  if (ret)
>>  goto err_free;
>>
>> -drmm_add_final_kfree(ddev, ddev);
>> +drmm_add_final_kfree(ddev, adev);
>>
>>  if (!supports_atomic)
>>  ddev->driver_features &= ~DRIVER_ATOMIC;
>> @@ -1221,13 +1221,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
>>  drm_dev_put(dev);
>>  }
>>
>> -static void amdgpu_driver_release(struct drm_device *ddev)
>> -{
>> -struct amdgpu_device *adev = drm_to_adev(ddev);
>> -
>> -kfree(adev);
>> -}
>> -
>>  static void
>>  amdgpu_pci_shutdown(struct pci_dev *pdev)
>>  {
>> @@ -1522,7 +1515,6 @@ static struct drm_driver kms_driver = {
>>  .open = amdgpu_driver_open_kms,
>>  .postclose = amdgpu_driver_postclose_kms,
>>  .lastclose = amdgpu_driver_lastclose_kms,
>> -.release   = amdgpu_driver_release,
>>  .irq_handler = amdgpu_irq_handler,
>>  .ioctls = amdgpu_ioctls_kms,
>>  .gem_free_object_unlocked = amdgpu_gem_object_free,
>> --
>> 2.25.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CLuben.Tuikov%40amd.com%7C5233b0caaac449417a7208d84e810de0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637345663485193534&amp;sdata=A%2FzJpoQmUhGBGts5mQfp%2BlVu1FL5NVbe5qfRelb2Og8%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix a redundant kfree
  2020-09-01 20:35   ` Luben Tuikov
@ 2020-09-01 21:58     ` Pan, Xinhui
  2020-09-02  3:47       ` Luben Tuikov
  2020-09-02 11:27     ` Daniel Vetter
  1 sibling, 1 reply; 7+ messages in thread
From: Pan, Xinhui @ 2020-09-01 21:58 UTC (permalink / raw)
  To: Alex Deucher, Daniel Vetter, Tuikov, Luben; +Cc: Deucher, Alexander, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 8218 bytes --]

[AMD Official Use Only - Internal Distribution Only]


The correct thing to do this is to
_leave the amdgpu_driver_release()_ alone,
remove "drmm_add_final_kfree()" and qualify
the WARN_ON() in drm_dev_register() by
the existence of drm_driver.release() (i.e. non-NULL).

Re:  this drm driver release callback is more like a release notify. It is called in the beginning of the total release sequence. As you have made drm device a member of adev. So you can not free adev in the driver release callback.

Maybe change the sequence of release,  say, put drm driver release in the end of total release sequence.
Or still use the final_kfree to free adev and our release callback just do some other cleanup work.
________________________________
From: Tuikov, Luben <Luben.Tuikov@amd.com>
Sent: Wednesday, September 2, 2020 4:35:32 AM
To: Alex Deucher <alexdeucher@gmail.com>; Pan, Xinhui <Xinhui.Pan@amd.com>; Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Fix a redundant kfree

On 2020-09-01 10:12 a.m., Alex Deucher wrote:
> On Tue, Sep 1, 2020 at 3:46 AM Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>>
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> drm_dev_alloc() alloc *dev* and set managed.final_kfree to dev to free
>> itself.
>> Now from commit 5cdd68498918("drm/amdgpu: Embed drm_device into
>> amdgpu_device (v3)") we alloc *adev* and ddev is just a member of it.
>> So drm_dev_release try to free a wrong pointer then.
>>
>> Also driver's release trys to free adev, but drm_dev_release will
>> access dev after call drvier's release.
>>
>> To fix it, remove driver's release and set managed.final_kfree to adev.
>
> I've got to admit, the documentation around drm_dev_init is hard to
> follow.  We aren't really using the drmm stuff, but you have to use
> drmm_add_final_kfree to avoid a warning.  The logic seems to make
> sense though.
> Acked-by: Alex Deucher <alexancer.deucher@amd.com>

The logic in patch 3/3 uses the kref infrastructure
as described in drm_drv.c's comment on what the DRM
usage is, i.e. taking advantage of the kref infrastructure.

In amdgpu_pci_probe() we call drm_dev_init() which takes
a ref of 1 on the kref in the DRM device structure,
and from then on, only when the kref transitions
from non-zero to 0, do we free the container of
DRM device, and this is beautifully shown in the
kernel stack below (please take a look at the kernel
stack below).

Using a kref is very powerful as it is implicit:
when the kref transitions from non-zero to 0,
then call the release method.

Furthermore, we own the release method, and we
like that, as it is pure, as well as,
there may be more things we'd like to do in the future
before we free the amdgpu device: maybe free memory we're
using specifically for that PCI device, maybe write
some registers, maybe notify someone or something, etc.

On another note, adding "drmm_add_final_kfree()" in the middle
of amdgpu_pci_probe() seems hackish--it's neither part
of drm_dev_init() nor of drm_dev_register(). We really
don't need it, since we rely on the kref infrastructure
to tell us when to free the device, and if you'd look
at the beautiful stack below, it knows exactly when that is,
i.e. when to free it.

The correct thing to do this is to
_leave the amdgpu_driver_release()_ alone,
remove "drmm_add_final_kfree()" and qualify
the WARN_ON() in drm_dev_register() by
the existence of drm_driver.release() (i.e. non-NULL).

I'll submit a sequence of patches to fix this right.

Regards,
Luben

>
>>
>> [   36.269348] BUG: unable to handle page fault for address: ffffa0c279940028
>> [   36.276841] #PF: supervisor read access in kernel mode
>> [   36.282434] #PF: error_code(0x0000) - not-present page
>> [   36.288053] PGD 676601067 P4D 676601067 PUD 86a414067 PMD 86a247067 PTE 800ffff8066bf060
>> [   36.296868] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC NOPTI
>> [   36.302409] CPU: 4 PID: 1375 Comm: bash Tainted: G           O      5.9.0-rc2+ #46
>> [   36.310670] Hardware name: System manufacturer System Product Name/PRIME Z390-A, BIOS 1401 11/26/2019
>> [   36.320725] RIP: 0010:drm_managed_release+0x25/0x110 [drm]
>> [   36.326741] Code: 80 00 00 00 00 0f 1f 44 00 00 55 48 c7 c2 5a 9f 41 c0 be 00 02 00 00 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 ec 08 <48> 8b 7f 18 e8 c2 10 ff ff 4d 8b 74 24 20 49 8d 44 24 5
>> [   36.347217] RSP: 0018:ffffb9424141fce0 EFLAGS: 00010282
>> [   36.352931] RAX: 0000000000000006 RBX: ffffa0c279940010 RCX: 0000000000000006
>> [   36.360718] RDX: ffffffffc0419f5a RSI: 0000000000000200 RDI: ffffa0c279940010
>> [   36.368503] RBP: ffffb9424141fd10 R08: 0000000000000001 R09: 0000000000000001
>> [   36.376304] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0c279940010
>> [   36.384070] R13: ffffffffc0e2a000 R14: ffffa0c26924e220 R15: fffffffffffffff2
>> [   36.391845] FS:  00007fc4a277b740(0000) GS:ffffa0c288e00000(0000) knlGS:0000000000000000
>> [   36.400669] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [   36.406937] CR2: ffffa0c279940028 CR3: 0000000792304006 CR4: 00000000003706e0
>> [   36.414732] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> [   36.422550] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> [   36.430354] Call Trace:
>> [   36.433044]  drm_dev_put.part.0+0x40/0x60 [drm]
>> [   36.438017]  drm_dev_put+0x13/0x20 [drm]
>> [   36.442398]  amdgpu_pci_remove+0x56/0x60 [amdgpu]
>> [   36.447528]  pci_device_remove+0x3e/0xb0
>> [   36.451807]  device_release_driver_internal+0xff/0x1d0
>> [   36.457416]  device_release_driver+0x12/0x20
>> [   36.462094]  pci_stop_bus_device+0x70/0xa0
>> [   36.466588]  pci_stop_and_remove_bus_device_locked+0x1b/0x30
>> [   36.472786]  remove_store+0x7b/0x90
>> [   36.476614]  dev_attr_store+0x17/0x30
>> [   36.480646]  sysfs_kf_write+0x4b/0x60
>> [   36.484655]  kernfs_fop_write+0xe8/0x1d0
>> [   36.488952]  vfs_write+0xf5/0x230
>> [   36.492562]  ksys_write+0x70/0xf0
>> [   36.496206]  __x64_sys_write+0x1a/0x20
>> [   36.500292]  do_syscall_64+0x38/0x90
>> [   36.504219]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +---------
>>  1 file changed, 1 insertion(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index c12e9acd421d..52fc0c6c8538 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -1165,7 +1165,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
>>  if (ret)
>>  goto err_free;
>>
>> -drmm_add_final_kfree(ddev, ddev);
>> +drmm_add_final_kfree(ddev, adev);
>>
>>  if (!supports_atomic)
>>  ddev->driver_features &= ~DRIVER_ATOMIC;
>> @@ -1221,13 +1221,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
>>  drm_dev_put(dev);
>>  }
>>
>> -static void amdgpu_driver_release(struct drm_device *ddev)
>> -{
>> -struct amdgpu_device *adev = drm_to_adev(ddev);
>> -
>> -kfree(adev);
>> -}
>> -
>>  static void
>>  amdgpu_pci_shutdown(struct pci_dev *pdev)
>>  {
>> @@ -1522,7 +1515,6 @@ static struct drm_driver kms_driver = {
>>  .open = amdgpu_driver_open_kms,
>>  .postclose = amdgpu_driver_postclose_kms,
>>  .lastclose = amdgpu_driver_lastclose_kms,
>> -.release   = amdgpu_driver_release,
>>  .irq_handler = amdgpu_irq_handler,
>>  .ioctls = amdgpu_ioctls_kms,
>>  .gem_free_object_unlocked = amdgpu_gem_object_free,
>> --
>> 2.25.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CLuben.Tuikov%40amd.com%7C5233b0caaac449417a7208d84e810de0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637345663485193534&amp;sdata=A%2FzJpoQmUhGBGts5mQfp%2BlVu1FL5NVbe5qfRelb2Og8%3D&amp;reserved=0


[-- Attachment #1.2: Type: text/html, Size: 12392 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix a redundant kfree
  2020-09-01 21:58     ` Pan, Xinhui
@ 2020-09-02  3:47       ` Luben Tuikov
  0 siblings, 0 replies; 7+ messages in thread
From: Luben Tuikov @ 2020-09-02  3:47 UTC (permalink / raw)
  To: Pan, Xinhui, Alex Deucher, Daniel Vetter; +Cc: Deucher, Alexander, amd-gfx

On 2020-09-01 5:58 p.m., Pan, Xinhui wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> 
> 
> The correct thing to do this is to
> _leave the amdgpu_driver_release()_ alone,
> remove "drmm_add_final_kfree()" and qualify
> the WARN_ON() in drm_dev_register() by
> the existence of drm_driver.release() (i.e. non-NULL).
> 
> Re:  this drm driver release callback is more like a release notify. It is called in the beginning of the total release sequence. As you have made drm device a member of adev. So you can not free adev in the driver release callback.
> 

No. The DRM driver release callback is not "more like release notify".
When the kref goes to 0, the release callback is called. The kref
infrastructure has been around in the kernel for about 20 years now.

I don't understand what a "total release sequence" is. Is there
a "partial release sequence"? The stack below shows an example
of a release. It's very simple, when the kref goes to 0,
the DRM driver's callback is called.

> Maybe change the sequence of release,  say, put drm driver release in the end of total release sequence.
> Or still use the final_kfree to free adev and our release callback just do some other cleanup work.

No. "final kfree" is a hack. What we want to use is the implicit
nature of the kref infrastructure. You don't really care to "free",
how and when it happens. You just forget about "free."
It will implicitly take place when the kref transitions from non-zero
to 0. It's implicit.

Another advantage of using DRM driver's "release" callback
on kref going to 0, is that we can do other interesting stuff
just before we free the container structure amdgpu_device.

And most of all, we don't really need to be concerned with
sequencing the free, how, when, etc., it naturally takes place
on a kref-put, when that transitions the kref to 0.

Regards,
Luben


> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> *From:* Tuikov, Luben <Luben.Tuikov@amd.com>
> *Sent:* Wednesday, September 2, 2020 4:35:32 AM
> *To:* Alex Deucher <alexdeucher@gmail.com>; Pan, Xinhui <Xinhui.Pan@amd.com>; Daniel Vetter <daniel.vetter@ffwll.ch>
> *Cc:* amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu: Fix a redundant kfree
>  
> On 2020-09-01 10:12 a.m., Alex Deucher wrote:
>> On Tue, Sep 1, 2020 at 3:46 AM Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>>>
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> drm_dev_alloc() alloc *dev* and set managed.final_kfree to dev to free
>>> itself.
>>> Now from commit 5cdd68498918("drm/amdgpu: Embed drm_device into
>>> amdgpu_device (v3)") we alloc *adev* and ddev is just a member of it.
>>> So drm_dev_release try to free a wrong pointer then.
>>>
>>> Also driver's release trys to free adev, but drm_dev_release will
>>> access dev after call drvier's release.
>>>
>>> To fix it, remove driver's release and set managed.final_kfree to adev.
>> 
>> I've got to admit, the documentation around drm_dev_init is hard to
>> follow.  We aren't really using the drmm stuff, but you have to use
>> drmm_add_final_kfree to avoid a warning.  The logic seems to make
>> sense though.
>> Acked-by: Alex Deucher <alexancer.deucher@amd.com>
> 
> The logic in patch 3/3 uses the kref infrastructure
> as described in drm_drv.c's comment on what the DRM
> usage is, i.e. taking advantage of the kref infrastructure.
> 
> In amdgpu_pci_probe() we call drm_dev_init() which takes
> a ref of 1 on the kref in the DRM device structure,
> and from then on, only when the kref transitions
> from non-zero to 0, do we free the container of
> DRM device, and this is beautifully shown in the
> kernel stack below (please take a look at the kernel
> stack below).
> 
> Using a kref is very powerful as it is implicit:
> when the kref transitions from non-zero to 0,
> then call the release method.
> 
> Furthermore, we own the release method, and we
> like that, as it is pure, as well as,
> there may be more things we'd like to do in the future
> before we free the amdgpu device: maybe free memory we're
> using specifically for that PCI device, maybe write
> some registers, maybe notify someone or something, etc.
> 
> On another note, adding "drmm_add_final_kfree()" in the middle
> of amdgpu_pci_probe() seems hackish--it's neither part
> of drm_dev_init() nor of drm_dev_register(). We really
> don't need it, since we rely on the kref infrastructure
> to tell us when to free the device, and if you'd look
> at the beautiful stack below, it knows exactly when that is,
> i.e. when to free it.
> 
> The correct thing to do this is to
> _leave the amdgpu_driver_release()_ alone,
> remove "drmm_add_final_kfree()" and qualify
> the WARN_ON() in drm_dev_register() by
> the existence of drm_driver.release() (i.e. non-NULL).
> 
> I'll submit a sequence of patches to fix this right.
> 
> Regards,
> Luben
> 
>> 
>>>
>>> [   36.269348] BUG: unable to handle page fault for address: ffffa0c279940028
>>> [   36.276841] #PF: supervisor read access in kernel mode
>>> [   36.282434] #PF: error_code(0x0000) - not-present page
>>> [   36.288053] PGD 676601067 P4D 676601067 PUD 86a414067 PMD 86a247067 PTE 800ffff8066bf060
>>> [   36.296868] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC NOPTI
>>> [   36.302409] CPU: 4 PID: 1375 Comm: bash Tainted: G           O      5.9.0-rc2+ #46
>>> [   36.310670] Hardware name: System manufacturer System Product Name/PRIME Z390-A, BIOS 1401 11/26/2019
>>> [   36.320725] RIP: 0010:drm_managed_release+0x25/0x110 [drm]
>>> [   36.326741] Code: 80 00 00 00 00 0f 1f 44 00 00 55 48 c7 c2 5a 9f 41 c0 be 00 02 00 00 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 ec 08 <48> 8b 7f 18 e8 c2 10 ff ff 4d 8b 74 24 20 49 8d 44 24 5
>>> [   36.347217] RSP: 0018:ffffb9424141fce0 EFLAGS: 00010282
>>> [   36.352931] RAX: 0000000000000006 RBX: ffffa0c279940010 RCX: 0000000000000006
>>> [   36.360718] RDX: ffffffffc0419f5a RSI: 0000000000000200 RDI: ffffa0c279940010
>>> [   36.368503] RBP: ffffb9424141fd10 R08: 0000000000000001 R09: 0000000000000001
>>> [   36.376304] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0c279940010
>>> [   36.384070] R13: ffffffffc0e2a000 R14: ffffa0c26924e220 R15: fffffffffffffff2
>>> [   36.391845] FS:  00007fc4a277b740(0000) GS:ffffa0c288e00000(0000) knlGS:0000000000000000
>>> [   36.400669] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [   36.406937] CR2: ffffa0c279940028 CR3: 0000000792304006 CR4: 00000000003706e0
>>> [   36.414732] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>> [   36.422550] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>> [   36.430354] Call Trace:
>>> [   36.433044]  drm_dev_put.part.0+0x40/0x60 [drm]
>>> [   36.438017]  drm_dev_put+0x13/0x20 [drm]
>>> [   36.442398]  amdgpu_pci_remove+0x56/0x60 [amdgpu]
>>> [   36.447528]  pci_device_remove+0x3e/0xb0
>>> [   36.451807]  device_release_driver_internal+0xff/0x1d0
>>> [   36.457416]  device_release_driver+0x12/0x20
>>> [   36.462094]  pci_stop_bus_device+0x70/0xa0
>>> [   36.466588]  pci_stop_and_remove_bus_device_locked+0x1b/0x30
>>> [   36.472786]  remove_store+0x7b/0x90
>>> [   36.476614]  dev_attr_store+0x17/0x30
>>> [   36.480646]  sysfs_kf_write+0x4b/0x60
>>> [   36.484655]  kernfs_fop_write+0xe8/0x1d0
>>> [   36.488952]  vfs_write+0xf5/0x230
>>> [   36.492562]  ksys_write+0x70/0xf0
>>> [   36.496206]  __x64_sys_write+0x1a/0x20
>>> [   36.500292]  do_syscall_64+0x38/0x90
>>> [   36.504219]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +---------
>>>  1 file changed, 1 insertion(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index c12e9acd421d..52fc0c6c8538 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -1165,7 +1165,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
>>>  if (ret)
>>>  goto err_free;
>>>
>>> -drmm_add_final_kfree(ddev, ddev);
>>> +drmm_add_final_kfree(ddev, adev);
>>>
>>>  if (!supports_atomic)
>>>  ddev->driver_features &= ~DRIVER_ATOMIC;
>>> @@ -1221,13 +1221,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
>>>  drm_dev_put(dev);
>>>  }
>>>
>>> -static void amdgpu_driver_release(struct drm_device *ddev)
>>> -{
>>> -struct amdgpu_device *adev = drm_to_adev(ddev);
>>> -
>>> -kfree(adev);
>>> -}
>>> -
>>>  static void
>>>  amdgpu_pci_shutdown(struct pci_dev *pdev)
>>>  {
>>> @@ -1522,7 +1515,6 @@ static struct drm_driver kms_driver = {
>>>  .open = amdgpu_driver_open_kms,
>>>  .postclose = amdgpu_driver_postclose_kms,
>>>  .lastclose = amdgpu_driver_lastclose_kms,
>>> -.release   = amdgpu_driver_release,
>>>  .irq_handler = amdgpu_irq_handler,
>>>  .ioctls = amdgpu_ioctls_kms,
>>>  .gem_free_object_unlocked = amdgpu_gem_object_free,
>>> --
>>> 2.25.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CLuben.Tuikov%40amd.com%7C5233b0caaac449417a7208d84e810de0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637345663485193534&amp;sdata=A%2FzJpoQmUhGBGts5mQfp%2BlVu1FL5NVbe5qfRelb2Og8%3D&amp;reserved=0 <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CLuben.Tuikov%40amd.com%7C5233b0caaac449417a7208d84e810de0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637345663485193534&amp;sdata=A%2FzJpoQmUhGBGts5mQfp%2BlVu1FL5NVbe5qfRelb2Og8%3D&amp;reserved=0>
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix a redundant kfree
  2020-09-01 20:35   ` Luben Tuikov
  2020-09-01 21:58     ` Pan, Xinhui
@ 2020-09-02 11:27     ` Daniel Vetter
  2020-09-02 14:36       ` Luben Tuikov
  1 sibling, 1 reply; 7+ messages in thread
From: Daniel Vetter @ 2020-09-02 11:27 UTC (permalink / raw)
  To: Luben Tuikov; +Cc: Alex Deucher, Deucher, Alexander, Pan, Xinhui, amd-gfx

On Tue, Sep 1, 2020 at 10:35 PM Luben Tuikov <luben.tuikov@amd.com> wrote:
>
> On 2020-09-01 10:12 a.m., Alex Deucher wrote:
> > On Tue, Sep 1, 2020 at 3:46 AM Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
> >>
> >> [AMD Official Use Only - Internal Distribution Only]
> >>
> >> drm_dev_alloc() alloc *dev* and set managed.final_kfree to dev to free
> >> itself.
> >> Now from commit 5cdd68498918("drm/amdgpu: Embed drm_device into
> >> amdgpu_device (v3)") we alloc *adev* and ddev is just a member of it.
> >> So drm_dev_release try to free a wrong pointer then.
> >>
> >> Also driver's release trys to free adev, but drm_dev_release will
> >> access dev after call drvier's release.
> >>
> >> To fix it, remove driver's release and set managed.final_kfree to adev.
> >
> > I've got to admit, the documentation around drm_dev_init is hard to
> > follow.  We aren't really using the drmm stuff, but you have to use
> > drmm_add_final_kfree to avoid a warning.  The logic seems to make
> > sense though.

I've just resent the patch which should clarify all this a bit.

And the warning isn't there just for lolz, if you enable KASAN it will
report a use-after-free if you don't set this all up correctly. Note
that drmm_ is already used by drm core code internally for everyone.
-Daniel

> > Acked-by: Alex Deucher <alexancer.deucher@amd.com>
>
> The logic in patch 3/3 uses the kref infrastructure
> as described in drm_drv.c's comment on what the DRM
> usage is, i.e. taking advantage of the kref infrastructure.
>
> In amdgpu_pci_probe() we call drm_dev_init() which takes
> a ref of 1 on the kref in the DRM device structure,
> and from then on, only when the kref transitions
> from non-zero to 0, do we free the container of
> DRM device, and this is beautifully shown in the
> kernel stack below (please take a look at the kernel
> stack below).
>
> Using a kref is very powerful as it is implicit:
> when the kref transitions from non-zero to 0,
> then call the release method.
>
> Furthermore, we own the release method, and we
> like that, as it is pure, as well as,
> there may be more things we'd like to do in the future
> before we free the amdgpu device: maybe free memory we're
> using specifically for that PCI device, maybe write
> some registers, maybe notify someone or something, etc.
>
> On another note, adding "drmm_add_final_kfree()" in the middle
> of amdgpu_pci_probe() seems hackish--it's neither part
> of drm_dev_init() nor of drm_dev_register(). We really
> don't need it, since we rely on the kref infrastructure
> to tell us when to free the device, and if you'd look
> at the beautiful stack below, it knows exactly when that is,
> i.e. when to free it.
>
> The correct thing to do this is to
> _leave the amdgpu_driver_release()_ alone,
> remove "drmm_add_final_kfree()" and qualify
> the WARN_ON() in drm_dev_register() by
> the existence of drm_driver.release() (i.e. non-NULL).
>
> I'll submit a sequence of patches to fix this right.
>
> Regards,
> Luben
>
> >
> >>
> >> [   36.269348] BUG: unable to handle page fault for address: ffffa0c279940028
> >> [   36.276841] #PF: supervisor read access in kernel mode
> >> [   36.282434] #PF: error_code(0x0000) - not-present page
> >> [   36.288053] PGD 676601067 P4D 676601067 PUD 86a414067 PMD 86a247067 PTE 800ffff8066bf060
> >> [   36.296868] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC NOPTI
> >> [   36.302409] CPU: 4 PID: 1375 Comm: bash Tainted: G           O      5.9.0-rc2+ #46
> >> [   36.310670] Hardware name: System manufacturer System Product Name/PRIME Z390-A, BIOS 1401 11/26/2019
> >> [   36.320725] RIP: 0010:drm_managed_release+0x25/0x110 [drm]
> >> [   36.326741] Code: 80 00 00 00 00 0f 1f 44 00 00 55 48 c7 c2 5a 9f 41 c0 be 00 02 00 00 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 ec 08 <48> 8b 7f 18 e8 c2 10 ff ff 4d 8b 74 24 20 49 8d 44 24 5
> >> [   36.347217] RSP: 0018:ffffb9424141fce0 EFLAGS: 00010282
> >> [   36.352931] RAX: 0000000000000006 RBX: ffffa0c279940010 RCX: 0000000000000006
> >> [   36.360718] RDX: ffffffffc0419f5a RSI: 0000000000000200 RDI: ffffa0c279940010
> >> [   36.368503] RBP: ffffb9424141fd10 R08: 0000000000000001 R09: 0000000000000001
> >> [   36.376304] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0c279940010
> >> [   36.384070] R13: ffffffffc0e2a000 R14: ffffa0c26924e220 R15: fffffffffffffff2
> >> [   36.391845] FS:  00007fc4a277b740(0000) GS:ffffa0c288e00000(0000) knlGS:0000000000000000
> >> [   36.400669] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >> [   36.406937] CR2: ffffa0c279940028 CR3: 0000000792304006 CR4: 00000000003706e0
> >> [   36.414732] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >> [   36.422550] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >> [   36.430354] Call Trace:
> >> [   36.433044]  drm_dev_put.part.0+0x40/0x60 [drm]
> >> [   36.438017]  drm_dev_put+0x13/0x20 [drm]
> >> [   36.442398]  amdgpu_pci_remove+0x56/0x60 [amdgpu]
> >> [   36.447528]  pci_device_remove+0x3e/0xb0
> >> [   36.451807]  device_release_driver_internal+0xff/0x1d0
> >> [   36.457416]  device_release_driver+0x12/0x20
> >> [   36.462094]  pci_stop_bus_device+0x70/0xa0
> >> [   36.466588]  pci_stop_and_remove_bus_device_locked+0x1b/0x30
> >> [   36.472786]  remove_store+0x7b/0x90
> >> [   36.476614]  dev_attr_store+0x17/0x30
> >> [   36.480646]  sysfs_kf_write+0x4b/0x60
> >> [   36.484655]  kernfs_fop_write+0xe8/0x1d0
> >> [   36.488952]  vfs_write+0xf5/0x230
> >> [   36.492562]  ksys_write+0x70/0xf0
> >> [   36.496206]  __x64_sys_write+0x1a/0x20
> >> [   36.500292]  do_syscall_64+0x38/0x90
> >> [   36.504219]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> >>
> >> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> >> ---
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +---------
> >>  1 file changed, 1 insertion(+), 9 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> index c12e9acd421d..52fc0c6c8538 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> @@ -1165,7 +1165,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
> >>  if (ret)
> >>  goto err_free;
> >>
> >> -drmm_add_final_kfree(ddev, ddev);
> >> +drmm_add_final_kfree(ddev, adev);
> >>
> >>  if (!supports_atomic)
> >>  ddev->driver_features &= ~DRIVER_ATOMIC;
> >> @@ -1221,13 +1221,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
> >>  drm_dev_put(dev);
> >>  }
> >>
> >> -static void amdgpu_driver_release(struct drm_device *ddev)
> >> -{
> >> -struct amdgpu_device *adev = drm_to_adev(ddev);
> >> -
> >> -kfree(adev);
> >> -}
> >> -
> >>  static void
> >>  amdgpu_pci_shutdown(struct pci_dev *pdev)
> >>  {
> >> @@ -1522,7 +1515,6 @@ static struct drm_driver kms_driver = {
> >>  .open = amdgpu_driver_open_kms,
> >>  .postclose = amdgpu_driver_postclose_kms,
> >>  .lastclose = amdgpu_driver_lastclose_kms,
> >> -.release   = amdgpu_driver_release,
> >>  .irq_handler = amdgpu_irq_handler,
> >>  .ioctls = amdgpu_ioctls_kms,
> >>  .gem_free_object_unlocked = amdgpu_gem_object_free,
> >> --
> >> 2.25.1
> >>
> >> _______________________________________________
> >> amd-gfx mailing list
> >> amd-gfx@lists.freedesktop.org
> >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CLuben.Tuikov%40amd.com%7C5233b0caaac449417a7208d84e810de0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637345663485193534&amp;sdata=A%2FzJpoQmUhGBGts5mQfp%2BlVu1FL5NVbe5qfRelb2Og8%3D&amp;reserved=0
>


-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix a redundant kfree
  2020-09-02 11:27     ` Daniel Vetter
@ 2020-09-02 14:36       ` Luben Tuikov
  0 siblings, 0 replies; 7+ messages in thread
From: Luben Tuikov @ 2020-09-02 14:36 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Alex Deucher, Deucher, Alexander, Pan, Xinhui, amd-gfx

On 2020-09-02 07:27, Daniel Vetter wrote:
> On Tue, Sep 1, 2020 at 10:35 PM Luben Tuikov <luben.tuikov@amd.com> wrote:
>>
>> On 2020-09-01 10:12 a.m., Alex Deucher wrote:
>>> On Tue, Sep 1, 2020 at 3:46 AM Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>>>>
>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>
>>>> drm_dev_alloc() alloc *dev* and set managed.final_kfree to dev to free
>>>> itself.
>>>> Now from commit 5cdd68498918("drm/amdgpu: Embed drm_device into
>>>> amdgpu_device (v3)") we alloc *adev* and ddev is just a member of it.
>>>> So drm_dev_release try to free a wrong pointer then.
>>>>
>>>> Also driver's release trys to free adev, but drm_dev_release will
>>>> access dev after call drvier's release.
>>>>
>>>> To fix it, remove driver's release and set managed.final_kfree to adev.
>>>
>>> I've got to admit, the documentation around drm_dev_init is hard to
>>> follow.  We aren't really using the drmm stuff, but you have to use
>>> drmm_add_final_kfree to avoid a warning.  The logic seems to make
>>> sense though.
> 
> I've just resent the patch which should clarify all this a bit.
> 
> And the warning isn't there just for lolz, if you enable KASAN it will
> report a use-after-free if you don't set this all up correctly. Note
> that drmm_ is already used by drm core code internally for everyone.

Well, you made the changes--of course it is. But something
being used by "everyone", doesn't mean it is a good thing.

It seems the motivation behind "managed resources", may have
been good, but the implementation, as is right now, makes
a mockery of the kref infrastructure and the original
_clean_ design of DRM init/fini sequence as I showed
in a previous email quoting the original version
of drm_dev_release():

static void drm_dev_release(struct kref *ref)
{
	if (dev->driver->release)
		dev->driver->release(dev);
	else
		drm_dev_fini(dev);
}

FWIW, the managed resources shouldn't be even known
by drivers, if well implemented--it should fold
into the current/original DRM driver infra.

Regards,
Luben

> -Daniel
> 
>>> Acked-by: Alex Deucher <alexancer.deucher@amd.com>
>>
>> The logic in patch 3/3 uses the kref infrastructure
>> as described in drm_drv.c's comment on what the DRM
>> usage is, i.e. taking advantage of the kref infrastructure.
>>
>> In amdgpu_pci_probe() we call drm_dev_init() which takes
>> a ref of 1 on the kref in the DRM device structure,
>> and from then on, only when the kref transitions
>> from non-zero to 0, do we free the container of
>> DRM device, and this is beautifully shown in the
>> kernel stack below (please take a look at the kernel
>> stack below).
>>
>> Using a kref is very powerful as it is implicit:
>> when the kref transitions from non-zero to 0,
>> then call the release method.
>>
>> Furthermore, we own the release method, and we
>> like that, as it is pure, as well as,
>> there may be more things we'd like to do in the future
>> before we free the amdgpu device: maybe free memory we're
>> using specifically for that PCI device, maybe write
>> some registers, maybe notify someone or something, etc.
>>
>> On another note, adding "drmm_add_final_kfree()" in the middle
>> of amdgpu_pci_probe() seems hackish--it's neither part
>> of drm_dev_init() nor of drm_dev_register(). We really
>> don't need it, since we rely on the kref infrastructure
>> to tell us when to free the device, and if you'd look
>> at the beautiful stack below, it knows exactly when that is,
>> i.e. when to free it.
>>
>> The correct thing to do this is to
>> _leave the amdgpu_driver_release()_ alone,
>> remove "drmm_add_final_kfree()" and qualify
>> the WARN_ON() in drm_dev_register() by
>> the existence of drm_driver.release() (i.e. non-NULL).
>>
>> I'll submit a sequence of patches to fix this right.
>>
>> Regards,
>> Luben
>>
>>>
>>>>
>>>> [   36.269348] BUG: unable to handle page fault for address: ffffa0c279940028
>>>> [   36.276841] #PF: supervisor read access in kernel mode
>>>> [   36.282434] #PF: error_code(0x0000) - not-present page
>>>> [   36.288053] PGD 676601067 P4D 676601067 PUD 86a414067 PMD 86a247067 PTE 800ffff8066bf060
>>>> [   36.296868] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC NOPTI
>>>> [   36.302409] CPU: 4 PID: 1375 Comm: bash Tainted: G           O      5.9.0-rc2+ #46
>>>> [   36.310670] Hardware name: System manufacturer System Product Name/PRIME Z390-A, BIOS 1401 11/26/2019
>>>> [   36.320725] RIP: 0010:drm_managed_release+0x25/0x110 [drm]
>>>> [   36.326741] Code: 80 00 00 00 00 0f 1f 44 00 00 55 48 c7 c2 5a 9f 41 c0 be 00 02 00 00 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 ec 08 <48> 8b 7f 18 e8 c2 10 ff ff 4d 8b 74 24 20 49 8d 44 24 5
>>>> [   36.347217] RSP: 0018:ffffb9424141fce0 EFLAGS: 00010282
>>>> [   36.352931] RAX: 0000000000000006 RBX: ffffa0c279940010 RCX: 0000000000000006
>>>> [   36.360718] RDX: ffffffffc0419f5a RSI: 0000000000000200 RDI: ffffa0c279940010
>>>> [   36.368503] RBP: ffffb9424141fd10 R08: 0000000000000001 R09: 0000000000000001
>>>> [   36.376304] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0c279940010
>>>> [   36.384070] R13: ffffffffc0e2a000 R14: ffffa0c26924e220 R15: fffffffffffffff2
>>>> [   36.391845] FS:  00007fc4a277b740(0000) GS:ffffa0c288e00000(0000) knlGS:0000000000000000
>>>> [   36.400669] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> [   36.406937] CR2: ffffa0c279940028 CR3: 0000000792304006 CR4: 00000000003706e0
>>>> [   36.414732] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>> [   36.422550] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>> [   36.430354] Call Trace:
>>>> [   36.433044]  drm_dev_put.part.0+0x40/0x60 [drm]
>>>> [   36.438017]  drm_dev_put+0x13/0x20 [drm]
>>>> [   36.442398]  amdgpu_pci_remove+0x56/0x60 [amdgpu]
>>>> [   36.447528]  pci_device_remove+0x3e/0xb0
>>>> [   36.451807]  device_release_driver_internal+0xff/0x1d0
>>>> [   36.457416]  device_release_driver+0x12/0x20
>>>> [   36.462094]  pci_stop_bus_device+0x70/0xa0
>>>> [   36.466588]  pci_stop_and_remove_bus_device_locked+0x1b/0x30
>>>> [   36.472786]  remove_store+0x7b/0x90
>>>> [   36.476614]  dev_attr_store+0x17/0x30
>>>> [   36.480646]  sysfs_kf_write+0x4b/0x60
>>>> [   36.484655]  kernfs_fop_write+0xe8/0x1d0
>>>> [   36.488952]  vfs_write+0xf5/0x230
>>>> [   36.492562]  ksys_write+0x70/0xf0
>>>> [   36.496206]  __x64_sys_write+0x1a/0x20
>>>> [   36.500292]  do_syscall_64+0x38/0x90
>>>> [   36.504219]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>
>>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>>> ---
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +---------
>>>>  1 file changed, 1 insertion(+), 9 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index c12e9acd421d..52fc0c6c8538 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -1165,7 +1165,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
>>>>  if (ret)
>>>>  goto err_free;
>>>>
>>>> -drmm_add_final_kfree(ddev, ddev);
>>>> +drmm_add_final_kfree(ddev, adev);
>>>>
>>>>  if (!supports_atomic)
>>>>  ddev->driver_features &= ~DRIVER_ATOMIC;
>>>> @@ -1221,13 +1221,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
>>>>  drm_dev_put(dev);
>>>>  }
>>>>
>>>> -static void amdgpu_driver_release(struct drm_device *ddev)
>>>> -{
>>>> -struct amdgpu_device *adev = drm_to_adev(ddev);
>>>> -
>>>> -kfree(adev);
>>>> -}
>>>> -
>>>>  static void
>>>>  amdgpu_pci_shutdown(struct pci_dev *pdev)
>>>>  {
>>>> @@ -1522,7 +1515,6 @@ static struct drm_driver kms_driver = {
>>>>  .open = amdgpu_driver_open_kms,
>>>>  .postclose = amdgpu_driver_postclose_kms,
>>>>  .lastclose = amdgpu_driver_lastclose_kms,
>>>> -.release   = amdgpu_driver_release,
>>>>  .irq_handler = amdgpu_irq_handler,
>>>>  .ioctls = amdgpu_ioctls_kms,
>>>>  .gem_free_object_unlocked = amdgpu_gem_object_free,
>>>> --
>>>> 2.25.1
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cluben.tuikov%40amd.com%7Cedb9fe436f324678e04008d84f3338cf%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637346428700034458&amp;sdata=ixJLxEibImTEPvOnBCfk6eqfFbjah90LSGXMeJeIkXM%3D&amp;reserved=0
>>
> 
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2020-09-02 14:36 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-01  7:45 [PATCH] drm/amdgpu: Fix a redundant kfree Pan, Xinhui
2020-09-01 14:12 ` Alex Deucher
2020-09-01 20:35   ` Luben Tuikov
2020-09-01 21:58     ` Pan, Xinhui
2020-09-02  3:47       ` Luben Tuikov
2020-09-02 11:27     ` Daniel Vetter
2020-09-02 14:36       ` Luben Tuikov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.