All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: move lockdep assert to the right place.
@ 2022-02-04  8:52 Christian König
  2022-02-04 14:28 ` Deucher, Alexander
  2022-02-04 16:27 ` Felix Kuehling
  0 siblings, 2 replies; 9+ messages in thread
From: Christian König @ 2022-02-04  8:52 UTC (permalink / raw)
  To: Rajneesh.Bhardwaj, amd-gfx

Since newly added BOs don't have any mappings it's ok to add them
without holding the VM lock. Only when we add per VM BOs the lock is
mandatory.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index fdc6a1fd74af..dcc80d6e099e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
 	if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
 		return;
 
+	dma_resv_assert_held(vm->root.bo->tbo.base.resv);
+
 	vm->bulk_moveable = false;
 	if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
 		amdgpu_vm_bo_relocated(base);
@@ -2260,8 +2262,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
 {
 	struct amdgpu_bo_va *bo_va;
 
-	dma_resv_assert_held(vm->root.bo->tbo.base.resv);
-
 	bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
 	if (bo_va == NULL) {
 		return NULL;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: move lockdep assert to the right place.
  2022-02-04  8:52 [PATCH] drm/amdgpu: move lockdep assert to the right place Christian König
@ 2022-02-04 14:28 ` Deucher, Alexander
  2022-02-04 16:27 ` Felix Kuehling
  1 sibling, 0 replies; 9+ messages in thread
From: Deucher, Alexander @ 2022-02-04 14:28 UTC (permalink / raw)
  To: Christian König, Bhardwaj, Rajneesh, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 1778 bytes --]

[Public]

Acked-by: Alex Deucher <alexander.deucher@amd.com>
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Friday, February 4, 2022 3:52 AM
To: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Subject: [PATCH] drm/amdgpu: move lockdep assert to the right place.

Since newly added BOs don't have any mappings it's ok to add them
without holding the VM lock. Only when we add per VM BOs the lock is
mandatory.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index fdc6a1fd74af..dcc80d6e099e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
         if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
                 return;

+       dma_resv_assert_held(vm->root.bo->tbo.base.resv);
+
         vm->bulk_moveable = false;
         if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
                 amdgpu_vm_bo_relocated(base);
@@ -2260,8 +2262,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
 {
         struct amdgpu_bo_va *bo_va;

-       dma_resv_assert_held(vm->root.bo->tbo.base.resv);
-
         bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
         if (bo_va == NULL) {
                 return NULL;
--
2.25.1


[-- Attachment #2: Type: text/html, Size: 3383 bytes --]

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: move lockdep assert to the right place.
  2022-02-04  8:52 [PATCH] drm/amdgpu: move lockdep assert to the right place Christian König
  2022-02-04 14:28 ` Deucher, Alexander
@ 2022-02-04 16:27 ` Felix Kuehling
  2022-02-04 18:12   ` Bhardwaj, Rajneesh
  1 sibling, 1 reply; 9+ messages in thread
From: Felix Kuehling @ 2022-02-04 16:27 UTC (permalink / raw)
  To: Christian König, Rajneesh.Bhardwaj, amd-gfx


Am 2022-02-04 um 03:52 schrieb Christian König:
> Since newly added BOs don't have any mappings it's ok to add them
> without holding the VM lock. Only when we add per VM BOs the lock is
> mandatory.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index fdc6a1fd74af..dcc80d6e099e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
>   	if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
>   		return;
>   
> +	dma_resv_assert_held(vm->root.bo->tbo.base.resv);
> +
>   	vm->bulk_moveable = false;
>   	if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>   		amdgpu_vm_bo_relocated(base);
> @@ -2260,8 +2262,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_bo_va *bo_va;
>   
> -	dma_resv_assert_held(vm->root.bo->tbo.base.resv);
> -
>   	bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>   	if (bo_va == NULL) {
>   		return NULL;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: move lockdep assert to the right place.
  2022-02-04 16:27 ` Felix Kuehling
@ 2022-02-04 18:12   ` Bhardwaj, Rajneesh
  2022-02-04 18:32     ` Christian König
  0 siblings, 1 reply; 9+ messages in thread
From: Bhardwaj, Rajneesh @ 2022-02-04 18:12 UTC (permalink / raw)
  To: Felix Kuehling, Christian König, amd-gfx, Deucher, Alexander

[Sorry for top posting]

Hi Christian

I think you forgot the below hunk, without which the issue is not fixed 
completely on a multi GPU system.

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index dcc80d6e099e..6f68fc9da56a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2670,8 +2670,6 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
         struct amdgpu_vm *vm = bo_va->base.vm;
         struct amdgpu_vm_bo_base **base;

-       dma_resv_assert_held(vm->root.bo->tbo.base.resv);
-
         if (bo) {
                 dma_resv_assert_held(bo->tbo.base.resv);
                 if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)


If you chose to include the above hunk, please feel free to add

Reviewed-and-tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>

On 2/4/2022 11:27 AM, Felix Kuehling wrote:
>
> Am 2022-02-04 um 03:52 schrieb Christian König:
>> Since newly added BOs don't have any mappings it's ok to add them
>> without holding the VM lock. Only when we add per VM BOs the lock is
>> mandatory.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index fdc6a1fd74af..dcc80d6e099e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct 
>> amdgpu_vm_bo_base *base,
>>       if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
>>           return;
>>   +    dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>> +
>>       vm->bulk_moveable = false;
>>       if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>>           amdgpu_vm_bo_relocated(base);
>> @@ -2260,8 +2262,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
>> amdgpu_device *adev,
>>   {
>>       struct amdgpu_bo_va *bo_va;
>>   -    dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>> -
>>       bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>>       if (bo_va == NULL) {
>>           return NULL;

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: move lockdep assert to the right place.
  2022-02-04 18:12   ` Bhardwaj, Rajneesh
@ 2022-02-04 18:32     ` Christian König
  2022-02-04 18:47       ` Bhardwaj, Rajneesh
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2022-02-04 18:32 UTC (permalink / raw)
  To: Bhardwaj, Rajneesh, Felix Kuehling, amd-gfx, Deucher, Alexander

Am 04.02.22 um 19:12 schrieb Bhardwaj, Rajneesh:
> [Sorry for top posting]
>
> Hi Christian
>
> I think you forgot the below hunk, without which the issue is not 
> fixed completely on a multi GPU system.

No, that is perfectly intentional. While removing a bo_va structure it 
can happen that there are still mappings attached to it (for example 
because the application crashed).

Because of this locking the VM before the remove is mandatory. Only 
while adding a bo_va structure we can avoid that.

Regards,
Christian.

>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index dcc80d6e099e..6f68fc9da56a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2670,8 +2670,6 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
>         struct amdgpu_vm *vm = bo_va->base.vm;
>         struct amdgpu_vm_bo_base **base;
>
> -       dma_resv_assert_held(vm->root.bo->tbo.base.resv);
> -
>         if (bo) {
>                 dma_resv_assert_held(bo->tbo.base.resv);
>                 if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)
>
>
> If you chose to include the above hunk, please feel free to add
>
> Reviewed-and-tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>
> On 2/4/2022 11:27 AM, Felix Kuehling wrote:
>>
>> Am 2022-02-04 um 03:52 schrieb Christian König:
>>> Since newly added BOs don't have any mappings it's ok to add them
>>> without holding the VM lock. Only when we add per VM BOs the lock is
>>> mandatory.
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
>>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
>>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index fdc6a1fd74af..dcc80d6e099e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct 
>>> amdgpu_vm_bo_base *base,
>>>       if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
>>>           return;
>>>   +    dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>> +
>>>       vm->bulk_moveable = false;
>>>       if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>>>           amdgpu_vm_bo_relocated(base);
>>> @@ -2260,8 +2262,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
>>> amdgpu_device *adev,
>>>   {
>>>       struct amdgpu_bo_va *bo_va;
>>>   -    dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>> -
>>>       bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>>>       if (bo_va == NULL) {
>>>           return NULL;


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: move lockdep assert to the right place.
  2022-02-04 18:32     ` Christian König
@ 2022-02-04 18:47       ` Bhardwaj, Rajneesh
  2022-02-04 18:50         ` Christian König
  0 siblings, 1 reply; 9+ messages in thread
From: Bhardwaj, Rajneesh @ 2022-02-04 18:47 UTC (permalink / raw)
  To: Christian König, Felix Kuehling, amd-gfx, Deucher, Alexander


On 2/4/2022 1:32 PM, Christian König wrote:
> Am 04.02.22 um 19:12 schrieb Bhardwaj, Rajneesh:
>> [Sorry for top posting]
>>
>> Hi Christian
>>
>> I think you forgot the below hunk, without which the issue is not 
>> fixed completely on a multi GPU system.
>
> No, that is perfectly intentional. While removing a bo_va structure it 
> can happen that there are still mappings attached to it (for example 
> because the application crashed).


OK. but for me, at boot time, I see flood of warning messages coming 
from  amdgpu_vm_bo_del so the previous patch doesn't help.


>
> Because of this locking the VM before the remove is mandatory. Only 
> while adding a bo_va structure we can avoid that.
>
> Regards,
> Christian.
>
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index dcc80d6e099e..6f68fc9da56a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -2670,8 +2670,6 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
>>         struct amdgpu_vm *vm = bo_va->base.vm;
>>         struct amdgpu_vm_bo_base **base;
>>
>> -       dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>> -
>>         if (bo) {
>>                 dma_resv_assert_held(bo->tbo.base.resv);
>>                 if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)
>>
>>
>> If you chose to include the above hunk, please feel free to add
>>
>> Reviewed-and-tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>
>> On 2/4/2022 11:27 AM, Felix Kuehling wrote:
>>>
>>> Am 2022-02-04 um 03:52 schrieb Christian König:
>>>> Since newly added BOs don't have any mappings it's ok to add them
>>>> without holding the VM lock. Only when we add per VM BOs the lock is
>>>> mandatory.
>>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
>>>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>
>>>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
>>>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index fdc6a1fd74af..dcc80d6e099e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct 
>>>> amdgpu_vm_bo_base *base,
>>>>       if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
>>>>           return;
>>>>   + dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>> +
>>>>       vm->bulk_moveable = false;
>>>>       if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>>>>           amdgpu_vm_bo_relocated(base);
>>>> @@ -2260,8 +2262,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
>>>> amdgpu_device *adev,
>>>>   {
>>>>       struct amdgpu_bo_va *bo_va;
>>>>   - dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>> -
>>>>       bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>>>>       if (bo_va == NULL) {
>>>>           return NULL;
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: move lockdep assert to the right place.
  2022-02-04 18:47       ` Bhardwaj, Rajneesh
@ 2022-02-04 18:50         ` Christian König
  2022-02-04 19:15           ` Bhardwaj, Rajneesh
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2022-02-04 18:50 UTC (permalink / raw)
  To: Bhardwaj, Rajneesh, Felix Kuehling, amd-gfx, Deucher, Alexander

Am 04.02.22 um 19:47 schrieb Bhardwaj, Rajneesh:
>
> On 2/4/2022 1:32 PM, Christian König wrote:
>> Am 04.02.22 um 19:12 schrieb Bhardwaj, Rajneesh:
>>> [Sorry for top posting]
>>>
>>> Hi Christian
>>>
>>> I think you forgot the below hunk, without which the issue is not 
>>> fixed completely on a multi GPU system.
>>
>> No, that is perfectly intentional. While removing a bo_va structure 
>> it can happen that there are still mappings attached to it (for 
>> example because the application crashed).
>
>
> OK. but for me, at boot time, I see flood of warning messages coming 
> from  amdgpu_vm_bo_del so the previous patch doesn't help.

Do you have a backtrace? That should not happen.

Could be that we have this buggy at a couple of different places.

Regards,
Christian.

>
>
>>
>> Because of this locking the VM before the remove is mandatory. Only 
>> while adding a bo_va structure we can avoid that.
>>
>> Regards,
>> Christian.
>>
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index dcc80d6e099e..6f68fc9da56a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -2670,8 +2670,6 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
>>>         struct amdgpu_vm *vm = bo_va->base.vm;
>>>         struct amdgpu_vm_bo_base **base;
>>>
>>> - dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>> -
>>>         if (bo) {
>>>                 dma_resv_assert_held(bo->tbo.base.resv);
>>>                 if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)
>>>
>>>
>>> If you chose to include the above hunk, please feel free to add
>>>
>>> Reviewed-and-tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>>
>>> On 2/4/2022 11:27 AM, Felix Kuehling wrote:
>>>>
>>>> Am 2022-02-04 um 03:52 schrieb Christian König:
>>>>> Since newly added BOs don't have any mappings it's ok to add them
>>>>> without holding the VM lock. Only when we add per VM BOs the lock is
>>>>> mandatory.
>>>>>
>>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>>> Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
>>>>
>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>
>>>>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
>>>>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> index fdc6a1fd74af..dcc80d6e099e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> @@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct 
>>>>> amdgpu_vm_bo_base *base,
>>>>>       if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
>>>>>           return;
>>>>>   + dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>>> +
>>>>>       vm->bulk_moveable = false;
>>>>>       if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>>>>>           amdgpu_vm_bo_relocated(base);
>>>>> @@ -2260,8 +2262,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
>>>>> amdgpu_device *adev,
>>>>>   {
>>>>>       struct amdgpu_bo_va *bo_va;
>>>>>   - dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>>> -
>>>>>       bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>>>>>       if (bo_va == NULL) {
>>>>>           return NULL;
>>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: move lockdep assert to the right place.
  2022-02-04 18:50         ` Christian König
@ 2022-02-04 19:15           ` Bhardwaj, Rajneesh
  2022-02-07  7:24             ` Christian König
  0 siblings, 1 reply; 9+ messages in thread
From: Bhardwaj, Rajneesh @ 2022-02-04 19:15 UTC (permalink / raw)
  To: Christian König, Felix Kuehling, amd-gfx, Deucher, Alexander


On 2/4/2022 1:50 PM, Christian König wrote:
> Am 04.02.22 um 19:47 schrieb Bhardwaj, Rajneesh:
>>
>> On 2/4/2022 1:32 PM, Christian König wrote:
>>> Am 04.02.22 um 19:12 schrieb Bhardwaj, Rajneesh:
>>>> [Sorry for top posting]
>>>>
>>>> Hi Christian
>>>>
>>>> I think you forgot the below hunk, without which the issue is not 
>>>> fixed completely on a multi GPU system.
>>>
>>> No, that is perfectly intentional. While removing a bo_va structure 
>>> it can happen that there are still mappings attached to it (for 
>>> example because the application crashed).
>>
>>
>> OK. but for me, at boot time, I see flood of warning messages coming 
>> from  amdgpu_vm_bo_del so the previous patch doesn't help.
>
> Do you have a backtrace? That should not happen.
>
> Could be that we have this buggy at a couple of different places.


This is on Ubuntu 18.04.


[    8.392405] WARNING: CPU: 13 PID: 2732 at 
/home/rajneesh/git/compute/kernel/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c:2673 
amdgpu_vm_bo_del+0x386/0x3b
0 [amdgpu]
[    8.392586] Modules linked in: amdgpu ast iommu_v2 gpu_sched 
drm_vram_helper drm_ttm_helper ttm drm_kms_helper cfbfillrect 
syscopyarea cfbimgblt sy
sfillrect sysimgblt fb_sys_fops cfbcopyarea drm 
drm_panel_orientation_quirks k10temp nf_tables nfnetlink ip_tables 
x_tables i2c_piix4
[    8.392628] CPU: 13 PID: 2732 Comm: plymouthd Not tainted 
5.13.0-kfd-rajneesh #1055
[    8.392632] Hardware name: GIGABYTE MZ01-CE0-00/MZ01-CE0-00, BIOS F02 
08/29/2018
[    8.392635] RIP: 0010:amdgpu_vm_bo_del+0x386/0x3b0 [amdgpu]
[    8.392749] Code: 0f 85 56 fe ff ff 48 c7 c2 28 6b b3 c0 be 21 01 00 
00 48 c7 c7 58 6b b3 c0 c6 05 64 64 62 00 01 e8 5f be a4 d4 e9 32 fe ff 
ff <0f
 > 0b eb 8a 49 8b be 88 01 00 00 e9 af fc ff ff be 03 00 00 00 e8
[    8.392752] RSP: 0018:ffffaf33471d7d98 EFLAGS: 00010246
[    8.392756] RAX: 0000000000000000 RBX: ffffa08771600000 RCX: 
0000000000000001
[    8.392758] RDX: 0000000000000001 RSI: ffffa08772ae01f8 RDI: 
ffffa0800a426f68
[    8.392761] RBP: ffffa087691fd980 R08: ffffffffc0a4e2e0 R09: 
000000000000000a
[    8.392763] R10: ffffaf33471d7d68 R11: 0000000000000001 R12: 
ffffa0801d540010
[    8.392765] R13: ffffa08771615c00 R14: ffffa08024166618 R15: 
ffffa08771615e60
[    8.392768] FS:  00007f7b80179740(0000) GS:ffffa08f3dec0000(0000) 
knlGS:0000000000000000
[    8.392771] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    8.392773] CR2: 000055839db51eb8 CR3: 000000010f49c000 CR4: 
00000000003506e0
[    8.392775] Call Trace:
[    8.392779]  ? _raw_spin_unlock_irqrestore+0x30/0x40
[    8.392787]  amdgpu_driver_postclose_kms+0x94/0x270 [amdgpu]
[    8.392897]  drm_file_free.part.14+0x1e3/0x230 [drm]
[    8.392918]  drm_release+0x6e/0xf0 [drm]
[    8.392937]  __fput+0xa3/0x250
[    8.392942]  task_work_run+0x6d/0xb0
[    8.392949]  exit_to_user_mode_prepare+0x1c9/0x1d0
[    8.392953]  syscall_exit_to_user_mode+0x19/0x50
[    8.392957]  do_syscall_64+0x42/0x70
[    8.392960]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[    8.392964] RIP: 0033:0x7f7b7f8679e4
[    8.392967] Code: eb 89 e8 6f 44 02 00 66 2e 0f 1f 84 00 00 00 00 00 
0f 1f 44 00 00 48 8d 05 21 ff 2d 00 8b 00 85 c0 75 13 b8 03 00 00 00 0f 
05 <48> 3d 00 f0 ff ff 77 3c f3 c3 66 90 53 89 fb 48 83 ec 10 e8 94 fe
[    8.392970] RSP: 002b:00007ffe6bb0cdb8 EFLAGS: 00000246 ORIG_RAX: 
0000000000000003
[    8.392973] RAX: 0000000000000000 RBX: 000055839db4b760 RCX: 
00007f7b7f8679e4
[    8.392974] RDX: 000055839db4aed0 RSI: 0000000000000000 RDI: 
000000000000000b
[    8.392976] RBP: 000000000000000b R08: 000055839db4aee0 R09: 
00007f7b7fb42c40
[    8.392978] R10: 0000000000000007 R11: 0000000000000246 R12: 
000000000000e280
[    8.392979] R13: 000000000000000d R14: 00007f7b7fb5b1e0 R15: 
00007f7b7fb5b130
[    8.392988] irq event stamp: 1264799
[    8.392990] hardirqs last  enabled at (1264805): [<ffffffff951011b9>] 
console_unlock+0x339/0x530
[    8.392994] hardirqs last disabled at (1264810): [<ffffffff95101226>] 
console_unlock+0x3a6/0x530


>
> Regards,
> Christian.
>
>>
>>
>>>
>>> Because of this locking the VM before the remove is mandatory. Only 
>>> while adding a bo_va structure we can avoid that.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index dcc80d6e099e..6f68fc9da56a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -2670,8 +2670,6 @@ void amdgpu_vm_bo_del(struct amdgpu_device 
>>>> *adev,
>>>>         struct amdgpu_vm *vm = bo_va->base.vm;
>>>>         struct amdgpu_vm_bo_base **base;
>>>>
>>>> - dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>> -
>>>>         if (bo) {
>>>>                 dma_resv_assert_held(bo->tbo.base.resv);
>>>>                 if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)
>>>>
>>>>
>>>> If you chose to include the above hunk, please feel free to add
>>>>
>>>> Reviewed-and-tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>>>
>>>> On 2/4/2022 11:27 AM, Felix Kuehling wrote:
>>>>>
>>>>> Am 2022-02-04 um 03:52 schrieb Christian König:
>>>>>> Since newly added BOs don't have any mappings it's ok to add them
>>>>>> without holding the VM lock. Only when we add per VM BOs the lock is
>>>>>> mandatory.
>>>>>>
>>>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>>>> Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
>>>>>
>>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>
>>>>>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
>>>>>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> index fdc6a1fd74af..dcc80d6e099e 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> @@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct 
>>>>>> amdgpu_vm_bo_base *base,
>>>>>>       if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
>>>>>>           return;
>>>>>>   + dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>>>> +
>>>>>>       vm->bulk_moveable = false;
>>>>>>       if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>>>>>>           amdgpu_vm_bo_relocated(base);
>>>>>> @@ -2260,8 +2262,6 @@ struct amdgpu_bo_va 
>>>>>> *amdgpu_vm_bo_add(struct amdgpu_device *adev,
>>>>>>   {
>>>>>>       struct amdgpu_bo_va *bo_va;
>>>>>>   - dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>>>> -
>>>>>>       bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>>>>>>       if (bo_va == NULL) {
>>>>>>           return NULL;
>>>
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: move lockdep assert to the right place.
  2022-02-04 19:15           ` Bhardwaj, Rajneesh
@ 2022-02-07  7:24             ` Christian König
  0 siblings, 0 replies; 9+ messages in thread
From: Christian König @ 2022-02-07  7:24 UTC (permalink / raw)
  To: Bhardwaj, Rajneesh, Felix Kuehling, amd-gfx, Deucher, Alexander

Ah, yes of course!

When the VM is freed we currently don't lock anything either because 
nobody should have a reference to that object any more.

Going to fix this as well.

Thanks,
Christian.

Am 04.02.22 um 20:15 schrieb Bhardwaj, Rajneesh:
>
> On 2/4/2022 1:50 PM, Christian König wrote:
>> Am 04.02.22 um 19:47 schrieb Bhardwaj, Rajneesh:
>>>
>>> On 2/4/2022 1:32 PM, Christian König wrote:
>>>> Am 04.02.22 um 19:12 schrieb Bhardwaj, Rajneesh:
>>>>> [Sorry for top posting]
>>>>>
>>>>> Hi Christian
>>>>>
>>>>> I think you forgot the below hunk, without which the issue is not 
>>>>> fixed completely on a multi GPU system.
>>>>
>>>> No, that is perfectly intentional. While removing a bo_va structure 
>>>> it can happen that there are still mappings attached to it (for 
>>>> example because the application crashed).
>>>
>>>
>>> OK. but for me, at boot time, I see flood of warning messages coming 
>>> from  amdgpu_vm_bo_del so the previous patch doesn't help.
>>
>> Do you have a backtrace? That should not happen.
>>
>> Could be that we have this buggy at a couple of different places.
>
>
> This is on Ubuntu 18.04.
>
>
> [    8.392405] WARNING: CPU: 13 PID: 2732 at 
> /home/rajneesh/git/compute/kernel/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c:2673 
> amdgpu_vm_bo_del+0x386/0x3b
> 0 [amdgpu]
> [    8.392586] Modules linked in: amdgpu ast iommu_v2 gpu_sched 
> drm_vram_helper drm_ttm_helper ttm drm_kms_helper cfbfillrect 
> syscopyarea cfbimgblt sy
> sfillrect sysimgblt fb_sys_fops cfbcopyarea drm 
> drm_panel_orientation_quirks k10temp nf_tables nfnetlink ip_tables 
> x_tables i2c_piix4
> [    8.392628] CPU: 13 PID: 2732 Comm: plymouthd Not tainted 
> 5.13.0-kfd-rajneesh #1055
> [    8.392632] Hardware name: GIGABYTE MZ01-CE0-00/MZ01-CE0-00, BIOS 
> F02 08/29/2018
> [    8.392635] RIP: 0010:amdgpu_vm_bo_del+0x386/0x3b0 [amdgpu]
> [    8.392749] Code: 0f 85 56 fe ff ff 48 c7 c2 28 6b b3 c0 be 21 01 
> 00 00 48 c7 c7 58 6b b3 c0 c6 05 64 64 62 00 01 e8 5f be a4 d4 e9 32 
> fe ff ff <0f
> > 0b eb 8a 49 8b be 88 01 00 00 e9 af fc ff ff be 03 00 00 00 e8
> [    8.392752] RSP: 0018:ffffaf33471d7d98 EFLAGS: 00010246
> [    8.392756] RAX: 0000000000000000 RBX: ffffa08771600000 RCX: 
> 0000000000000001
> [    8.392758] RDX: 0000000000000001 RSI: ffffa08772ae01f8 RDI: 
> ffffa0800a426f68
> [    8.392761] RBP: ffffa087691fd980 R08: ffffffffc0a4e2e0 R09: 
> 000000000000000a
> [    8.392763] R10: ffffaf33471d7d68 R11: 0000000000000001 R12: 
> ffffa0801d540010
> [    8.392765] R13: ffffa08771615c00 R14: ffffa08024166618 R15: 
> ffffa08771615e60
> [    8.392768] FS:  00007f7b80179740(0000) GS:ffffa08f3dec0000(0000) 
> knlGS:0000000000000000
> [    8.392771] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    8.392773] CR2: 000055839db51eb8 CR3: 000000010f49c000 CR4: 
> 00000000003506e0
> [    8.392775] Call Trace:
> [    8.392779]  ? _raw_spin_unlock_irqrestore+0x30/0x40
> [    8.392787]  amdgpu_driver_postclose_kms+0x94/0x270 [amdgpu]
> [    8.392897]  drm_file_free.part.14+0x1e3/0x230 [drm]
> [    8.392918]  drm_release+0x6e/0xf0 [drm]
> [    8.392937]  __fput+0xa3/0x250
> [    8.392942]  task_work_run+0x6d/0xb0
> [    8.392949]  exit_to_user_mode_prepare+0x1c9/0x1d0
> [    8.392953]  syscall_exit_to_user_mode+0x19/0x50
> [    8.392957]  do_syscall_64+0x42/0x70
> [    8.392960]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [    8.392964] RIP: 0033:0x7f7b7f8679e4
> [    8.392967] Code: eb 89 e8 6f 44 02 00 66 2e 0f 1f 84 00 00 00 00 
> 00 0f 1f 44 00 00 48 8d 05 21 ff 2d 00 8b 00 85 c0 75 13 b8 03 00 00 
> 00 0f 05 <48> 3d 00 f0 ff ff 77 3c f3 c3 66 90 53 89 fb 48 83 ec 10 e8 
> 94 fe
> [    8.392970] RSP: 002b:00007ffe6bb0cdb8 EFLAGS: 00000246 ORIG_RAX: 
> 0000000000000003
> [    8.392973] RAX: 0000000000000000 RBX: 000055839db4b760 RCX: 
> 00007f7b7f8679e4
> [    8.392974] RDX: 000055839db4aed0 RSI: 0000000000000000 RDI: 
> 000000000000000b
> [    8.392976] RBP: 000000000000000b R08: 000055839db4aee0 R09: 
> 00007f7b7fb42c40
> [    8.392978] R10: 0000000000000007 R11: 0000000000000246 R12: 
> 000000000000e280
> [    8.392979] R13: 000000000000000d R14: 00007f7b7fb5b1e0 R15: 
> 00007f7b7fb5b130
> [    8.392988] irq event stamp: 1264799
> [    8.392990] hardirqs last  enabled at (1264805): 
> [<ffffffff951011b9>] console_unlock+0x339/0x530
> [    8.392994] hardirqs last disabled at (1264810): 
> [<ffffffff95101226>] console_unlock+0x3a6/0x530
>
>
>>
>> Regards,
>> Christian.
>>
>>>
>>>
>>>>
>>>> Because of this locking the VM before the remove is mandatory. Only 
>>>> while adding a bo_va structure we can avoid that.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> index dcc80d6e099e..6f68fc9da56a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> @@ -2670,8 +2670,6 @@ void amdgpu_vm_bo_del(struct amdgpu_device 
>>>>> *adev,
>>>>>         struct amdgpu_vm *vm = bo_va->base.vm;
>>>>>         struct amdgpu_vm_bo_base **base;
>>>>>
>>>>> - dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>>> -
>>>>>         if (bo) {
>>>>> dma_resv_assert_held(bo->tbo.base.resv);
>>>>>                 if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)
>>>>>
>>>>>
>>>>> If you chose to include the above hunk, please feel free to add
>>>>>
>>>>> Reviewed-and-tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>>>>
>>>>> On 2/4/2022 11:27 AM, Felix Kuehling wrote:
>>>>>>
>>>>>> Am 2022-02-04 um 03:52 schrieb Christian König:
>>>>>>> Since newly added BOs don't have any mappings it's ok to add them
>>>>>>> without holding the VM lock. Only when we add per VM BOs the 
>>>>>>> lock is
>>>>>>> mandatory.
>>>>>>>
>>>>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>>>>> Reported-by: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
>>>>>>
>>>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>
>>>>>>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++--
>>>>>>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> index fdc6a1fd74af..dcc80d6e099e 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> @@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct 
>>>>>>> amdgpu_vm_bo_base *base,
>>>>>>>       if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
>>>>>>>           return;
>>>>>>>   + dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>>>>> +
>>>>>>>       vm->bulk_moveable = false;
>>>>>>>       if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>>>>>>>           amdgpu_vm_bo_relocated(base);
>>>>>>> @@ -2260,8 +2262,6 @@ struct amdgpu_bo_va 
>>>>>>> *amdgpu_vm_bo_add(struct amdgpu_device *adev,
>>>>>>>   {
>>>>>>>       struct amdgpu_bo_va *bo_va;
>>>>>>>   - dma_resv_assert_held(vm->root.bo->tbo.base.resv);
>>>>>>> -
>>>>>>>       bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>>>>>>>       if (bo_va == NULL) {
>>>>>>>           return NULL;
>>>>
>>


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-02-07  7:24 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-04  8:52 [PATCH] drm/amdgpu: move lockdep assert to the right place Christian König
2022-02-04 14:28 ` Deucher, Alexander
2022-02-04 16:27 ` Felix Kuehling
2022-02-04 18:12   ` Bhardwaj, Rajneesh
2022-02-04 18:32     ` Christian König
2022-02-04 18:47       ` Bhardwaj, Rajneesh
2022-02-04 18:50         ` Christian König
2022-02-04 19:15           ` Bhardwaj, Rajneesh
2022-02-07  7:24             ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.