Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume

From: "Das, Nirmoy" <nirmoy.das@amd.com>
To: "Christian König" <christian.koenig@amd.com>,
	"Lazar, Lijo" <lijo.lazar@amd.com>,
	amd-gfx@lists.freedesktop.org
Cc: andrey.grodzovsky@amd.com
Subject: Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
Date: Wed, 20 Oct 2021 13:10:37 +0200	[thread overview]
Message-ID: <4a46999b-12f2-a1fc-10f6-68545c540f7e@amd.com> (raw)
In-Reply-To: <3cc14a7e-9bb6-4cf9-a1a8-e300b2b68072@amd.com>

On 10/20/2021 12:51 PM, Christian König wrote:
>
>
> Am 20.10.21 um 12:21 schrieb Das, Nirmoy:
>>
>> On 10/20/2021 12:15 PM, Lazar, Lijo wrote:
>>>
>>>
>>> On 10/20/2021 3:42 PM, Das, Nirmoy wrote:
>>>>
>>>> On 10/20/2021 12:03 PM, Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
>>>>>>
>>>>>> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>>>>>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>>>>>>> instead pin only once and try to recover gart content
>>>>>>>> at resume time. This is much more stable in case there
>>>>>>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>>>>>>> while evicting GART table.
>>>>>>>>
>>>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>>>>> ---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
>>>>>>>> ++++++++++++----------
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>>>>>>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> index 5807df52031c..f69e613805db 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct 
>>>>>>>> drm_device *dev, bool fbcon)
>>>>>>>>       amdgpu_fence_driver_hw_fini(adev);
>>>>>>>>
>>>>>>>>       amdgpu_device_ip_suspend_phase2(adev);
>>>>>>>> -    /* This second call to evict device resources is to evict
>>>>>>>> -     * the gart page table using the CPU.
>>>>>>>> -     */
>>>>>>>> -    amdgpu_device_evict_resources(adev);
>>>>>>>>
>>>>>>>>       return 0;
>>>>>>>>   }
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>>> index d3e4203f6217..97a9f61fa106 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>    *
>>>>>>>>    * @adev: amdgpu_device pointer
>>>>>>>>    *
>>>>>>>> - * Allocate video memory for GART page table
>>>>>>>> + * Allocate and pin video memory for GART page table
>>>>>>>>    * (pcie r4xx, r5xx+).  These asics require the
>>>>>>>>    * gart table to be in video memory.
>>>>>>>>    * Returns 0 for success, error for failure.
>>>>>>>>    */
>>>>>>>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>> +    struct amdgpu_bo_param bp;
>>>>>>>>       int r;
>>>>>>>>
>>>>>>>> -    if (adev->gart.bo == NULL) {
>>>>>>>> -        struct amdgpu_bo_param bp;
>>>>>>>> -
>>>>>>>> -        memset(&bp, 0, sizeof(bp));
>>>>>>>> -        bp.size = adev->gart.table_size;
>>>>>>>> -        bp.byte_align = PAGE_SIZE;
>>>>>>>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>>>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>>>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>>>> -        bp.type = ttm_bo_type_kernel;
>>>>>>>> -        bp.resv = NULL;
>>>>>>>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>>>> -
>>>>>>>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>>>> -        if (r) {
>>>>>>>> -            return r;
>>>>>>>> -        }
>>>>>>>> -    }
>>>>>>>> +    if (adev->gart.bo != NULL)
>>>>>>>> +        return 0;
>>>>>>>> +
>>>>>>>> +    memset(&bp, 0, sizeof(bp));
>>>>>>>> +    bp.size = adev->gart.table_size;
>>>>>>>> +    bp.byte_align = PAGE_SIZE;
>>>>>>>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>>>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>>>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>>>> +    bp.type = ttm_bo_type_kernel;
>>>>>>>> +    bp.resv = NULL;
>>>>>>>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>>>> +
>>>>>>>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>>>> +    if (r)
>>>>>>>> +        return r;
>>>>>>>> +
>>>>>>>> +    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> +    if (r)
>>>>>>>> +        return r;
>>>>>>>> +
>>>>>>>>       return 0;
>>>>>>>>   }
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           return -EINVAL;
>>>>>>>>       }
>>>>>>>>
>>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> -    if (r)
>>>>>>>> -        return r;
>>>>>>>> +    if (adev->in_suspend) {
>>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>>
>>>>>>> When the existing usage of this function is checked, this is 
>>>>>>> called during reset recovery after ip resume phase1. Can't the 
>>>>>>> same thing be done in ip_resume() to place this after phase1 
>>>>>>> resume instead of repeating in every IP version?
>>>>>>
>>>>>>
>>>>>> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
>>>>>> gmc_v10_0_gart_enable() seems to be correct place  to do this
>>>>>>
>>>>>> gart specific work.
>>>>>>
>>>>>
>>>>> I see. In that case probably the patch needs to change other call 
>>>>> places also as this step is already taken care in gart enable.
>>>>
>>>>
>>>> Do you mean amdgpu_do_asic_reset() ?
>>>>
>>>
>>> Yes, and saw it called in one more place related to sriov reset 
>>> (didn't track the sriov reset path though).
>>
>>
>> True, hmm looks like this patch going to need multiple tested-by tags 
>> for gfx6,7 and sriov. I only have gfx8,9,10.
>
> You also need to test this on APUs as well, when it works won 
> Raven/gfx9 I'm pretty sure it will work on other generations as well 
> (except for typos of course).

I have a raven APU. I will test on that as well.

Regards,

Nirmoy

>
> Christian.
>
>>
>>
>> Regards,
>>
>> Nirmoy
>>
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>>
>>>> Nirmoy
>>>>
>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> Nirmoy
>>>>>>
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>>
>>>>>>>> +        if (r)
>>>>>>>> +            return r;
>>>>>>>> +    }
>>>>>>>>
>>>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>>>       if (r)
>>>>>>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   static int gmc_v10_0_hw_fini(void *handle)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>>> index 0a50fdaced7e..02e90d9443c1 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>>>           return -EINVAL;
>>>>>>>>       }
>>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> -    if (r)
>>>>>>>> -        return r;
>>>>>>>> +
>>>>>>>> +    if (adev->in_suspend) {
>>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>>> +        if (r)
>>>>>>>> +            return r;
>>>>>>>> +    }
>>>>>>>>
>>>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>>>
>>>>>>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>>> index 492ebed2915b..dc2577e37688 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>>>           return -EINVAL;
>>>>>>>>       }
>>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> -    if (r)
>>>>>>>> -        return r;
>>>>>>>> +
>>>>>>>> +    if (adev->in_suspend) {
>>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>>> +        if (r)
>>>>>>>> +            return r;
>>>>>>>> +    }
>>>>>>>>
>>>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>>>
>>>>>>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>>> index cb82404df534..732d91dbf449 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           return -EINVAL;
>>>>>>>>       }
>>>>>>>>
>>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> -    if (r)
>>>>>>>> -        return r;
>>>>>>>> +    if (adev->in_suspend) {
>>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>>> +        if (r)
>>>>>>>> +            return r;
>>>>>>>> +    }
>>>>>>>>
>>>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>>>       if (r)
>>>>>>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   static int gmc_v9_0_hw_fini(void *handle)
>>>>>>>> -- 
>>>>>>>> 2.32.0
>>>>>>>>
>