Re: [PATCH v5 4/5] drm/amdgpu: address remove from fault filter

From: "Christian König" <ckoenig.leichtzumerken@gmail.com>
To: Felix Kuehling <felix.kuehling@amd.com>,
	Philip Yang <Philip.Yang@amd.com>,
	amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH v5 4/5] drm/amdgpu: address remove from fault filter
Date: Wed, 28 Apr 2021 11:00:45 +0200	[thread overview]
Message-ID: <47a28337-1abe-4f94-b96d-3ebba32a82aa@gmail.com> (raw)
In-Reply-To: <0ca90df0-a26a-2b62-4da6-6e49beaf052a@amd.com>

Am 28.04.21 um 09:53 schrieb Felix Kuehling:
> Am 2021-04-28 um 2:54 a.m. schrieb Christian König:
>> Am 27.04.21 um 20:21 schrieb Felix Kuehling:
>>> On 2021-04-27 10:51 a.m., Philip Yang wrote:
>>>> Add interface to remove address from fault filter ring by resetting
>>>> fault ring entry key, then future vm fault on the address will be
>>>> processed to recover.
>>>>
>>>> Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
>>>> to protect fault ring access by interrupt handler and interrupt
>>>> deferred
>>>> work for vg20. Change fault->timestamp to 48-bit to share same uint64_t
>>>> with 8-bit fault->next, it is enough for 48bit IH timestamp.
>>>>
>>>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48
>>>> ++++++++++++++++++++++---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++--
>>>>    2 files changed, 48 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>> index c39ed9eb0987..a2e81e913abb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
>>>> amdgpu_device *adev, struct amdgpu_gmc *mc)
>>>>                mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>>>>    }
>>>>    +/**
>>>> + * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
>>>> + *
>>>> + * @addr: 48bit physical address
>>>> + * @pasid: 4 bit
>>> This comment is misleading. The PASID is not 4-bit. It's 16-bit. But
>>> shifting the address by 4 bit is sufficient because the address is
>>> page-aligned, which means the low 12 bits are 0. So this would be
>>> more accurate:
>>>
>>> @addr: 48 bit physical address, page aligned (36 significant bits)
>>> @pasid: 16 bit process address space identifier
>>>
>>> With that fixed, the patch is
>>>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>
>>>
>>>> + */
>>>> +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t
>>>> pasid)
>>>> +{
>>>> +    return addr << 4 | pasid;
>>>> +}
>> Maybe changing this so that we have "addr * ((1 << 16) /
>> AMDGPU_PAGE_SIZE) | pasid" would help to better document that?
> I find this a mix of multiplication, division and bit-shift more
> confusing. How about "addr << (16 - AMDGPU_GPU_PAGE_SHIFT) | pasid"?

Yeah, that is even better.

Regards,
Christian.

>
> Regards,
>    Felix
>
>
>> Christian.
>>
>>>> +
>>>>    /**
>>>>     * amdgpu_gmc_filter_faults - filter VM faults
>>>>     *
>>>> @@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct
>>>> amdgpu_device *adev, uint64_t addr,
>>>>                      uint16_t pasid, uint64_t timestamp)
>>>>    {
>>>>        struct amdgpu_gmc *gmc = &adev->gmc;
>>>> -
>>>> -    uint64_t stamp, key = addr << 4 | pasid;
>>>> +    uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
>>>>        struct amdgpu_gmc_fault *fault;
>>>>        uint32_t hash;
>>>>    @@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct
>>>> amdgpu_device *adev, uint64_t addr,
>>>>        while (fault->timestamp >= stamp) {
>>>>            uint64_t tmp;
>>>>    -        if (fault->key == key)
>>>> +        if (atomic64_read(&fault->key) == key)
>>>>                return true;
>>>>              tmp = fault->timestamp;
>>>> @@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct
>>>> amdgpu_device *adev, uint64_t addr,
>>>>          /* Add the fault to the ring */
>>>>        fault = &gmc->fault_ring[gmc->last_fault];
>>>> -    fault->key = key;
>>>> +    atomic64_set(&fault->key, key);
>>>>        fault->timestamp = timestamp;
>>>>          /* And update the hash */
>>>> @@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct
>>>> amdgpu_device *adev, uint64_t addr,
>>>>        return false;
>>>>    }
>>>>    +/**
>>>> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults
>>>> filter
>>>> + *
>>>> + * @adev: amdgpu device structure
>>>> + * @addr: address of the VM fault
>>>> + * @pasid: PASID of the process causing the fault
>>>> + *
>>>> + * Remove the address from fault filter, then future vm fault on
>>>> this address
>>>> + * will pass to retry fault handler to recover.
>>>> + */
>>>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>>>> uint64_t addr,
>>>> +                     uint16_t pasid)
>>>> +{
>>>> +    struct amdgpu_gmc *gmc = &adev->gmc;
>>>> +    uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
>>>> +    struct amdgpu_gmc_fault *fault;
>>>> +    uint32_t hash;
>>>> +    uint64_t tmp;
>>>> +
>>>> +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>>>> +    fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
>>>> +    do {
>>>> +        if (atomic64_cmpxchg(&fault->key, key, 0) == key)
>>>> +            break;
>>>> +
>>>> +        tmp = fault->timestamp;
>>>> +        fault = &gmc->fault_ring[fault->next];
>>>> +    } while (fault->timestamp < tmp);
>>>> +}
>>>> +
>>>>    int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>>>    {
>>>>        int r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> index 9d11c02a3938..6aa1d52d3aee 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> @@ -66,9 +66,9 @@ struct firmware;
>>>>     * GMC page fault information
>>>>     */
>>>>    struct amdgpu_gmc_fault {
>>>> -    uint64_t    timestamp;
>>>> +    uint64_t    timestamp:48;
>>>>        uint64_t    next:AMDGPU_GMC_FAULT_RING_ORDER;
>>>> -    uint64_t    key:52;
>>>> +    atomic64_t    key;
>>>>    };
>>>>      /*
>>>> @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct
>>>> amdgpu_device *adev,
>>>>                     struct amdgpu_gmc *mc);
>>>>    bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t
>>>> addr,
>>>>                      uint16_t pasid, uint64_t timestamp);
>>>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>>>> uint64_t addr,
>>>> +                     uint16_t pasid);
>>>>    int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>>>    void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>>>    int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx