All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 4/5] drm/amdgpu: address remove from fault filter
@ 2021-04-23 15:35 Philip Yang
  2021-04-23 15:35 ` [PATCH 5/5] drm/amdkfd: enable subsequent retry fault Philip Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Philip Yang @ 2021-04-23 15:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

Add interface to remove address from fault filter ring by resetting
fault ring entry key, then future vm fault on the address will be
processed to recover.

Use spinlock to protect fault hash ring access by interrupt handler and
interrupt scheduled deferred work for vg20.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 66 +++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  1 +
 4 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index c39ed9eb0987..91106b59389f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
 			mc->agp_size >> 20, mc->agp_start, mc->agp_end);
 }
 
+/**
+ * fault_key - get 52bit hask key from vm fault address and pasid
+ *
+ * @addr: 48bit physical address
+ * @pasid: 4 bit
+ */
+static inline uint64_t fault_key(uint64_t addr, uint16_t pasid)
+{
+	return addr << 4 | pasid;
+}
+
 /**
  * amdgpu_gmc_filter_faults - filter VM faults
  *
@@ -349,15 +360,20 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 {
 	struct amdgpu_gmc *gmc = &adev->gmc;
 
-	uint64_t stamp, key = addr << 4 | pasid;
+	uint64_t stamp, key = fault_key(addr, pasid);
 	struct amdgpu_gmc_fault *fault;
+	unsigned long flags;
 	uint32_t hash;
 
 	/* If we don't have space left in the ring buffer return immediately */
 	stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
 		AMDGPU_GMC_FAULT_TIMEOUT;
-	if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
+
+	spin_lock_irqsave(&gmc->fault_lock, flags);
+	if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp) {
+		spin_unlock_irqrestore(&gmc->fault_lock, flags);
 		return true;
+	}
 
 	/* Try to find the fault in the hash */
 	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
@@ -365,8 +381,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 	while (fault->timestamp >= stamp) {
 		uint64_t tmp;
 
-		if (fault->key == key)
+		if (fault->key == key) {
+			spin_unlock_irqrestore(&gmc->fault_lock, flags);
 			return true;
+		}
 
 		tmp = fault->timestamp;
 		fault = &gmc->fault_ring[fault->next];
@@ -384,9 +402,51 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 	/* And update the hash */
 	fault->next = gmc->fault_hash[hash].idx;
 	gmc->fault_hash[hash].idx = gmc->last_fault++;
+	spin_unlock_irqrestore(&gmc->fault_lock, flags);
 	return false;
 }
 
+/**
+ * amdgpu_gmc_filter_faults_remove - remove address from VM faults filter
+ *
+ * @adev: amdgpu device structure
+ * @addr: address of the VM fault
+ * @pasid: PASID of the process causing the fault
+ *
+ * Remove the address from fault filter, then future vm fault on this address
+ * will pass to retry fault handler to recover.
+ */
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
+				     uint16_t pasid)
+{
+	struct amdgpu_gmc *gmc = &adev->gmc;
+
+	uint64_t key = fault_key(addr, pasid);
+	struct amdgpu_gmc_fault *fault;
+	unsigned long flags;
+	uint32_t hash;
+
+	spin_lock_irqsave(&gmc->fault_lock, flags);
+	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
+	fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
+	while (true) {
+		uint64_t tmp;
+
+		if (fault->key == key) {
+			fault->key = fault_key(0, 0);
+			break;
+		}
+
+		tmp = fault->timestamp;
+		fault = &gmc->fault_ring[fault->next];
+
+		/* Check if the entry was reused */
+		if (fault->timestamp >= tmp)
+			break;
+	}
+	spin_unlock_irqrestore(&gmc->fault_lock, flags);
+}
+
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 {
 	int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 9d11c02a3938..0aae3bd01bf2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -246,6 +246,7 @@ struct amdgpu_gmc {
 		uint64_t	idx:AMDGPU_GMC_FAULT_RING_ORDER;
 	} fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
 	uint64_t		last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
+	spinlock_t		fault_lock;
 
 	bool tmz_enabled;
 
@@ -318,6 +319,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
 			     struct amdgpu_gmc *mc);
 bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 			      uint16_t pasid, uint64_t timestamp);
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
+				     uint16_t pasid);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 498b28a35f5b..7416ad874652 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -839,6 +839,7 @@ static int gmc_v10_0_sw_init(void *handle)
 	adev->mmhub.funcs->init(adev);
 
 	spin_lock_init(&adev->gmc.invalidate_lock);
+	spin_lock_init(&adev->gmc.fault_lock);
 
 	if ((adev->flags & AMD_IS_APU) && amdgpu_emu_mode == 1) {
 		adev->gmc.vram_type = AMDGPU_VRAM_TYPE_DDR4;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 4da8b3d28af2..3290b259a372 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1444,6 +1444,7 @@ static int gmc_v9_0_sw_init(void *handle)
 	adev->mmhub.funcs->init(adev);
 
 	spin_lock_init(&adev->gmc.invalidate_lock);
+	spin_lock_init(&adev->gmc.fault_lock);
 
 	r = amdgpu_atomfirmware_get_vram_info(adev,
 		&vram_width, &vram_type, &vram_vendor);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 5/5] drm/amdkfd: enable subsequent retry fault
  2021-04-23 15:35 [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Philip Yang
@ 2021-04-23 15:35 ` Philip Yang
  2021-04-23 16:28 ` [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Christian König
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Philip Yang @ 2021-04-23 15:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

After draining the stale retry fault, or failed to validate the range
to recover, have to remove the fault address from fault filter ring, to
be able to handle subsequent retry interrupt on same address. Otherwise
the retry fault will not be processed to recover until timeout passed.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 6490f016aa6e..5ed7dda2d14f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2363,8 +2363,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 
 	mutex_lock(&prange->migrate_mutex);
 
-	if (svm_range_skip_recover(prange))
+	if (svm_range_skip_recover(prange)) {
+		amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
 		goto out_unlock_range;
+	}
 
 	timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
 	/* skip duplicate vm fault on different pages of same range */
@@ -2426,6 +2428,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 
 	if (r == -EAGAIN) {
 		pr_debug("recover vm fault later\n");
+		amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
 		r = 0;
 	}
 	return r;
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v3 4/5] drm/amdgpu: address remove from fault filter
  2021-04-23 15:35 [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Philip Yang
  2021-04-23 15:35 ` [PATCH 5/5] drm/amdkfd: enable subsequent retry fault Philip Yang
@ 2021-04-23 16:28 ` Christian König
  2021-04-24  4:50   ` Felix Kuehling
  2021-04-26 21:26 ` [PATCH v4 " Philip Yang
  2021-04-27 14:51 ` [PATCH v5 " Philip Yang
  3 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2021-04-23 16:28 UTC (permalink / raw)
  To: Philip Yang, amd-gfx



Am 23.04.21 um 17:35 schrieb Philip Yang:
> Add interface to remove address from fault filter ring by resetting
> fault ring entry key, then future vm fault on the address will be
> processed to recover.
>
> Use spinlock to protect fault hash ring access by interrupt handler and
> interrupt scheduled deferred work for vg20.
>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 66 +++++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  1 +
>   4 files changed, 68 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index c39ed9eb0987..91106b59389f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
>   			mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>   }
>   
> +/**
> + * fault_key - get 52bit hask key from vm fault address and pasid
> + *
> + * @addr: 48bit physical address
> + * @pasid: 4 bit
> + */
> +static inline uint64_t fault_key(uint64_t addr, uint16_t pasid)

Please prefix with amdgpu_gmc_

> +{
> +	return addr << 4 | pasid;
> +}
> +
>   /**
>    * amdgpu_gmc_filter_faults - filter VM faults
>    *
> @@ -349,15 +360,20 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   {
>   	struct amdgpu_gmc *gmc = &adev->gmc;
>   
> -	uint64_t stamp, key = addr << 4 | pasid;
> +	uint64_t stamp, key = fault_key(addr, pasid);
>   	struct amdgpu_gmc_fault *fault;
> +	unsigned long flags;
>   	uint32_t hash;
>   
>   	/* If we don't have space left in the ring buffer return immediately */
>   	stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
>   		AMDGPU_GMC_FAULT_TIMEOUT;
> -	if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
> +
> +	spin_lock_irqsave(&gmc->fault_lock, flags);

Uff the spinlock adds quite some overhead here. I'm still wondering if 
we can't somehow avoid this.

Christian.

> +	if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp) {
> +		spin_unlock_irqrestore(&gmc->fault_lock, flags);
>   		return true;
> +	}
>   
>   	/* Try to find the fault in the hash */
>   	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
> @@ -365,8 +381,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   	while (fault->timestamp >= stamp) {
>   		uint64_t tmp;
>   
> -		if (fault->key == key)
> +		if (fault->key == key) {
> +			spin_unlock_irqrestore(&gmc->fault_lock, flags);
>   			return true;
> +		}
>   
>   		tmp = fault->timestamp;
>   		fault = &gmc->fault_ring[fault->next];
> @@ -384,9 +402,51 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   	/* And update the hash */
>   	fault->next = gmc->fault_hash[hash].idx;
>   	gmc->fault_hash[hash].idx = gmc->last_fault++;
> +	spin_unlock_irqrestore(&gmc->fault_lock, flags);
>   	return false;
>   }
>   
> +/**
> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults filter
> + *
> + * @adev: amdgpu device structure
> + * @addr: address of the VM fault
> + * @pasid: PASID of the process causing the fault
> + *
> + * Remove the address from fault filter, then future vm fault on this address
> + * will pass to retry fault handler to recover.
> + */
> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
> +				     uint16_t pasid)
> +{
> +	struct amdgpu_gmc *gmc = &adev->gmc;
> +
> +	uint64_t key = fault_key(addr, pasid);
> +	struct amdgpu_gmc_fault *fault;
> +	unsigned long flags;
> +	uint32_t hash;
> +
> +	spin_lock_irqsave(&gmc->fault_lock, flags);
> +	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
> +	fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
> +	while (true) {
> +		uint64_t tmp;
> +
> +		if (fault->key == key) {
> +			fault->key = fault_key(0, 0);
> +			break;
> +		}
> +
> +		tmp = fault->timestamp;
> +		fault = &gmc->fault_ring[fault->next];
> +
> +		/* Check if the entry was reused */
> +		if (fault->timestamp >= tmp)
> +			break;
> +	}
> +	spin_unlock_irqrestore(&gmc->fault_lock, flags);
> +}
> +
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   {
>   	int r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 9d11c02a3938..0aae3bd01bf2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -246,6 +246,7 @@ struct amdgpu_gmc {
>   		uint64_t	idx:AMDGPU_GMC_FAULT_RING_ORDER;
>   	} fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
>   	uint64_t		last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
> +	spinlock_t		fault_lock;
>   
>   	bool tmz_enabled;
>   
> @@ -318,6 +319,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
>   			     struct amdgpu_gmc *mc);
>   bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   			      uint16_t pasid, uint64_t timestamp);
> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
> +				     uint16_t pasid);
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 498b28a35f5b..7416ad874652 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -839,6 +839,7 @@ static int gmc_v10_0_sw_init(void *handle)
>   	adev->mmhub.funcs->init(adev);
>   
>   	spin_lock_init(&adev->gmc.invalidate_lock);
> +	spin_lock_init(&adev->gmc.fault_lock);
>   
>   	if ((adev->flags & AMD_IS_APU) && amdgpu_emu_mode == 1) {
>   		adev->gmc.vram_type = AMDGPU_VRAM_TYPE_DDR4;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 4da8b3d28af2..3290b259a372 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1444,6 +1444,7 @@ static int gmc_v9_0_sw_init(void *handle)
>   	adev->mmhub.funcs->init(adev);
>   
>   	spin_lock_init(&adev->gmc.invalidate_lock);
> +	spin_lock_init(&adev->gmc.fault_lock);
>   
>   	r = amdgpu_atomfirmware_get_vram_info(adev,
>   		&vram_width, &vram_type, &vram_vendor);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3 4/5] drm/amdgpu: address remove from fault filter
  2021-04-23 16:28 ` [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Christian König
@ 2021-04-24  4:50   ` Felix Kuehling
  0 siblings, 0 replies; 14+ messages in thread
From: Felix Kuehling @ 2021-04-24  4:50 UTC (permalink / raw)
  To: Christian König, Philip Yang, amd-gfx

Am 2021-04-23 um 12:28 p.m. schrieb Christian König:
>
>
> Am 23.04.21 um 17:35 schrieb Philip Yang:
>> Add interface to remove address from fault filter ring by resetting
>> fault ring entry key, then future vm fault on the address will be
>> processed to recover.
>>
>> Use spinlock to protect fault hash ring access by interrupt handler and
>> interrupt scheduled deferred work for vg20.
>>
>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 66 +++++++++++++++++++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 ++
>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 +
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  1 +
>>   4 files changed, 68 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> index c39ed9eb0987..91106b59389f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
>> amdgpu_device *adev, struct amdgpu_gmc *mc)
>>               mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>>   }
>>   +/**
>> + * fault_key - get 52bit hask key from vm fault address and pasid
>> + *
>> + * @addr: 48bit physical address
>> + * @pasid: 4 bit
>> + */
>> +static inline uint64_t fault_key(uint64_t addr, uint16_t pasid)
>
> Please prefix with amdgpu_gmc_
>
>> +{
>> +    return addr << 4 | pasid;
>> +}
>> +
>>   /**
>>    * amdgpu_gmc_filter_faults - filter VM faults
>>    *
>> @@ -349,15 +360,20 @@ bool amdgpu_gmc_filter_faults(struct
>> amdgpu_device *adev, uint64_t addr,
>>   {
>>       struct amdgpu_gmc *gmc = &adev->gmc;
>>   -    uint64_t stamp, key = addr << 4 | pasid;
>> +    uint64_t stamp, key = fault_key(addr, pasid);
>>       struct amdgpu_gmc_fault *fault;
>> +    unsigned long flags;
>>       uint32_t hash;
>>         /* If we don't have space left in the ring buffer return
>> immediately */
>>       stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
>>           AMDGPU_GMC_FAULT_TIMEOUT;
>> -    if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
>> +
>> +    spin_lock_irqsave(&gmc->fault_lock, flags);
>
> Uff the spinlock adds quite some overhead here. I'm still wondering if
> we can't somehow avoid this.

Does it? I guess the irqsave version has overhead because it needs to
disable and reenable interrupts. Other than that, spin locks should be
fairly low overhead, especially in the common (uncontested) case.

If we want to use atomic ops to update the key, it has to be a uint64_t,
not a bitfield. Maybe we could steal 8 bits for the "next" field from
the timestamp instead.

Regards,
  Felix


>
> Christian.
>
>> +    if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp) {
>> +        spin_unlock_irqrestore(&gmc->fault_lock, flags);
>>           return true;
>> +    }
>>         /* Try to find the fault in the hash */
>>       hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>> @@ -365,8 +381,10 @@ bool amdgpu_gmc_filter_faults(struct
>> amdgpu_device *adev, uint64_t addr,
>>       while (fault->timestamp >= stamp) {
>>           uint64_t tmp;
>>   -        if (fault->key == key)
>> +        if (fault->key == key) {
>> +            spin_unlock_irqrestore(&gmc->fault_lock, flags);
>>               return true;
>> +        }
>>             tmp = fault->timestamp;
>>           fault = &gmc->fault_ring[fault->next];
>> @@ -384,9 +402,51 @@ bool amdgpu_gmc_filter_faults(struct
>> amdgpu_device *adev, uint64_t addr,
>>       /* And update the hash */
>>       fault->next = gmc->fault_hash[hash].idx;
>>       gmc->fault_hash[hash].idx = gmc->last_fault++;
>> +    spin_unlock_irqrestore(&gmc->fault_lock, flags);
>>       return false;
>>   }
>>   +/**
>> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults
>> filter
>> + *
>> + * @adev: amdgpu device structure
>> + * @addr: address of the VM fault
>> + * @pasid: PASID of the process causing the fault
>> + *
>> + * Remove the address from fault filter, then future vm fault on
>> this address
>> + * will pass to retry fault handler to recover.
>> + */
>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>> uint64_t addr,
>> +                     uint16_t pasid)
>> +{
>> +    struct amdgpu_gmc *gmc = &adev->gmc;
>> +
>> +    uint64_t key = fault_key(addr, pasid);
>> +    struct amdgpu_gmc_fault *fault;
>> +    unsigned long flags;
>> +    uint32_t hash;
>> +
>> +    spin_lock_irqsave(&gmc->fault_lock, flags);
>> +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>> +    fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
>> +    while (true) {
>> +        uint64_t tmp;
>> +
>> +        if (fault->key == key) {
>> +            fault->key = fault_key(0, 0);
>> +            break;
>> +        }
>> +
>> +        tmp = fault->timestamp;
>> +        fault = &gmc->fault_ring[fault->next];
>> +
>> +        /* Check if the entry was reused */
>> +        if (fault->timestamp >= tmp)
>> +            break;
>> +    }
>> +    spin_unlock_irqrestore(&gmc->fault_lock, flags);
>> +}
>> +
>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>   {
>>       int r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index 9d11c02a3938..0aae3bd01bf2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -246,6 +246,7 @@ struct amdgpu_gmc {
>>           uint64_t    idx:AMDGPU_GMC_FAULT_RING_ORDER;
>>       } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
>>       uint64_t        last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
>> +    spinlock_t        fault_lock;
>>         bool tmz_enabled;
>>   @@ -318,6 +319,8 @@ void amdgpu_gmc_agp_location(struct
>> amdgpu_device *adev,
>>                    struct amdgpu_gmc *mc);
>>   bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t
>> addr,
>>                     uint16_t pasid, uint64_t timestamp);
>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>> uint64_t addr,
>> +                     uint16_t pasid);
>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 498b28a35f5b..7416ad874652 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -839,6 +839,7 @@ static int gmc_v10_0_sw_init(void *handle)
>>       adev->mmhub.funcs->init(adev);
>>         spin_lock_init(&adev->gmc.invalidate_lock);
>> +    spin_lock_init(&adev->gmc.fault_lock);
>>         if ((adev->flags & AMD_IS_APU) && amdgpu_emu_mode == 1) {
>>           adev->gmc.vram_type = AMDGPU_VRAM_TYPE_DDR4;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index 4da8b3d28af2..3290b259a372 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -1444,6 +1444,7 @@ static int gmc_v9_0_sw_init(void *handle)
>>       adev->mmhub.funcs->init(adev);
>>         spin_lock_init(&adev->gmc.invalidate_lock);
>> +    spin_lock_init(&adev->gmc.fault_lock);
>>         r = amdgpu_atomfirmware_get_vram_info(adev,
>>           &vram_width, &vram_type, &vram_vendor);
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v4 4/5] drm/amdgpu: address remove from fault filter
  2021-04-23 15:35 [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Philip Yang
  2021-04-23 15:35 ` [PATCH 5/5] drm/amdkfd: enable subsequent retry fault Philip Yang
  2021-04-23 16:28 ` [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Christian König
@ 2021-04-26 21:26 ` Philip Yang
  2021-04-26 21:26   ` [PATCH 5/5] drm/amdkfd: enable subsequent retry fault Philip Yang
  2021-04-27  7:25   ` [PATCH v4 4/5] drm/amdgpu: address remove from fault filter Christian König
  2021-04-27 14:51 ` [PATCH v5 " Philip Yang
  3 siblings, 2 replies; 14+ messages in thread
From: Philip Yang @ 2021-04-26 21:26 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

Add interface to remove address from fault filter ring by resetting
fault ring entry key, then future vm fault on the address will be
processed to recover.

Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
to protect fault ring access by interrupt handler and interrupt deferred
work for vg20. Change fault->timestamp to 56-bit to share same uint64_t
with 8-bit fault->next, it is big enough for 48bit IH timestamp.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 54 +++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++-
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index c39ed9eb0987..888b749bd75e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
 			mc->agp_size >> 20, mc->agp_start, mc->agp_end);
 }
 
+/**
+ * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
+ *
+ * @addr: 48bit physical address
+ * @pasid: 4 bit
+ */
+static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
+{
+	return addr << 4 | pasid;
+}
+
 /**
  * amdgpu_gmc_filter_faults - filter VM faults
  *
@@ -349,13 +360,14 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 {
 	struct amdgpu_gmc *gmc = &adev->gmc;
 
-	uint64_t stamp, key = addr << 4 | pasid;
+	uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
 	struct amdgpu_gmc_fault *fault;
 	uint32_t hash;
 
 	/* If we don't have space left in the ring buffer return immediately */
 	stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
 		AMDGPU_GMC_FAULT_TIMEOUT;
+
 	if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
 		return true;
 
@@ -365,7 +377,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 	while (fault->timestamp >= stamp) {
 		uint64_t tmp;
 
-		if (fault->key == key)
+		if (atomic64_read(&fault->key) == key)
 			return true;
 
 		tmp = fault->timestamp;
@@ -378,7 +390,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 
 	/* Add the fault to the ring */
 	fault = &gmc->fault_ring[gmc->last_fault];
-	fault->key = key;
+	atomic64_set(&fault->key, key);
 	fault->timestamp = timestamp;
 
 	/* And update the hash */
@@ -387,6 +399,42 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 	return false;
 }
 
+/**
+ * amdgpu_gmc_filter_faults_remove - remove address from VM faults filter
+ *
+ * @adev: amdgpu device structure
+ * @addr: address of the VM fault
+ * @pasid: PASID of the process causing the fault
+ *
+ * Remove the address from fault filter, then future vm fault on this address
+ * will pass to retry fault handler to recover.
+ */
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
+				     uint16_t pasid)
+{
+	struct amdgpu_gmc *gmc = &adev->gmc;
+
+	uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
+	struct amdgpu_gmc_fault *fault;
+	uint32_t hash;
+
+	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
+	fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
+	while (true) {
+		uint64_t tmp;
+
+		if (atomic64_cmpxchg(&fault->key, key, 0) == key)
+			break;
+
+		tmp = fault->timestamp;
+		fault = &gmc->fault_ring[fault->next];
+
+		/* Check if the entry was reused */
+		if (fault->timestamp >= tmp)
+			break;
+	}
+}
+
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 {
 	int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 9d11c02a3938..95e18ef83aec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -66,9 +66,9 @@ struct firmware;
  * GMC page fault information
  */
 struct amdgpu_gmc_fault {
-	uint64_t	timestamp;
+	uint64_t	timestamp:56;
 	uint64_t	next:AMDGPU_GMC_FAULT_RING_ORDER;
-	uint64_t	key:52;
+	atomic64_t	key;
 };
 
 /*
@@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
 			     struct amdgpu_gmc *mc);
 bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 			      uint16_t pasid, uint64_t timestamp);
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
+				     uint16_t pasid);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 5/5] drm/amdkfd: enable subsequent retry fault
  2021-04-26 21:26 ` [PATCH v4 " Philip Yang
@ 2021-04-26 21:26   ` Philip Yang
  2021-04-27  7:25   ` [PATCH v4 4/5] drm/amdgpu: address remove from fault filter Christian König
  1 sibling, 0 replies; 14+ messages in thread
From: Philip Yang @ 2021-04-26 21:26 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

After draining the stale retry fault, or failed to validate the range
to recover, have to remove the fault address from fault filter ring, to
be able to handle subsequent retry interrupt on same address. Otherwise
the retry fault will not be processed to recover until timeout passed.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 00d759b257f4..d9111fea724b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2363,8 +2363,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 
 	mutex_lock(&prange->migrate_mutex);
 
-	if (svm_range_skip_recover(prange))
+	if (svm_range_skip_recover(prange)) {
+		amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
 		goto out_unlock_range;
+	}
 
 	timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
 	/* skip duplicate vm fault on different pages of same range */
@@ -2426,6 +2428,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 
 	if (r == -EAGAIN) {
 		pr_debug("recover vm fault later\n");
+		amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
 		r = 0;
 	}
 	return r;
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v4 4/5] drm/amdgpu: address remove from fault filter
  2021-04-26 21:26 ` [PATCH v4 " Philip Yang
  2021-04-26 21:26   ` [PATCH 5/5] drm/amdkfd: enable subsequent retry fault Philip Yang
@ 2021-04-27  7:25   ` Christian König
  2021-04-27 14:12     ` philip yang
  1 sibling, 1 reply; 14+ messages in thread
From: Christian König @ 2021-04-27  7:25 UTC (permalink / raw)
  To: Philip Yang, amd-gfx

Am 26.04.21 um 23:26 schrieb Philip Yang:
> Add interface to remove address from fault filter ring by resetting
> fault ring entry key, then future vm fault on the address will be
> processed to recover.
>
> Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
> to protect fault ring access by interrupt handler and interrupt deferred
> work for vg20. Change fault->timestamp to 56-bit to share same uint64_t
> with 8-bit fault->next, it is big enough for 48bit IH timestamp.
>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 54 +++++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++-
>   2 files changed, 55 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index c39ed9eb0987..888b749bd75e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
>   			mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>   }
>   
> +/**
> + * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
> + *
> + * @addr: 48bit physical address
> + * @pasid: 4 bit
> + */
> +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
> +{
> +	return addr << 4 | pasid;
> +}
> +
>   /**
>    * amdgpu_gmc_filter_faults - filter VM faults
>    *
> @@ -349,13 +360,14 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   {
>   	struct amdgpu_gmc *gmc = &adev->gmc;
>   
> -	uint64_t stamp, key = addr << 4 | pasid;
> +	uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
>   	struct amdgpu_gmc_fault *fault;
>   	uint32_t hash;
>   
>   	/* If we don't have space left in the ring buffer return immediately */
>   	stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
>   		AMDGPU_GMC_FAULT_TIMEOUT;
> +
>   	if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
>   		return true;
>   
> @@ -365,7 +377,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   	while (fault->timestamp >= stamp) {
>   		uint64_t tmp;
>   
> -		if (fault->key == key)
> +		if (atomic64_read(&fault->key) == key)
>   			return true;
>   
>   		tmp = fault->timestamp;
> @@ -378,7 +390,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   
>   	/* Add the fault to the ring */
>   	fault = &gmc->fault_ring[gmc->last_fault];
> -	fault->key = key;
> +	atomic64_set(&fault->key, key);
>   	fault->timestamp = timestamp;
>   
>   	/* And update the hash */
> @@ -387,6 +399,42 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   	return false;
>   }
>   
> +/**
> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults filter
> + *
> + * @adev: amdgpu device structure
> + * @addr: address of the VM fault
> + * @pasid: PASID of the process causing the fault
> + *
> + * Remove the address from fault filter, then future vm fault on this address
> + * will pass to retry fault handler to recover.
> + */
> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
> +				     uint16_t pasid)
> +{
> +	struct amdgpu_gmc *gmc = &adev->gmc;
> +
> +	uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
> +	struct amdgpu_gmc_fault *fault;
> +	uint32_t hash;
> +
> +	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
> +	fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
> +	while (true) {
> +		uint64_t tmp;
> +
> +		if (atomic64_cmpxchg(&fault->key, key, 0) == key)
> +			break;
> +
> +		tmp = fault->timestamp;
> +		fault = &gmc->fault_ring[fault->next];
> +
> +		/* Check if the entry was reused */
> +		if (fault->timestamp >= tmp)
> +			break;
> +	}

Maybe rewrite this as "do { ... } while (fault->timestamp < tmp)".

> +}
> +
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   {
>   	int r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 9d11c02a3938..95e18ef83aec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -66,9 +66,9 @@ struct firmware;
>    * GMC page fault information
>    */
>   struct amdgpu_gmc_fault {
> -	uint64_t	timestamp;
> +	uint64_t	timestamp:56;

I think 48 bits should be enough for the timestamp for current hardware.

Apart from that looks good to me now,
Christian.

>   	uint64_t	next:AMDGPU_GMC_FAULT_RING_ORDER;
> -	uint64_t	key:52;
> +	atomic64_t	key;
>   };
>   
>   /*
> @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
>   			     struct amdgpu_gmc *mc);
>   bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   			      uint16_t pasid, uint64_t timestamp);
> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
> +				     uint16_t pasid);
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4 4/5] drm/amdgpu: address remove from fault filter
  2021-04-27  7:25   ` [PATCH v4 4/5] drm/amdgpu: address remove from fault filter Christian König
@ 2021-04-27 14:12     ` philip yang
  0 siblings, 0 replies; 14+ messages in thread
From: philip yang @ 2021-04-27 14:12 UTC (permalink / raw)
  To: Christian König, Philip Yang, amd-gfx

[-- Attachment #1: Type: text/html, Size: 11034 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v5 4/5] drm/amdgpu: address remove from fault filter
  2021-04-23 15:35 [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Philip Yang
                   ` (2 preceding siblings ...)
  2021-04-26 21:26 ` [PATCH v4 " Philip Yang
@ 2021-04-27 14:51 ` Philip Yang
  2021-04-27 18:21   ` Felix Kuehling
  3 siblings, 1 reply; 14+ messages in thread
From: Philip Yang @ 2021-04-27 14:51 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

Add interface to remove address from fault filter ring by resetting
fault ring entry key, then future vm fault on the address will be
processed to recover.

Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
to protect fault ring access by interrupt handler and interrupt deferred
work for vg20. Change fault->timestamp to 48-bit to share same uint64_t
with 8-bit fault->next, it is enough for 48bit IH timestamp.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48 ++++++++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++--
 2 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index c39ed9eb0987..a2e81e913abb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
 			mc->agp_size >> 20, mc->agp_start, mc->agp_end);
 }
 
+/**
+ * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
+ *
+ * @addr: 48bit physical address
+ * @pasid: 4 bit
+ */
+static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
+{
+	return addr << 4 | pasid;
+}
+
 /**
  * amdgpu_gmc_filter_faults - filter VM faults
  *
@@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 			      uint16_t pasid, uint64_t timestamp)
 {
 	struct amdgpu_gmc *gmc = &adev->gmc;
-
-	uint64_t stamp, key = addr << 4 | pasid;
+	uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
 	struct amdgpu_gmc_fault *fault;
 	uint32_t hash;
 
@@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 	while (fault->timestamp >= stamp) {
 		uint64_t tmp;
 
-		if (fault->key == key)
+		if (atomic64_read(&fault->key) == key)
 			return true;
 
 		tmp = fault->timestamp;
@@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 
 	/* Add the fault to the ring */
 	fault = &gmc->fault_ring[gmc->last_fault];
-	fault->key = key;
+	atomic64_set(&fault->key, key);
 	fault->timestamp = timestamp;
 
 	/* And update the hash */
@@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 	return false;
 }
 
+/**
+ * amdgpu_gmc_filter_faults_remove - remove address from VM faults filter
+ *
+ * @adev: amdgpu device structure
+ * @addr: address of the VM fault
+ * @pasid: PASID of the process causing the fault
+ *
+ * Remove the address from fault filter, then future vm fault on this address
+ * will pass to retry fault handler to recover.
+ */
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
+				     uint16_t pasid)
+{
+	struct amdgpu_gmc *gmc = &adev->gmc;
+	uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
+	struct amdgpu_gmc_fault *fault;
+	uint32_t hash;
+	uint64_t tmp;
+
+	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
+	fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
+	do {
+		if (atomic64_cmpxchg(&fault->key, key, 0) == key)
+			break;
+
+		tmp = fault->timestamp;
+		fault = &gmc->fault_ring[fault->next];
+	} while (fault->timestamp < tmp);
+}
+
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 {
 	int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 9d11c02a3938..6aa1d52d3aee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -66,9 +66,9 @@ struct firmware;
  * GMC page fault information
  */
 struct amdgpu_gmc_fault {
-	uint64_t	timestamp;
+	uint64_t	timestamp:48;
 	uint64_t	next:AMDGPU_GMC_FAULT_RING_ORDER;
-	uint64_t	key:52;
+	atomic64_t	key;
 };
 
 /*
@@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
 			     struct amdgpu_gmc *mc);
 bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
 			      uint16_t pasid, uint64_t timestamp);
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
+				     uint16_t pasid);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 4/5] drm/amdgpu: address remove from fault filter
  2021-04-27 14:51 ` [PATCH v5 " Philip Yang
@ 2021-04-27 18:21   ` Felix Kuehling
  2021-04-28  6:54     ` Christian König
  0 siblings, 1 reply; 14+ messages in thread
From: Felix Kuehling @ 2021-04-27 18:21 UTC (permalink / raw)
  To: Philip Yang, amd-gfx

On 2021-04-27 10:51 a.m., Philip Yang wrote:
> Add interface to remove address from fault filter ring by resetting
> fault ring entry key, then future vm fault on the address will be
> processed to recover.
>
> Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
> to protect fault ring access by interrupt handler and interrupt deferred
> work for vg20. Change fault->timestamp to 48-bit to share same uint64_t
> with 8-bit fault->next, it is enough for 48bit IH timestamp.
>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48 ++++++++++++++++++++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++--
>   2 files changed, 48 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index c39ed9eb0987..a2e81e913abb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
>   			mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>   }
>   
> +/**
> + * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
> + *
> + * @addr: 48bit physical address
> + * @pasid: 4 bit

This comment is misleading. The PASID is not 4-bit. It's 16-bit. But 
shifting the address by 4 bit is sufficient because the address is 
page-aligned, which means the low 12 bits are 0. So this would be more 
accurate:

@addr: 48 bit physical address, page aligned (36 significant bits)
@pasid: 16 bit process address space identifier

With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> + */
> +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
> +{
> +	return addr << 4 | pasid;
> +}
> +
>   /**
>    * amdgpu_gmc_filter_faults - filter VM faults
>    *
> @@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   			      uint16_t pasid, uint64_t timestamp)
>   {
>   	struct amdgpu_gmc *gmc = &adev->gmc;
> -
> -	uint64_t stamp, key = addr << 4 | pasid;
> +	uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
>   	struct amdgpu_gmc_fault *fault;
>   	uint32_t hash;
>   
> @@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   	while (fault->timestamp >= stamp) {
>   		uint64_t tmp;
>   
> -		if (fault->key == key)
> +		if (atomic64_read(&fault->key) == key)
>   			return true;
>   
>   		tmp = fault->timestamp;
> @@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   
>   	/* Add the fault to the ring */
>   	fault = &gmc->fault_ring[gmc->last_fault];
> -	fault->key = key;
> +	atomic64_set(&fault->key, key);
>   	fault->timestamp = timestamp;
>   
>   	/* And update the hash */
> @@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   	return false;
>   }
>   
> +/**
> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults filter
> + *
> + * @adev: amdgpu device structure
> + * @addr: address of the VM fault
> + * @pasid: PASID of the process causing the fault
> + *
> + * Remove the address from fault filter, then future vm fault on this address
> + * will pass to retry fault handler to recover.
> + */
> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
> +				     uint16_t pasid)
> +{
> +	struct amdgpu_gmc *gmc = &adev->gmc;
> +	uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
> +	struct amdgpu_gmc_fault *fault;
> +	uint32_t hash;
> +	uint64_t tmp;
> +
> +	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
> +	fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
> +	do {
> +		if (atomic64_cmpxchg(&fault->key, key, 0) == key)
> +			break;
> +
> +		tmp = fault->timestamp;
> +		fault = &gmc->fault_ring[fault->next];
> +	} while (fault->timestamp < tmp);
> +}
> +
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   {
>   	int r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 9d11c02a3938..6aa1d52d3aee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -66,9 +66,9 @@ struct firmware;
>    * GMC page fault information
>    */
>   struct amdgpu_gmc_fault {
> -	uint64_t	timestamp;
> +	uint64_t	timestamp:48;
>   	uint64_t	next:AMDGPU_GMC_FAULT_RING_ORDER;
> -	uint64_t	key:52;
> +	atomic64_t	key;
>   };
>   
>   /*
> @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
>   			     struct amdgpu_gmc *mc);
>   bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
>   			      uint16_t pasid, uint64_t timestamp);
> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
> +				     uint16_t pasid);
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 4/5] drm/amdgpu: address remove from fault filter
  2021-04-27 18:21   ` Felix Kuehling
@ 2021-04-28  6:54     ` Christian König
  2021-04-28  7:53       ` Felix Kuehling
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2021-04-28  6:54 UTC (permalink / raw)
  To: Felix Kuehling, Philip Yang, amd-gfx

Am 27.04.21 um 20:21 schrieb Felix Kuehling:
> On 2021-04-27 10:51 a.m., Philip Yang wrote:
>> Add interface to remove address from fault filter ring by resetting
>> fault ring entry key, then future vm fault on the address will be
>> processed to recover.
>>
>> Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
>> to protect fault ring access by interrupt handler and interrupt deferred
>> work for vg20. Change fault->timestamp to 48-bit to share same uint64_t
>> with 8-bit fault->next, it is enough for 48bit IH timestamp.
>>
>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48 ++++++++++++++++++++++---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++--
>>   2 files changed, 48 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> index c39ed9eb0987..a2e81e913abb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct 
>> amdgpu_device *adev, struct amdgpu_gmc *mc)
>>               mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>>   }
>>   +/**
>> + * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
>> + *
>> + * @addr: 48bit physical address
>> + * @pasid: 4 bit
>
> This comment is misleading. The PASID is not 4-bit. It's 16-bit. But 
> shifting the address by 4 bit is sufficient because the address is 
> page-aligned, which means the low 12 bits are 0. So this would be more 
> accurate:
>
> @addr: 48 bit physical address, page aligned (36 significant bits)
> @pasid: 16 bit process address space identifier
>
> With that fixed, the patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>
>
>> + */
>> +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t 
>> pasid)
>> +{
>> +    return addr << 4 | pasid;
>> +}

Maybe changing this so that we have "addr * ((1 << 16) / 
AMDGPU_PAGE_SIZE) | pasid" would help to better document that?

Christian.

>> +
>>   /**
>>    * amdgpu_gmc_filter_faults - filter VM faults
>>    *
>> @@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct 
>> amdgpu_device *adev, uint64_t addr,
>>                     uint16_t pasid, uint64_t timestamp)
>>   {
>>       struct amdgpu_gmc *gmc = &adev->gmc;
>> -
>> -    uint64_t stamp, key = addr << 4 | pasid;
>> +    uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
>>       struct amdgpu_gmc_fault *fault;
>>       uint32_t hash;
>>   @@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct 
>> amdgpu_device *adev, uint64_t addr,
>>       while (fault->timestamp >= stamp) {
>>           uint64_t tmp;
>>   -        if (fault->key == key)
>> +        if (atomic64_read(&fault->key) == key)
>>               return true;
>>             tmp = fault->timestamp;
>> @@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct 
>> amdgpu_device *adev, uint64_t addr,
>>         /* Add the fault to the ring */
>>       fault = &gmc->fault_ring[gmc->last_fault];
>> -    fault->key = key;
>> +    atomic64_set(&fault->key, key);
>>       fault->timestamp = timestamp;
>>         /* And update the hash */
>> @@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct 
>> amdgpu_device *adev, uint64_t addr,
>>       return false;
>>   }
>>   +/**
>> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults 
>> filter
>> + *
>> + * @adev: amdgpu device structure
>> + * @addr: address of the VM fault
>> + * @pasid: PASID of the process causing the fault
>> + *
>> + * Remove the address from fault filter, then future vm fault on 
>> this address
>> + * will pass to retry fault handler to recover.
>> + */
>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, 
>> uint64_t addr,
>> +                     uint16_t pasid)
>> +{
>> +    struct amdgpu_gmc *gmc = &adev->gmc;
>> +    uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
>> +    struct amdgpu_gmc_fault *fault;
>> +    uint32_t hash;
>> +    uint64_t tmp;
>> +
>> +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>> +    fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
>> +    do {
>> +        if (atomic64_cmpxchg(&fault->key, key, 0) == key)
>> +            break;
>> +
>> +        tmp = fault->timestamp;
>> +        fault = &gmc->fault_ring[fault->next];
>> +    } while (fault->timestamp < tmp);
>> +}
>> +
>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>   {
>>       int r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index 9d11c02a3938..6aa1d52d3aee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -66,9 +66,9 @@ struct firmware;
>>    * GMC page fault information
>>    */
>>   struct amdgpu_gmc_fault {
>> -    uint64_t    timestamp;
>> +    uint64_t    timestamp:48;
>>       uint64_t    next:AMDGPU_GMC_FAULT_RING_ORDER;
>> -    uint64_t    key:52;
>> +    atomic64_t    key;
>>   };
>>     /*
>> @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device 
>> *adev,
>>                    struct amdgpu_gmc *mc);
>>   bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t 
>> addr,
>>                     uint16_t pasid, uint64_t timestamp);
>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, 
>> uint64_t addr,
>> +                     uint16_t pasid);
>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 4/5] drm/amdgpu: address remove from fault filter
  2021-04-28  6:54     ` Christian König
@ 2021-04-28  7:53       ` Felix Kuehling
  2021-04-28  9:00         ` Christian König
  0 siblings, 1 reply; 14+ messages in thread
From: Felix Kuehling @ 2021-04-28  7:53 UTC (permalink / raw)
  To: Christian König, Philip Yang, amd-gfx

Am 2021-04-28 um 2:54 a.m. schrieb Christian König:
> Am 27.04.21 um 20:21 schrieb Felix Kuehling:
>> On 2021-04-27 10:51 a.m., Philip Yang wrote:
>>> Add interface to remove address from fault filter ring by resetting
>>> fault ring entry key, then future vm fault on the address will be
>>> processed to recover.
>>>
>>> Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
>>> to protect fault ring access by interrupt handler and interrupt
>>> deferred
>>> work for vg20. Change fault->timestamp to 48-bit to share same uint64_t
>>> with 8-bit fault->next, it is enough for 48bit IH timestamp.
>>>
>>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48
>>> ++++++++++++++++++++++---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++--
>>>   2 files changed, 48 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>> index c39ed9eb0987..a2e81e913abb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
>>> amdgpu_device *adev, struct amdgpu_gmc *mc)
>>>               mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>>>   }
>>>   +/**
>>> + * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
>>> + *
>>> + * @addr: 48bit physical address
>>> + * @pasid: 4 bit
>>
>> This comment is misleading. The PASID is not 4-bit. It's 16-bit. But
>> shifting the address by 4 bit is sufficient because the address is
>> page-aligned, which means the low 12 bits are 0. So this would be
>> more accurate:
>>
>> @addr: 48 bit physical address, page aligned (36 significant bits)
>> @pasid: 16 bit process address space identifier
>>
>> With that fixed, the patch is
>>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>
>>
>>> + */
>>> +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t
>>> pasid)
>>> +{
>>> +    return addr << 4 | pasid;
>>> +}
>
> Maybe changing this so that we have "addr * ((1 << 16) /
> AMDGPU_PAGE_SIZE) | pasid" would help to better document that?

I find this a mix of multiplication, division and bit-shift more
confusing. How about "addr << (16 - AMDGPU_GPU_PAGE_SHIFT) | pasid"?

Regards,
  Felix


>
> Christian.
>
>>> +
>>>   /**
>>>    * amdgpu_gmc_filter_faults - filter VM faults
>>>    *
>>> @@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct
>>> amdgpu_device *adev, uint64_t addr,
>>>                     uint16_t pasid, uint64_t timestamp)
>>>   {
>>>       struct amdgpu_gmc *gmc = &adev->gmc;
>>> -
>>> -    uint64_t stamp, key = addr << 4 | pasid;
>>> +    uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
>>>       struct amdgpu_gmc_fault *fault;
>>>       uint32_t hash;
>>>   @@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct
>>> amdgpu_device *adev, uint64_t addr,
>>>       while (fault->timestamp >= stamp) {
>>>           uint64_t tmp;
>>>   -        if (fault->key == key)
>>> +        if (atomic64_read(&fault->key) == key)
>>>               return true;
>>>             tmp = fault->timestamp;
>>> @@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct
>>> amdgpu_device *adev, uint64_t addr,
>>>         /* Add the fault to the ring */
>>>       fault = &gmc->fault_ring[gmc->last_fault];
>>> -    fault->key = key;
>>> +    atomic64_set(&fault->key, key);
>>>       fault->timestamp = timestamp;
>>>         /* And update the hash */
>>> @@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct
>>> amdgpu_device *adev, uint64_t addr,
>>>       return false;
>>>   }
>>>   +/**
>>> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults
>>> filter
>>> + *
>>> + * @adev: amdgpu device structure
>>> + * @addr: address of the VM fault
>>> + * @pasid: PASID of the process causing the fault
>>> + *
>>> + * Remove the address from fault filter, then future vm fault on
>>> this address
>>> + * will pass to retry fault handler to recover.
>>> + */
>>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>>> uint64_t addr,
>>> +                     uint16_t pasid)
>>> +{
>>> +    struct amdgpu_gmc *gmc = &adev->gmc;
>>> +    uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
>>> +    struct amdgpu_gmc_fault *fault;
>>> +    uint32_t hash;
>>> +    uint64_t tmp;
>>> +
>>> +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>>> +    fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
>>> +    do {
>>> +        if (atomic64_cmpxchg(&fault->key, key, 0) == key)
>>> +            break;
>>> +
>>> +        tmp = fault->timestamp;
>>> +        fault = &gmc->fault_ring[fault->next];
>>> +    } while (fault->timestamp < tmp);
>>> +}
>>> +
>>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>>   {
>>>       int r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> index 9d11c02a3938..6aa1d52d3aee 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> @@ -66,9 +66,9 @@ struct firmware;
>>>    * GMC page fault information
>>>    */
>>>   struct amdgpu_gmc_fault {
>>> -    uint64_t    timestamp;
>>> +    uint64_t    timestamp:48;
>>>       uint64_t    next:AMDGPU_GMC_FAULT_RING_ORDER;
>>> -    uint64_t    key:52;
>>> +    atomic64_t    key;
>>>   };
>>>     /*
>>> @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct
>>> amdgpu_device *adev,
>>>                    struct amdgpu_gmc *mc);
>>>   bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t
>>> addr,
>>>                     uint16_t pasid, uint64_t timestamp);
>>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>>> uint64_t addr,
>>> +                     uint16_t pasid);
>>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 4/5] drm/amdgpu: address remove from fault filter
  2021-04-28  7:53       ` Felix Kuehling
@ 2021-04-28  9:00         ` Christian König
  2021-04-28 14:57           ` philip yang
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2021-04-28  9:00 UTC (permalink / raw)
  To: Felix Kuehling, Philip Yang, amd-gfx



Am 28.04.21 um 09:53 schrieb Felix Kuehling:
> Am 2021-04-28 um 2:54 a.m. schrieb Christian König:
>> Am 27.04.21 um 20:21 schrieb Felix Kuehling:
>>> On 2021-04-27 10:51 a.m., Philip Yang wrote:
>>>> Add interface to remove address from fault filter ring by resetting
>>>> fault ring entry key, then future vm fault on the address will be
>>>> processed to recover.
>>>>
>>>> Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
>>>> to protect fault ring access by interrupt handler and interrupt
>>>> deferred
>>>> work for vg20. Change fault->timestamp to 48-bit to share same uint64_t
>>>> with 8-bit fault->next, it is enough for 48bit IH timestamp.
>>>>
>>>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48
>>>> ++++++++++++++++++++++---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++--
>>>>    2 files changed, 48 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>> index c39ed9eb0987..a2e81e913abb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
>>>> amdgpu_device *adev, struct amdgpu_gmc *mc)
>>>>                mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>>>>    }
>>>>    +/**
>>>> + * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
>>>> + *
>>>> + * @addr: 48bit physical address
>>>> + * @pasid: 4 bit
>>> This comment is misleading. The PASID is not 4-bit. It's 16-bit. But
>>> shifting the address by 4 bit is sufficient because the address is
>>> page-aligned, which means the low 12 bits are 0. So this would be
>>> more accurate:
>>>
>>> @addr: 48 bit physical address, page aligned (36 significant bits)
>>> @pasid: 16 bit process address space identifier
>>>
>>> With that fixed, the patch is
>>>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>
>>>
>>>> + */
>>>> +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t
>>>> pasid)
>>>> +{
>>>> +    return addr << 4 | pasid;
>>>> +}
>> Maybe changing this so that we have "addr * ((1 << 16) /
>> AMDGPU_PAGE_SIZE) | pasid" would help to better document that?
> I find this a mix of multiplication, division and bit-shift more
> confusing. How about "addr << (16 - AMDGPU_GPU_PAGE_SHIFT) | pasid"?

Yeah, that is even better.

Regards,
Christian.

>
> Regards,
>    Felix
>
>
>> Christian.
>>
>>>> +
>>>>    /**
>>>>     * amdgpu_gmc_filter_faults - filter VM faults
>>>>     *
>>>> @@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct
>>>> amdgpu_device *adev, uint64_t addr,
>>>>                      uint16_t pasid, uint64_t timestamp)
>>>>    {
>>>>        struct amdgpu_gmc *gmc = &adev->gmc;
>>>> -
>>>> -    uint64_t stamp, key = addr << 4 | pasid;
>>>> +    uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
>>>>        struct amdgpu_gmc_fault *fault;
>>>>        uint32_t hash;
>>>>    @@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct
>>>> amdgpu_device *adev, uint64_t addr,
>>>>        while (fault->timestamp >= stamp) {
>>>>            uint64_t tmp;
>>>>    -        if (fault->key == key)
>>>> +        if (atomic64_read(&fault->key) == key)
>>>>                return true;
>>>>              tmp = fault->timestamp;
>>>> @@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct
>>>> amdgpu_device *adev, uint64_t addr,
>>>>          /* Add the fault to the ring */
>>>>        fault = &gmc->fault_ring[gmc->last_fault];
>>>> -    fault->key = key;
>>>> +    atomic64_set(&fault->key, key);
>>>>        fault->timestamp = timestamp;
>>>>          /* And update the hash */
>>>> @@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct
>>>> amdgpu_device *adev, uint64_t addr,
>>>>        return false;
>>>>    }
>>>>    +/**
>>>> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults
>>>> filter
>>>> + *
>>>> + * @adev: amdgpu device structure
>>>> + * @addr: address of the VM fault
>>>> + * @pasid: PASID of the process causing the fault
>>>> + *
>>>> + * Remove the address from fault filter, then future vm fault on
>>>> this address
>>>> + * will pass to retry fault handler to recover.
>>>> + */
>>>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>>>> uint64_t addr,
>>>> +                     uint16_t pasid)
>>>> +{
>>>> +    struct amdgpu_gmc *gmc = &adev->gmc;
>>>> +    uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
>>>> +    struct amdgpu_gmc_fault *fault;
>>>> +    uint32_t hash;
>>>> +    uint64_t tmp;
>>>> +
>>>> +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>>>> +    fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
>>>> +    do {
>>>> +        if (atomic64_cmpxchg(&fault->key, key, 0) == key)
>>>> +            break;
>>>> +
>>>> +        tmp = fault->timestamp;
>>>> +        fault = &gmc->fault_ring[fault->next];
>>>> +    } while (fault->timestamp < tmp);
>>>> +}
>>>> +
>>>>    int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>>>    {
>>>>        int r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> index 9d11c02a3938..6aa1d52d3aee 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> @@ -66,9 +66,9 @@ struct firmware;
>>>>     * GMC page fault information
>>>>     */
>>>>    struct amdgpu_gmc_fault {
>>>> -    uint64_t    timestamp;
>>>> +    uint64_t    timestamp:48;
>>>>        uint64_t    next:AMDGPU_GMC_FAULT_RING_ORDER;
>>>> -    uint64_t    key:52;
>>>> +    atomic64_t    key;
>>>>    };
>>>>      /*
>>>> @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct
>>>> amdgpu_device *adev,
>>>>                     struct amdgpu_gmc *mc);
>>>>    bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t
>>>> addr,
>>>>                      uint16_t pasid, uint64_t timestamp);
>>>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>>>> uint64_t addr,
>>>> +                     uint16_t pasid);
>>>>    int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>>>    void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>>>    int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 4/5] drm/amdgpu: address remove from fault filter
  2021-04-28  9:00         ` Christian König
@ 2021-04-28 14:57           ` philip yang
  0 siblings, 0 replies; 14+ messages in thread
From: philip yang @ 2021-04-28 14:57 UTC (permalink / raw)
  To: Christian König, Felix Kuehling, Philip Yang, amd-gfx

[-- Attachment #1: Type: text/html, Size: 15597 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-04-28 14:57 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-23 15:35 [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Philip Yang
2021-04-23 15:35 ` [PATCH 5/5] drm/amdkfd: enable subsequent retry fault Philip Yang
2021-04-23 16:28 ` [PATCH v3 4/5] drm/amdgpu: address remove from fault filter Christian König
2021-04-24  4:50   ` Felix Kuehling
2021-04-26 21:26 ` [PATCH v4 " Philip Yang
2021-04-26 21:26   ` [PATCH 5/5] drm/amdkfd: enable subsequent retry fault Philip Yang
2021-04-27  7:25   ` [PATCH v4 4/5] drm/amdgpu: address remove from fault filter Christian König
2021-04-27 14:12     ` philip yang
2021-04-27 14:51 ` [PATCH v5 " Philip Yang
2021-04-27 18:21   ` Felix Kuehling
2021-04-28  6:54     ` Christian König
2021-04-28  7:53       ` Felix Kuehling
2021-04-28  9:00         ` Christian König
2021-04-28 14:57           ` philip yang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.