* [PATCH] drm/amdgpu: block ring buffer access during GPU recovery
@ 2020-09-01  1:17 Dennis Li
  2020-09-01  1:40 ` Andrey Grodzovsky
  2020-09-01  7:15 ` Christian König
  0 siblings, 2 replies; 6+ messages in thread
From: Dennis Li @ 2020-09-01  1:17 UTC (permalink / raw)
  To: amd-gfx, Alexander.Deucher, felix.kuehling, Hawking.Zhang,
	christian.koenig
  Cc: Dennis Li

When the GPU is in reset, its state is not stable and the ring buffers
also need to be reset when resuming. The driver should therefore protect
the GPU recovery thread from ring buffer accesses by other threads;
otherwise the GPU will randomly hang during recovery.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 172dc47b7f39..8db56a22cd1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 {
 	uint32_t ret;
 
-	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-		return amdgpu_kiq_rreg(adev, reg);
+	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
+		amdgpu_sriov_runtime(adev) &&
+		down_read_trylock(&adev->reset_sem)) {
+		ret = amdgpu_kiq_rreg(adev, reg);
+		up_read(&adev->reset_sem);
+		return ret;
+	}
 
 	if ((reg * 4) < adev->rmmio_size)
 		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
@@ -332,6 +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
 		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 	}
+
 	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
 	return ret;
 }
@@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 		    uint32_t acc_flags)
 {
-	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-		return amdgpu_kiq_wreg(adev, reg, v);
+	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
+		amdgpu_sriov_runtime(adev) &&
+		down_read_trylock(&adev->reset_sem)) {
+		amdgpu_kiq_wreg(adev, reg, v);
+		up_read(&adev->reset_sem);
+		return;
+	}
 
 	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index ad9ad622ccce..4ea2a065daa9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 	 */
 	if (adev->gfx.kiq.ring.sched.ready &&
 	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-	    !amdgpu_in_reset(adev)) {
+	    down_read_trylock(&adev->reset_sem)) {
 
 		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
 		const unsigned eng = 17;
@@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 
 		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
 				1 << vmid);
+
+		up_read(&adev->reset_sem);
 		return;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index e1a0ae327cf5..33b7cf1c79ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 	 */
 	if (adev->gfx.kiq.ring.sched.ready &&
 			(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-			!amdgpu_in_reset(adev)) {
+			down_read_trylock(&adev->reset_sem)) {
 		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
 		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
 
 		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
 						   1 << vmid);
+		up_read(&adev->reset_sem);
 		return;
 	}
 
@@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	if (amdgpu_in_reset(adev))
 		return -EIO;
 
-	if (ring->sched.ready) {
+	if (ring->sched.ready &&
+		 down_read_trylock(&adev->reset_sem)) {
 		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
 		 * heavy-weight TLB flush (type 2), which flushes
 		 * both. Due to a race condition with concurrent
@@ -626,6 +628,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 		if (r) {
 			amdgpu_ring_undo(ring);
 			spin_unlock(&adev->gfx.kiq.ring_lock);
+			up_read(&adev->reset_sem);
 			return -ETIME;
 		}
 
@@ -634,9 +637,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
+			up_read(&adev->reset_sem);
 			return -ETIME;
 		}
-
+		up_read(&adev->reset_sem);
 		return 0;
 	}
 
-- 
2.17.1

* Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery
  2020-09-01  1:17 [PATCH] drm/amdgpu: block ring buffer access during GPU recovery Dennis Li
@ 2020-09-01  1:40 ` Andrey Grodzovsky
  2020-09-01  2:45   ` Li, Dennis
  2020-09-01  7:15 ` Christian König
  1 sibling, 1 reply; 6+ messages in thread
From: Andrey Grodzovsky @ 2020-09-01  1:40 UTC (permalink / raw)
  To: Dennis Li, amd-gfx, Alexander.Deucher, felix.kuehling,
	Hawking.Zhang, christian.koenig


On 8/31/20 9:17 PM, Dennis Li wrote:
> When the GPU is in reset, its state is not stable and the ring buffers
> also need to be reset when resuming. The driver should therefore protect
> the GPU recovery thread from ring buffer accesses by other threads;
> otherwise the GPU will randomly hang during recovery.
>
> Signed-off-by: Dennis Li <Dennis.Li@amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 172dc47b7f39..8db56a22cd1b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   {
>   	uint32_t ret;
>   
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_rreg(adev, reg);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +		amdgpu_sriov_runtime(adev) &&
> +		down_read_trylock(&adev->reset_sem)) {
> +		ret = amdgpu_kiq_rreg(adev, reg);
> +		up_read(&adev->reset_sem);
> +		return ret;
> +	}


Isn't adev->reset_sem non-recursive? How does this work when you try to
access registers from within the GPU reset thread while adev->reset_sem
is already write-locked from amdgpu_device_lock_adev earlier in the same
thread?

Andrey


>   
>   	if ((reg * 4) < adev->rmmio_size)
>   		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
> @@ -332,6 +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>   		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>   	}
> +
>   	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>   	return ret;
>   }
> @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>   void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>   		    uint32_t acc_flags)
>   {
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_wreg(adev, reg, v);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +		amdgpu_sriov_runtime(adev) &&
> +		down_read_trylock(&adev->reset_sem)) {
> +		amdgpu_kiq_wreg(adev, reg, v);
> +		up_read(&adev->reset_sem);
> +		return;
> +	}
>   
>   	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index ad9ad622ccce..4ea2a065daa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	 */
>   	if (adev->gfx.kiq.ring.sched.ready &&
>   	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -	    !amdgpu_in_reset(adev)) {
> +	    down_read_trylock(&adev->reset_sem)) {
>   
>   		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>   		const unsigned eng = 17;
> @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   				1 << vmid);
> +
> +		up_read(&adev->reset_sem);
>   		return;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e1a0ae327cf5..33b7cf1c79ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	 */
>   	if (adev->gfx.kiq.ring.sched.ready &&
>   			(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -			!amdgpu_in_reset(adev)) {
> +			down_read_trylock(&adev->reset_sem)) {
>   		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   						   1 << vmid);
> +		up_read(&adev->reset_sem);
>   		return;
>   	}
>   
> @@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	if (amdgpu_in_reset(adev))
>   		return -EIO;
>   
> -	if (ring->sched.ready) {
> +	if (ring->sched.ready &&
> +		 down_read_trylock(&adev->reset_sem)) {
>   		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
>   		 * heavy-weight TLB flush (type 2), which flushes
>   		 * both. Due to a race condition with concurrent
> @@ -626,6 +628,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		if (r) {
>   			amdgpu_ring_undo(ring);
>   			spin_unlock(&adev->gfx.kiq.ring_lock);
> +			up_read(&adev->reset_sem);
>   			return -ETIME;
>   		}
>   
> @@ -634,9 +637,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> +			up_read(&adev->reset_sem);
>   			return -ETIME;
>   		}
> -
> +		up_read(&adev->reset_sem);
>   		return 0;
>   	}
>   

* RE: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery
  2020-09-01  1:40 ` Andrey Grodzovsky
@ 2020-09-01  2:45   ` Li, Dennis
  2020-09-01  7:11     ` Christian König
  2020-09-01 14:22     ` Andrey Grodzovsky
  0 siblings, 2 replies; 6+ messages in thread
From: Li, Dennis @ 2020-09-01  2:45 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx, Deucher, Alexander, Kuehling, Felix,
	Zhang, Hawking, Koenig, Christian

Hi, Andrey,
    
RE: Isn't adev->reset_sem non-recursive? How does this work when you try to access registers from within the GPU reset thread while adev->reset_sem is already write-locked from amdgpu_device_lock_adev earlier in the same thread?

Deli: down_read_trylock will fail in this case; it returns false immediately without taking adev->reset_sem. In the GPU reset thread, we should use MMIO instead of KIQ to access registers.
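
To illustrate, a condensed sketch of the patched amdgpu_mm_rreg (the MMIO
fallback is simplified here and skips the indirect path used for registers
beyond rmmio_size):

uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	/* The recovery thread holds reset_sem for write, so during a reset
	 * this trylock fails and we fall through to direct MMIO below.
	 */
	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
	    amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {
		ret = amdgpu_kiq_rreg(adev, reg);
		up_read(&adev->reset_sem);
		return ret;
	}

	/* Direct MMIO fallback, safe to use from the reset thread itself. */
	ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}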

Best Regards
Dennis Li
-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Tuesday, September 1, 2020 9:40 AM
To: Li, Dennis <Dennis.Li@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery


On 8/31/20 9:17 PM, Dennis Li wrote:
> When the GPU is in reset, its state is not stable and the ring buffers
> also need to be reset when resuming. The driver should therefore protect
> the GPU recovery thread from ring buffer accesses by other threads;
> otherwise the GPU will randomly hang during recovery.
>
> Signed-off-by: Dennis Li <Dennis.Li@amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 172dc47b7f39..8db56a22cd1b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   {
>   	uint32_t ret;
>   
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_rreg(adev, reg);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +		amdgpu_sriov_runtime(adev) &&
> +		down_read_trylock(&adev->reset_sem)) {
> +		ret = amdgpu_kiq_rreg(adev, reg);
> +		up_read(&adev->reset_sem);
> +		return ret;
> +	}


Isn't adev->reset_sem non-recursive? How does this work when you try to access registers from within the GPU reset thread while adev->reset_sem is already write-locked from amdgpu_device_lock_adev earlier in the same thread?

Andrey


>   
>   	if ((reg * 4) < adev->rmmio_size)
>   		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ -332,6 
> +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>   		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>   	}
> +
>   	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>   	return ret;
>   }
> @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>   void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>   		    uint32_t acc_flags)
>   {
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_wreg(adev, reg, v);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +		amdgpu_sriov_runtime(adev) &&
> +		down_read_trylock(&adev->reset_sem)) {
> +		amdgpu_kiq_wreg(adev, reg, v);
> +		up_read(&adev->reset_sem);
> +		return;
> +	}
>   
>   	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index ad9ad622ccce..4ea2a065daa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	 */
>   	if (adev->gfx.kiq.ring.sched.ready &&
>   	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -	    !amdgpu_in_reset(adev)) {
> +	    down_read_trylock(&adev->reset_sem)) {
>   
>   		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>   		const unsigned eng = 17;
> @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct 
> amdgpu_device *adev, uint32_t vmid,
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   				1 << vmid);
> +
> +		up_read(&adev->reset_sem);
>   		return;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e1a0ae327cf5..33b7cf1c79ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	 */
>   	if (adev->gfx.kiq.ring.sched.ready &&
>   			(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -			!amdgpu_in_reset(adev)) {
> +			down_read_trylock(&adev->reset_sem)) {
>   		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   						   1 << vmid);
> +		up_read(&adev->reset_sem);
>   		return;
>   	}
>   
> @@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	if (amdgpu_in_reset(adev))
>   		return -EIO;
>   
> -	if (ring->sched.ready) {
> +	if (ring->sched.ready &&
> +		 down_read_trylock(&adev->reset_sem)) {
>   		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
>   		 * heavy-weight TLB flush (type 2), which flushes
>   		 * both. Due to a race condition with concurrent @@ -626,6 +628,7 
> @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		if (r) {
>   			amdgpu_ring_undo(ring);
>   			spin_unlock(&adev->gfx.kiq.ring_lock);
> +			up_read(&adev->reset_sem);
>   			return -ETIME;
>   		}
>   
> @@ -634,9 +637,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> +			up_read(&adev->reset_sem);
>   			return -ETIME;
>   		}
> -
> +		up_read(&adev->reset_sem);
>   		return 0;
>   	}
>   

* Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery
  2020-09-01  2:45   ` Li, Dennis
@ 2020-09-01  7:11     ` Christian König
  2020-09-01 14:22     ` Andrey Grodzovsky
  1 sibling, 0 replies; 6+ messages in thread
From: Christian König @ 2020-09-01  7:11 UTC (permalink / raw)
  To: Li, Dennis, Grodzovsky, Andrey, amd-gfx, Deucher, Alexander,
	Kuehling, Felix, Zhang, Hawking, Koenig, Christian

Yeah, correct.

What we should maybe do is add a WARN_ON() that checks whether the current
thread is the one holding the semaphore, to catch this case.
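
A rough sketch of that idea (purely illustrative: the reset_owner field
does not exist in amdgpu and would have to be added; current is the
running task):

	/* amdgpu_device_lock_adev(): */
	down_write(&adev->reset_sem);
	adev->reset_owner = current;	/* hypothetical: remember the reset thread */

	/* amdgpu_device_unlock_adev(): */
	adev->reset_owner = NULL;
	up_write(&adev->reset_sem);

	/* In amdgpu_mm_rreg()/amdgpu_mm_wreg(), before attempting the KIQ
	 * path: the reset thread is expected to pass AMDGPU_REGS_NO_KIQ,
	 * so getting here while owning the write lock indicates a bug.
	 */
	WARN_ON(adev->reset_owner == current);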

Regards,
Christian.

On 01.09.20 at 04:45, Li, Dennis wrote:
> Hi, Andrey,
>      
> RE: Isn't adev->reset_sem non-recursive? How does this work when you try to access registers from within the GPU reset thread while adev->reset_sem is already write-locked from amdgpu_device_lock_adev earlier in the same thread?
>
> Deli: down_read_trylock will fail in this case; it returns false immediately without taking adev->reset_sem. In the GPU reset thread, we should use MMIO instead of KIQ to access registers.
>
> Best Regards
> Dennis Li
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Tuesday, September 1, 2020 9:40 AM
> To: Li, Dennis <Dennis.Li@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery
>
>
> On 8/31/20 9:17 PM, Dennis Li wrote:
>> When the GPU is in reset, its state is not stable and the ring buffers
>> also need to be reset when resuming. The driver should therefore protect
>> the GPU recovery thread from ring buffer accesses by other threads;
>> otherwise the GPU will randomly hang during recovery.
>>
>> Signed-off-by: Dennis Li <Dennis.Li@amd.com>
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 172dc47b7f39..8db56a22cd1b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>>    {
>>    	uint32_t ret;
>>    
>> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>> -		return amdgpu_kiq_rreg(adev, reg);
>> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
>> +		amdgpu_sriov_runtime(adev) &&
>> +		down_read_trylock(&adev->reset_sem)) {
>> +		ret = amdgpu_kiq_rreg(adev, reg);
>> +		up_read(&adev->reset_sem);
>> +		return ret;
>> +	}
>
> Isn't adev->reset_sem non-recursive? How does this work when you try to access registers from within the GPU reset thread while adev->reset_sem is already write-locked from amdgpu_device_lock_adev earlier in the same thread?
>
> Andrey
>
>
>>    
>>    	if ((reg * 4) < adev->rmmio_size)
>>    		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ -332,6
>> +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>>    		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>>    		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>>    	}
>> +
>>    	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>>    	return ret;
>>    }
>> @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>>    void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>>    		    uint32_t acc_flags)
>>    {
>> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>> -		return amdgpu_kiq_wreg(adev, reg, v);
>> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
>> +		amdgpu_sriov_runtime(adev) &&
>> +		down_read_trylock(&adev->reset_sem)) {
>> +		amdgpu_kiq_wreg(adev, reg, v);
>> +		up_read(&adev->reset_sem);
>> +		return;
>> +	}
>>    
>>    	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index ad9ad622ccce..4ea2a065daa9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    	 */
>>    	if (adev->gfx.kiq.ring.sched.ready &&
>>    	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
>> -	    !amdgpu_in_reset(adev)) {
>> +	    down_read_trylock(&adev->reset_sem)) {
>>    
>>    		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>>    		const unsigned eng = 17;
>> @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct
>> amdgpu_device *adev, uint32_t vmid,
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>>    				1 << vmid);
>> +
>> +		up_read(&adev->reset_sem);
>>    		return;
>>    	}
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index e1a0ae327cf5..33b7cf1c79ec 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    	 */
>>    	if (adev->gfx.kiq.ring.sched.ready &&
>>    			(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
>> -			!amdgpu_in_reset(adev)) {
>> +			down_read_trylock(&adev->reset_sem)) {
>>    		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>>    						   1 << vmid);
>> +		up_read(&adev->reset_sem);
>>    		return;
>>    	}
>>    
>> @@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    	if (amdgpu_in_reset(adev))
>>    		return -EIO;
>>    
>> -	if (ring->sched.ready) {
>> +	if (ring->sched.ready &&
>> +		 down_read_trylock(&adev->reset_sem)) {
>>    		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
>>    		 * heavy-weight TLB flush (type 2), which flushes
>>    		 * both. Due to a race condition with concurrent @@ -626,6 +628,7
>> @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    		if (r) {
>>    			amdgpu_ring_undo(ring);
>>    			spin_unlock(&adev->gfx.kiq.ring_lock);
>> +			up_read(&adev->reset_sem);
>>    			return -ETIME;
>>    		}
>>    
>> @@ -634,9 +637,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>> +			up_read(&adev->reset_sem);
>>    			return -ETIME;
>>    		}
>> -
>> +		up_read(&adev->reset_sem);
>>    		return 0;
>>    	}
>>    

* Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery
  2020-09-01  1:17 [PATCH] drm/amdgpu: block ring buffer access during GPU recovery Dennis Li
  2020-09-01  1:40 ` Andrey Grodzovsky
@ 2020-09-01  7:15 ` Christian König
  1 sibling, 0 replies; 6+ messages in thread
From: Christian König @ 2020-09-01  7:15 UTC (permalink / raw)
  To: Dennis Li, amd-gfx, Alexander.Deucher, felix.kuehling,
	Hawking.Zhang, christian.koenig

On 01.09.20 at 03:17, Dennis Li wrote:
> When the GPU is in reset, its state is not stable and the ring buffers
> also need to be reset when resuming. The driver should therefore protect
> the GPU recovery thread from ring buffer accesses by other threads;
> otherwise the GPU will randomly hang during recovery.

One style comment inline; apart from that, this looks good to me.

>
> Signed-off-by: Dennis Li <Dennis.Li@amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 172dc47b7f39..8db56a22cd1b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   {
>   	uint32_t ret;
>   
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_rreg(adev, reg);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +		amdgpu_sriov_runtime(adev) &&
> +		down_read_trylock(&adev->reset_sem)) {

This is not correctly indented; the continuation lines of an if clause
should be aligned under the ( of the first line. The same applies to the
other cases as well.
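
I.e. something like this, with the continuation lines padded with spaces
so the conditions line up under the opening parenthesis:

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
	    amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {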

Regards,
Christian.

> +		ret = amdgpu_kiq_rreg(adev, reg);
> +		up_read(&adev->reset_sem);
> +		return ret;
> +	}
>   
>   	if ((reg * 4) < adev->rmmio_size)
>   		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
> @@ -332,6 +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>   		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>   	}
> +
>   	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>   	return ret;
>   }
> @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>   void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>   		    uint32_t acc_flags)
>   {
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_wreg(adev, reg, v);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +		amdgpu_sriov_runtime(adev) &&
> +		down_read_trylock(&adev->reset_sem)) {
> +		amdgpu_kiq_wreg(adev, reg, v);
> +		up_read(&adev->reset_sem);
> +		return;
> +	}
>   
>   	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index ad9ad622ccce..4ea2a065daa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	 */
>   	if (adev->gfx.kiq.ring.sched.ready &&
>   	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -	    !amdgpu_in_reset(adev)) {
> +	    down_read_trylock(&adev->reset_sem)) {
>   
>   		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>   		const unsigned eng = 17;
> @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   				1 << vmid);
> +
> +		up_read(&adev->reset_sem);
>   		return;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e1a0ae327cf5..33b7cf1c79ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	 */
>   	if (adev->gfx.kiq.ring.sched.ready &&
>   			(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -			!amdgpu_in_reset(adev)) {
> +			down_read_trylock(&adev->reset_sem)) {
>   		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   						   1 << vmid);
> +		up_read(&adev->reset_sem);
>   		return;
>   	}
>   
> @@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	if (amdgpu_in_reset(adev))
>   		return -EIO;
>   
> -	if (ring->sched.ready) {
> +	if (ring->sched.ready &&
> +		 down_read_trylock(&adev->reset_sem)) {
>   		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
>   		 * heavy-weight TLB flush (type 2), which flushes
>   		 * both. Due to a race condition with concurrent
> @@ -626,6 +628,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		if (r) {
>   			amdgpu_ring_undo(ring);
>   			spin_unlock(&adev->gfx.kiq.ring_lock);
> +			up_read(&adev->reset_sem);
>   			return -ETIME;
>   		}
>   
> @@ -634,9 +637,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> +			up_read(&adev->reset_sem);
>   			return -ETIME;
>   		}
> -
> +		up_read(&adev->reset_sem);
>   		return 0;
>   	}
>   

* Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery
  2020-09-01  2:45   ` Li, Dennis
  2020-09-01  7:11     ` Christian König
@ 2020-09-01 14:22     ` Andrey Grodzovsky
  1 sibling, 0 replies; 6+ messages in thread
From: Andrey Grodzovsky @ 2020-09-01 14:22 UTC (permalink / raw)
  To: Li, Dennis, amd-gfx, Deucher, Alexander, Kuehling, Felix, Zhang,
	Hawking, Koenig, Christian

Now I get it, I missed the 'else' part: when the trylock fails, the code
falls through to the direct MMIO path.

Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Andrey

On 8/31/20 10:45 PM, Li, Dennis wrote:
> Hi, Andrey,
>      
> RE: Isn't adev->reset_sem non-recursive? How does this work when you try to access registers from within the GPU reset thread while adev->reset_sem is already write-locked from amdgpu_device_lock_adev earlier in the same thread?
>
> Deli: down_read_trylock will fail in this case; it returns false immediately without taking adev->reset_sem. In the GPU reset thread, we should use MMIO instead of KIQ to access registers.
>
> Best Regards
> Dennis Li
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Tuesday, September 1, 2020 9:40 AM
> To: Li, Dennis <Dennis.Li@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery
>
>
> On 8/31/20 9:17 PM, Dennis Li wrote:
>> When the GPU is in reset, its state is not stable and the ring buffers
>> also need to be reset when resuming. The driver should therefore protect
>> the GPU recovery thread from ring buffer accesses by other threads;
>> otherwise the GPU will randomly hang during recovery.
>>
>> Signed-off-by: Dennis Li <Dennis.Li@amd.com>
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 172dc47b7f39..8db56a22cd1b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>>    {
>>    	uint32_t ret;
>>    
>> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>> -		return amdgpu_kiq_rreg(adev, reg);
>> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
>> +		amdgpu_sriov_runtime(adev) &&
>> +		down_read_trylock(&adev->reset_sem)) {
>> +		ret = amdgpu_kiq_rreg(adev, reg);
>> +		up_read(&adev->reset_sem);
>> +		return ret;
>> +	}
>
> Isn't adev->reset_sem non-recursive? How does this work when you try to access registers from within the GPU reset thread while adev->reset_sem is already write-locked from amdgpu_device_lock_adev earlier in the same thread?
>
> Andrey
>
>
>>    
>>    	if ((reg * 4) < adev->rmmio_size)
>>    		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ -332,6
>> +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>>    		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>>    		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>>    	}
>> +
>>    	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>>    	return ret;
>>    }
>> @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>>    void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>>    		    uint32_t acc_flags)
>>    {
>> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>> -		return amdgpu_kiq_wreg(adev, reg, v);
>> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
>> +		amdgpu_sriov_runtime(adev) &&
>> +		down_read_trylock(&adev->reset_sem)) {
>> +		amdgpu_kiq_wreg(adev, reg, v);
>> +		up_read(&adev->reset_sem);
>> +		return;
>> +	}
>>    
>>    	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index ad9ad622ccce..4ea2a065daa9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    	 */
>>    	if (adev->gfx.kiq.ring.sched.ready &&
>>    	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
>> -	    !amdgpu_in_reset(adev)) {
>> +	    down_read_trylock(&adev->reset_sem)) {
>>    
>>    		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>>    		const unsigned eng = 17;
>> @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct
>> amdgpu_device *adev, uint32_t vmid,
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>>    				1 << vmid);
>> +
>> +		up_read(&adev->reset_sem);
>>    		return;
>>    	}
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index e1a0ae327cf5..33b7cf1c79ec 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    	 */
>>    	if (adev->gfx.kiq.ring.sched.ready &&
>>    			(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
>> -			!amdgpu_in_reset(adev)) {
>> +			down_read_trylock(&adev->reset_sem)) {
>>    		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>>    						   1 << vmid);
>> +		up_read(&adev->reset_sem);
>>    		return;
>>    	}
>>    
>> @@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    	if (amdgpu_in_reset(adev))
>>    		return -EIO;
>>    
>> -	if (ring->sched.ready) {
>> +	if (ring->sched.ready &&
>> +		 down_read_trylock(&adev->reset_sem)) {
>>    		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
>>    		 * heavy-weight TLB flush (type 2), which flushes
>>    		 * both. Due to a race condition with concurrent @@ -626,6 +628,7
>> @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    		if (r) {
>>    			amdgpu_ring_undo(ring);
>>    			spin_unlock(&adev->gfx.kiq.ring_lock);
>> +			up_read(&adev->reset_sem);
>>    			return -ETIME;
>>    		}
>>    
>> @@ -634,9 +637,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>> +			up_read(&adev->reset_sem);
>>    			return -ETIME;
>>    		}
>> -
>> +		up_read(&adev->reset_sem);
>>    		return 0;
>>    	}
>>    

end of thread

Thread overview: 6+ messages
2020-09-01  1:17 [PATCH] drm/amdgpu: block ring buffer access during GPU recovery Dennis Li
2020-09-01  1:40 ` Andrey Grodzovsky
2020-09-01  2:45   ` Li, Dennis
2020-09-01  7:11     ` Christian König
2020-09-01 14:22     ` Andrey Grodzovsky
2020-09-01  7:15 ` Christian König
