* [PATCH] drm/amdgpu: block hardware access by other threads when doing gpu recovery
@ 2021-03-01 11:12 Dennis Li
  2021-03-01 11:43 ` Chen, Jiansong (Simon)
  2021-03-01 11:52 ` Christian König
  0 siblings, 2 replies; 3+ messages in thread
From: Dennis Li @ 2021-03-01 11:12 UTC (permalink / raw)
  To: amd-gfx, Alexander.Deucher, felix.kuehling, Hawking.Zhang,
	christian.koenig
  Cc: Dennis Li

When the GPU recovery thread is performing a GPU reset, it is unsafe
for other threads to access the hardware concurrently, which can cause
the GPU reset to hang randomly.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1624c2bc8285..c71d3bba5f69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1033,6 +1033,7 @@ struct amdgpu_device {
 	atomic_t 			in_gpu_reset;
 	enum pp_mp1_state               mp1_state;
 	struct rw_semaphore reset_sem;
+	struct thread_info *recovery_thread;
 	struct amdgpu_doorbell_index doorbell_index;
 
 	struct mutex			notifier_lock;
@@ -1385,4 +1386,13 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev)
 {
 	return atomic_read(&adev->in_gpu_reset);
 }
+
+static inline bool amdgpu_in_recovery_thread(struct amdgpu_device *adev)
+{
+	if (unlikely(adev->recovery_thread != NULL) &&
+		adev->recovery_thread == current_thread_info())
+		return true;
+
+	return false;
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 71805dfd9e25..7c17a5468d43 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -401,13 +401,22 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if (offset < adev->rmmio_size)
 		writeb(value, adev->rmmio + offset);
 	else
 		BUG();
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -424,15 +433,19 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 			uint32_t reg, uint32_t v,
 			uint32_t acc_flags)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if ((reg * 4) < adev->rmmio_size) {
 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
-		    amdgpu_sriov_runtime(adev) &&
-		    down_read_trylock(&adev->reset_sem)) {
+		    amdgpu_sriov_runtime(adev)) {
 			amdgpu_kiq_wreg(adev, reg, v);
-			up_read(&adev->reset_sem);
 		} else {
 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 		}
@@ -440,6 +453,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 		adev->pcie_wreg(adev, reg * 4, v);
 	}
 
+	if (locked)
+		up_read(&adev->reset_sem);
+
 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 }
 
@@ -451,9 +467,15 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 			     uint32_t reg, uint32_t v)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if (amdgpu_sriov_fullaccess(adev) &&
 	    adev->gfx.rlc.funcs &&
 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
@@ -462,6 +484,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 	} else {
 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 	}
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -496,15 +521,24 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if ((reg * 4) < adev->rio_mem_size)
 		iowrite32(v, adev->rio_mem + (reg * 4));
 	else {
 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 	}
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -679,6 +713,11 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	bool locked;
+
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -689,6 +728,9 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 	writel(reg_data, pcie_data_offset);
 	readl(pcie_data_offset);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -708,6 +750,11 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	bool locked;
+
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -724,6 +771,9 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 	writel((u32)(reg_data >> 32), pcie_data_offset);
 	readl(pcie_data_offset);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -4459,6 +4509,8 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
 		break;
 	}
 
+	adev->recovery_thread = current_thread_info();
+
 	return true;
 }
 
@@ -4467,6 +4519,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
 	atomic_set(&adev->in_gpu_reset, 0);
+	adev->recovery_thread = NULL;
 	up_write(&adev->reset_sem);
 }
 
@@ -4609,7 +4662,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	dev_info(adev->dev, "GPU %s begin!\n",
 		need_emergency_restart ? "jobs stop":"reset");
-
 	/*
 	 * Here we trylock to avoid chain of resets executing from
 	 * either trigger by jobs on different adevs in XGMI hive or jobs on
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* RE: [PATCH] drm/amdgpu: block hardware access by other threads when doing gpu recovery
  2021-03-01 11:12 [PATCH] drm/amdgpu: block hardware access by other threads when doing gpu recovery Dennis Li
@ 2021-03-01 11:43 ` Chen, Jiansong (Simon)
  2021-03-01 11:52 ` Christian König
  1 sibling, 0 replies; 3+ messages in thread
From: Chen, Jiansong (Simon) @ 2021-03-01 11:43 UTC (permalink / raw)
  To: Li, Dennis, amd-gfx, Deucher, Alexander, Kuehling, Felix, Zhang,
	Hawking, Koenig, Christian
  Cc: Li, Dennis

For all the "locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();" lines, the logical operator "&&" should be used instead of the bitwise "&".
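
For reference, the corrected line would read as follows (a sketch of the operator fix only):

	/* "&&" is the logical AND: it short-circuits and reads as a boolean condition. */
	locked = likely(!amdgpu_in_recovery_thread(adev)) && !in_irq();

Both operands are already 0/1 here, so the bitwise "&" happens to compute the same value, but "&&" is the idiomatic operator for a logical condition.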

Regards,
Jiansong
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Dennis Li
Sent: Monday, March 1, 2021 7:12 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Cc: Li, Dennis <Dennis.Li@amd.com>
Subject: [PATCH] drm/amdgpu: block hardware access by other threads when doing gpu recovery

When the GPU recovery thread is performing a GPU reset, it is unsafe for other threads to access the hardware concurrently, which can cause the GPU reset to hang randomly.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* Re: [PATCH] drm/amdgpu: block hardware access by other threads when doing gpu recovery
  2021-03-01 11:12 [PATCH] drm/amdgpu: block hardware access by other threads when doing gpu recovery Dennis Li
  2021-03-01 11:43 ` Chen, Jiansong (Simon)
@ 2021-03-01 11:52 ` Christian König
  1 sibling, 0 replies; 3+ messages in thread
From: Christian König @ 2021-03-01 11:52 UTC (permalink / raw)
  To: Dennis Li, amd-gfx, Alexander.Deucher, felix.kuehling,
	Hawking.Zhang, Andrey Grodzovsky

Adding Andrey as well. Need to wrap my head around that problem once more.

In general, the in_irq()/in_interrupt() macros should not be used in
driver code any more, and I'm pretty sure we need to block the access at
a higher level.
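
One possible direction, offered purely as a sketch and not necessarily
the "higher level" solution meant above, would be to test for process
context with in_task() from <linux/preempt.h> instead of the legacy
in_irq():

	/* Sketch only: in_task() is true only in process context, so one
	 * check covers hardirq, softirq and NMI alike. Whether the read
	 * lock belongs in the register accessors at all remains the open
	 * design question. */
	locked = !amdgpu_in_recovery_thread(adev) && in_task();
	if (locked)
		down_read(&adev->reset_sem);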

Regards,
Christian.

Am 01.03.21 um 12:12 schrieb Dennis Li:
> When the GPU recovery thread is performing a GPU reset, it is unsafe
> for other threads to access the hardware concurrently, which can cause
> the GPU reset to hang randomly.
>
> Signed-off-by: Dennis Li <Dennis.Li@amd.com>

