All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dennis Li <Dennis.Li@amd.com>
To: <amd-gfx@lists.freedesktop.org>, <Alexander.Deucher@amd.com>,
	<felix.kuehling@amd.com>, <Hawking.Zhang@amd.com>,
	<christian.koenig@amd.com>
Cc: Dennis Li <Dennis.Li@amd.com>
Subject: [PATCH] drm/amdgpu: block hardware accessed by other threads when doing gpu recovery
Date: Mon, 1 Mar 2021 19:12:25 +0800	[thread overview]
Message-ID: <20210301111225.11330-1-Dennis.Li@amd.com> (raw)

When GPU recovery thread is doing GPU reset, it is unsafe that other
threads access hardware concurrently, which could cause GPU reset
randomly hang.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1624c2bc8285..c71d3bba5f69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1033,6 +1033,7 @@ struct amdgpu_device {
 	atomic_t 			in_gpu_reset;
 	enum pp_mp1_state               mp1_state;
 	struct rw_semaphore reset_sem;
+	struct thread_info *recovery_thread;
 	struct amdgpu_doorbell_index doorbell_index;
 
 	struct mutex			notifier_lock;
@@ -1385,4 +1386,13 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev)
 {
 	return atomic_read(&adev->in_gpu_reset);
 }
+
+static inline bool amdgpu_in_recovery_thread(struct amdgpu_device *adev)
+{
+	if (unlikely(adev->recovery_thread != NULL) &&
+		adev->recovery_thread == current_thread_info())
+		return true;
+
+	return false;
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 71805dfd9e25..7c17a5468d43 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -401,13 +401,22 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if (offset < adev->rmmio_size)
 		writeb(value, adev->rmmio + offset);
 	else
 		BUG();
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -424,15 +433,19 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 			uint32_t reg, uint32_t v,
 			uint32_t acc_flags)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if ((reg * 4) < adev->rmmio_size) {
 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
-		    amdgpu_sriov_runtime(adev) &&
-		    down_read_trylock(&adev->reset_sem)) {
+		    amdgpu_sriov_runtime(adev)) {
 			amdgpu_kiq_wreg(adev, reg, v);
-			up_read(&adev->reset_sem);
 		} else {
 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 		}
@@ -440,6 +453,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 		adev->pcie_wreg(adev, reg * 4, v);
 	}
 
+	if (locked)
+		up_read(&adev->reset_sem);
+
 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 }
 
@@ -451,9 +467,15 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 			     uint32_t reg, uint32_t v)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if (amdgpu_sriov_fullaccess(adev) &&
 	    adev->gfx.rlc.funcs &&
 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
@@ -462,6 +484,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 	} else {
 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 	}
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -496,15 +521,24 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if ((reg * 4) < adev->rio_mem_size)
 		iowrite32(v, adev->rio_mem + (reg * 4));
 	else {
 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 	}
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -679,6 +713,11 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	bool locked;
+
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -689,6 +728,9 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 	writel(reg_data, pcie_data_offset);
 	readl(pcie_data_offset);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -708,6 +750,11 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	bool locked;
+
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -724,6 +771,9 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 	writel((u32)(reg_data >> 32), pcie_data_offset);
 	readl(pcie_data_offset);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -4459,6 +4509,8 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
 		break;
 	}
 
+	adev->recovery_thread = current_thread_info();
+
 	return true;
 }
 
@@ -4467,6 +4519,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
 	atomic_set(&adev->in_gpu_reset, 0);
+	adev->recovery_thread = NULL;
 	up_write(&adev->reset_sem);
 }
 
@@ -4609,7 +4662,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	dev_info(adev->dev, "GPU %s begin!\n",
 		need_emergency_restart ? "jobs stop":"reset");
-
 	/*
 	 * Here we trylock to avoid chain of resets executing from
 	 * either trigger by jobs on different adevs in XGMI hive or jobs on
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

             reply	other threads:[~2021-03-01 11:12 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-01 11:12 Dennis Li [this message]
2021-03-01 11:43 ` [PATCH] drm/amdgpu: block hardware accessed by other threads when doing gpu recovery Chen, Jiansong (Simon)
2021-03-01 11:52 ` Christian König

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210301111225.11330-1-Dennis.Li@amd.com \
    --to=dennis.li@amd.com \
    --cc=Alexander.Deucher@amd.com \
    --cc=Hawking.Zhang@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=felix.kuehling@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.