All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Christian König" <christian.koenig@amd.com>
To: Dennis Li <Dennis.Li@amd.com>,
	amd-gfx@lists.freedesktop.org, Alexander.Deucher@amd.com,
	felix.kuehling@amd.com, Hawking.Zhang@amd.com,
	Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
Subject: Re: [PATCH] drm/amdgpu: block hardware accessed by other threads when doing gpu recovery
Date: Mon, 1 Mar 2021 12:52:39 +0100	[thread overview]
Message-ID: <2b7c3869-e750-edeb-9b0d-0a4fbd029211@amd.com> (raw)
In-Reply-To: <20210301111225.11330-1-Dennis.Li@amd.com>

Adding Andrey as well. Need to wrap my head around that problem once more.

In general the in_irq/in_interrupt macros should not be used in driver 
code any more and I'm pretty sure we need to block the access at a 
higher level.

Regards,
Christian.

Am 01.03.21 um 12:12 schrieb Dennis Li:
> When GPU recovery thread is doing GPU reset, it is unsafe that other
> threads access hardware concurrently, which could cause GPU reset
> randomly hang.
>
> Signed-off-by: Dennis Li <Dennis.Li@amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1624c2bc8285..c71d3bba5f69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1033,6 +1033,7 @@ struct amdgpu_device {
>   	atomic_t 			in_gpu_reset;
>   	enum pp_mp1_state               mp1_state;
>   	struct rw_semaphore reset_sem;
> +	struct thread_info *recovery_thread;
>   	struct amdgpu_doorbell_index doorbell_index;
>   
>   	struct mutex			notifier_lock;
> @@ -1385,4 +1386,13 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev)
>   {
>   	return atomic_read(&adev->in_gpu_reset);
>   }
> +
> +static inline bool amdgpu_in_recovery_thread(struct amdgpu_device *adev)
> +{
> +	if (unlikely(adev->recovery_thread != NULL) &&
> +		adev->recovery_thread == current_thread_info())
> +		return true;
> +
> +	return false;
> +}
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 71805dfd9e25..7c17a5468d43 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -401,13 +401,22 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>    */
>   void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>   {
> +	bool locked;
> +
>   	if (adev->in_pci_err_recovery)
>   		return;
>   
> +	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
> +	if (locked)
> +		down_read(&adev->reset_sem);
> +
>   	if (offset < adev->rmmio_size)
>   		writeb(value, adev->rmmio + offset);
>   	else
>   		BUG();
> +
> +	if (locked)
> +		up_read(&adev->reset_sem);
>   }
>   
>   /**
> @@ -424,15 +433,19 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
>   			uint32_t reg, uint32_t v,
>   			uint32_t acc_flags)
>   {
> +	bool locked;
> +
>   	if (adev->in_pci_err_recovery)
>   		return;
>   
> +	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
> +	if (locked)
> +		down_read(&adev->reset_sem);
> +
>   	if ((reg * 4) < adev->rmmio_size) {
>   		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> -		    amdgpu_sriov_runtime(adev) &&
> -		    down_read_trylock(&adev->reset_sem)) {
> +		    amdgpu_sriov_runtime(adev)) {
>   			amdgpu_kiq_wreg(adev, reg, v);
> -			up_read(&adev->reset_sem);
>   		} else {
>   			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
>   		}
> @@ -440,6 +453,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
>   		adev->pcie_wreg(adev, reg * 4, v);
>   	}
>   
> +	if (locked)
> +		up_read(&adev->reset_sem);
> +
>   	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
>   }
>   
> @@ -451,9 +467,15 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
>   void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>   			     uint32_t reg, uint32_t v)
>   {
> +	bool locked;
> +
>   	if (adev->in_pci_err_recovery)
>   		return;
>   
> +	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
> +	if (locked)
> +		down_read(&adev->reset_sem);
> +
>   	if (amdgpu_sriov_fullaccess(adev) &&
>   	    adev->gfx.rlc.funcs &&
>   	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
> @@ -462,6 +484,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>   	} else {
>   		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
>   	}
> +
> +	if (locked)
> +		up_read(&adev->reset_sem);
>   }
>   
>   /**
> @@ -496,15 +521,24 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>    */
>   void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>   {
> +	bool locked;
> +
>   	if (adev->in_pci_err_recovery)
>   		return;
>   
> +	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
> +	if (locked)
> +		down_read(&adev->reset_sem);
> +
>   	if ((reg * 4) < adev->rio_mem_size)
>   		iowrite32(v, adev->rio_mem + (reg * 4));
>   	else {
>   		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
>   		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
>   	}
> +
> +	if (locked)
> +		up_read(&adev->reset_sem);
>   }
>   
>   /**
> @@ -679,6 +713,11 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
>   	unsigned long flags;
>   	void __iomem *pcie_index_offset;
>   	void __iomem *pcie_data_offset;
> +	bool locked;
> +
> +	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
> +	if (locked)
> +		down_read(&adev->reset_sem);
>   
>   	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
> @@ -689,6 +728,9 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
>   	writel(reg_data, pcie_data_offset);
>   	readl(pcie_data_offset);
>   	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
> +
> +	if (locked)
> +		up_read(&adev->reset_sem);
>   }
>   
>   /**
> @@ -708,6 +750,11 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>   	unsigned long flags;
>   	void __iomem *pcie_index_offset;
>   	void __iomem *pcie_data_offset;
> +	bool locked;
> +
> +	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
> +	if (locked)
> +		down_read(&adev->reset_sem);
>   
>   	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
> @@ -724,6 +771,9 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>   	writel((u32)(reg_data >> 32), pcie_data_offset);
>   	readl(pcie_data_offset);
>   	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
> +
> +	if (locked)
> +		up_read(&adev->reset_sem);
>   }
>   
>   /**
> @@ -4459,6 +4509,8 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
>   		break;
>   	}
>   
> +	adev->recovery_thread = current_thread_info();
> +
>   	return true;
>   }
>   
> @@ -4467,6 +4519,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>   	amdgpu_vf_error_trans_all(adev);
>   	adev->mp1_state = PP_MP1_STATE_NONE;
>   	atomic_set(&adev->in_gpu_reset, 0);
> +	adev->recovery_thread = NULL;
>   	up_write(&adev->reset_sem);
>   }
>   
> @@ -4609,7 +4662,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   
>   	dev_info(adev->dev, "GPU %s begin!\n",
>   		need_emergency_restart ? "jobs stop":"reset");
> -
>   	/*
>   	 * Here we trylock to avoid chain of resets executing from
>   	 * either trigger by jobs on different adevs in XGMI hive or jobs on

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

      parent reply	other threads:[~2021-03-01 11:52 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-01 11:12 [PATCH] drm/amdgpu: block hardware accessed by other threads when doing gpu recovery Dennis Li
2021-03-01 11:43 ` Chen, Jiansong (Simon)
2021-03-01 11:52 ` Christian König [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2b7c3869-e750-edeb-9b0d-0a4fbd029211@amd.com \
    --to=christian.koenig@amd.com \
    --cc=Alexander.Deucher@amd.com \
    --cc=Andrey.Grodzovsky@amd.com \
    --cc=Dennis.Li@amd.com \
    --cc=Hawking.Zhang@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=felix.kuehling@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.