All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
To: Horace Chen <horace.chen@amd.com>, amd-gfx@lists.freedesktop.org
Cc: "Jack Xiao" <Jack.Xiao@amd.com>, "Feifei Xu" <Feifei.Xu@amd.com>,
	"Kevin Wang" <Kevin1.Wang@amd.com>,
	"Xiaojie Yuan" <xiaojie.yuan@amd.com>,
	"Tuikov Luben" <Luben.Tuikov@amd.com>,
	"Deucher Alexander" <Alexander.Deucher@amd.com>,
	"Evan Quan" <Evan.Quan@amd.com>,
	"Christian König" <christian.koenig@amd.com>,
	"Monk Liu" <Monk.Liu@amd.com>,
	"Hawking Zhang" <Hawking.Zhang@amd.com>
Subject: Re: [PATCH] drm/amdgpu: race issue when jobs on 2 ring timeout
Date: Wed, 20 Jan 2021 11:39:54 -0500	[thread overview]
Message-ID: <54b28e65-226d-a9a2-2448-c2decd1a7d58@amd.com> (raw)
In-Reply-To: <20210120141250.4095-1-horace.chen@amd.com>


On 1/20/21 9:12 AM, Horace Chen wrote:
> Fix a racing issue when jobs on 2 rings timeout simultaneously.
>
> If 2 rings timed out at the same time, the
> amdgpu_device_gpu_recover will be reentered. Then the
> adev->gmc.xgmi.head will be grabbed by 2 local linked list,
> which may cause wild pointer issue in iterating.
>
> lock the device earily to prevent the node be added to 2
> different lists.
>
> also increase karma for the skipped job since the job is also
> timed out and should be guilty.
>
> Signed-off-by: Horace Chen <horace.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 70 +++++++++++++++++++---
>   1 file changed, 61 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 4d434803fb49..d59d3182ac2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4460,6 +4460,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>   	up_write(&adev->reset_sem);
>   }
>   
> +/*
> + * to lockup a list of amdgpu devices in a hive safely, if not a hive
> + * with multiple nodes, it will be same as amdgpu_device_lock_adev.
> + *
> + * unlock won't require roll back.
> + */
> +static bool amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> +	struct amdgpu_device *tmp_adev = NULL;
> +
> +	if (adev->gmc.xgmi.num_physical_nodes > 1) {
> +		if (!hive) {
> +			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
> +			return false;
> +		}
> +		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> +			if (!amdgpu_device_lock_adev(tmp_adev, hive))
> +				goto roll_back;
> +		}
> +		return true;
> +	} else {
> +		return amdgpu_device_lock_adev(adev, hive);
> +	}
> +roll_back:
> +	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
> +		/*
> +		 * if the lockup iteration break in the middle of a hive,
> +		 * it may means there may has a race issue,
> +		 * or a hive device locked up independently.
> +		 * we may be in trouble and may not,
> +		 * so will try to roll back the lock and give out a warnning.
> +		 */
> +		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
> +		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> +			amdgpu_device_unlock_adev(tmp_adev);
> +		}
> +	}
> +	return false;
> +}
> +
>   static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
>   {
>   	struct pci_dev *p = NULL;
> @@ -4573,11 +4613,32 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>   				job ? job->base.id : -1, hive->hive_id);
>   			amdgpu_put_xgmi_hive(hive);
> +			if (job)
> +				drm_sched_increase_karma(&job->base);
>   			return 0;
>   		}
>   		mutex_lock(&hive->hive_lock);
>   	}
>   
> +	/*
> +	 * lock the device before we try to operate the linked list
> +	 * if didn't get the device lock, don't touch the linked list since
> +	 * others may iterating it.
> +	 */
> +	if (!amdgpu_device_lock_hive_adev(adev, hive)) {
> +		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> +					job ? job->base.id : -1);
> +
> +		if (adev->gmc.xgmi.num_physical_nodes > 1 && !hive)
> +			r = -ENODEV;
> +		else
> +			r = 0;


You can just change amdgpu_device_lock_hive_adev return type to int instead
of code duplication, maybe returning EAGAIN for actual locking failure.

Andrey


> +		/* even we skipped this reset, still need to set the job to guilty */
> +		if (job)
> +			drm_sched_increase_karma(&job->base);
> +		goto skip_recovery;
> +	}
> +
>   	/*
>   	 * Build list of devices to reset.
>   	 * In case we are in XGMI hive mode, resort the device list
> @@ -4585,8 +4646,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	 */
>   	INIT_LIST_HEAD(&device_list);
>   	if (adev->gmc.xgmi.num_physical_nodes > 1) {
> -		if (!hive)
> -			return -ENODEV;
>   		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
>   			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
>   		device_list_handle = &hive->device_list;
> @@ -4597,13 +4656,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   
>   	/* block all schedulers and reset given job's ring */
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
> -			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> -				  job ? job->base.id : -1);
> -			r = 0;
> -			goto skip_recovery;
> -		}
> -
>   		/*
>   		 * Try to put the audio codec into suspend state
>   		 * before gpu reset started.
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

       reply	other threads:[~2021-01-20 16:40 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20210120141250.4095-1-horace.chen@amd.com>
2021-01-20 16:39 ` Andrey Grodzovsky [this message]
     [not found] <20210121102106.13257-1-horace.chen@amd.com>
2021-01-21 18:16 ` [PATCH] drm/amdgpu: race issue when jobs on 2 ring timeout Andrey Grodzovsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=54b28e65-226d-a9a2-2448-c2decd1a7d58@amd.com \
    --to=andrey.grodzovsky@amd.com \
    --cc=Alexander.Deucher@amd.com \
    --cc=Evan.Quan@amd.com \
    --cc=Feifei.Xu@amd.com \
    --cc=Hawking.Zhang@amd.com \
    --cc=Jack.Xiao@amd.com \
    --cc=Kevin1.Wang@amd.com \
    --cc=Luben.Tuikov@amd.com \
    --cc=Monk.Liu@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=horace.chen@amd.com \
    --cc=xiaojie.yuan@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.