* Re: [PATCH] drm/amdgpu: race issue when jobs on 2 ring timeout
       [not found] <20210120141250.4095-1-horace.chen@amd.com>
@ 2021-01-20 16:39 ` Andrey Grodzovsky
  0 siblings, 0 replies; 2+ messages in thread

From: Andrey Grodzovsky @ 2021-01-20 16:39 UTC (permalink / raw)
  To: Horace Chen, amd-gfx
  Cc: Jack Xiao, Feifei Xu, Kevin Wang, Xiaojie Yuan, Tuikov Luben,
      Deucher Alexander, Evan Quan, Christian König, Monk Liu,
      Hawking Zhang

On 1/20/21 9:12 AM, Horace Chen wrote:
> Fix a racing issue when jobs on 2 rings time out simultaneously.
>
> If 2 rings time out at the same time, amdgpu_device_gpu_recover
> will be re-entered. Then adev->gmc.xgmi.head will be grabbed by
> 2 local linked lists, which may cause a wild pointer issue while
> iterating.
>
> Lock the device early to prevent the node from being added to 2
> different lists.
>
> Also increase karma for the skipped job, since that job also
> timed out and should be guilty.
>
> Signed-off-by: Horace Chen <horace.chen@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 70 +++++++++++++++++++---
>  1 file changed, 61 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 4d434803fb49..d59d3182ac2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4460,6 +4460,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>  	up_write(&adev->reset_sem);
>  }
>  
> +/*
> + * Lock a list of amdgpu devices in a hive safely. If this is not a
> + * hive with multiple nodes, it is the same as amdgpu_device_lock_adev.
> + *
> + * Unlock won't require roll back.
> + */
> +static bool amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> +	struct amdgpu_device *tmp_adev = NULL;
> +
> +	if (adev->gmc.xgmi.num_physical_nodes > 1) {
> +		if (!hive) {
> +			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
> +			return false;
> +		}
> +		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> +			if (!amdgpu_device_lock_adev(tmp_adev, hive))
> +				goto roll_back;
> +		}
> +		return true;
> +	} else {
> +		return amdgpu_device_lock_adev(adev, hive);
> +	}
> +roll_back:
> +	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
> +		/*
> +		 * If the lock iteration broke in the middle of a hive,
> +		 * there may be a race, or a hive device may have locked
> +		 * up independently. We may or may not be in trouble, so
> +		 * try to roll back the locks and give out a warning.
> +		 */
> +		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
> +		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> +			amdgpu_device_unlock_adev(tmp_adev);
> +		}
> +	}
> +	return false;
> +}
> +
>  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
>  {
>  	struct pci_dev *p = NULL;
> @@ -4573,11 +4613,32 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>  				 job ? job->base.id : -1, hive->hive_id);
>  			amdgpu_put_xgmi_hive(hive);
> +			if (job)
> +				drm_sched_increase_karma(&job->base);
>  			return 0;
>  		}
>  		mutex_lock(&hive->hive_lock);
>  	}
>  
> +	/*
> +	 * Lock the device before we try to operate on the linked list.
> +	 * If we didn't get the device lock, don't touch the linked list,
> +	 * since others may be iterating it.
> +	 */
> +	if (!amdgpu_device_lock_hive_adev(adev, hive)) {
> +		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> +			 job ? job->base.id : -1);
> +
> +		if (adev->gmc.xgmi.num_physical_nodes > 1 && !hive)
> +			r = -ENODEV;
> +		else
> +			r = 0;

You can just change the amdgpu_device_lock_hive_adev return type to int
instead of duplicating code, maybe returning EAGAIN for an actual locking
failure.

Andrey

> +		/* Even if we skipped this reset, we still need to mark the job guilty */
> +		if (job)
> +			drm_sched_increase_karma(&job->base);
> +		goto skip_recovery;
> +	}
> +
>  	/*
>  	 * Build list of devices to reset.
>  	 * In case we are in XGMI hive mode, resort the device list
> @@ -4585,8 +4646,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  	 */
>  	INIT_LIST_HEAD(&device_list);
>  	if (adev->gmc.xgmi.num_physical_nodes > 1) {
> -		if (!hive)
> -			return -ENODEV;
>  		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
>  			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
>  		device_list_handle = &hive->device_list;
> @@ -4597,13 +4656,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  
>  	/* block all schedulers and reset given job's ring */
>  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
> -			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> -				 job ? job->base.id : -1);
> -			r = 0;
> -			goto skip_recovery;
> -		}
> -
>  		/*
>  		 * Try to put the audio codec into suspend state
>  		 * before gpu reset started.

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 2+ messages in thread
[parent not found: <20210121102106.13257-1-horace.chen@amd.com>]
* Re: [PATCH] drm/amdgpu: race issue when jobs on 2 ring timeout
       [not found] <20210121102106.13257-1-horace.chen@amd.com>
@ 2021-01-21 18:16 ` Andrey Grodzovsky
  0 siblings, 0 replies; 2+ messages in thread

From: Andrey Grodzovsky @ 2021-01-21 18:16 UTC (permalink / raw)
  To: Horace Chen, amd-gfx
  Cc: Jack Xiao, Feifei Xu, Kevin Wang, Xiaojie Yuan, Tuikov Luben,
      Deucher Alexander, Evan Quan, Christian König, Monk Liu,
      Hawking Zhang

Looks good to me.

Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Andrey

On 1/21/21 5:21 AM, Horace Chen wrote:
> Fix a racing issue when jobs on 2 rings time out simultaneously.
>
> If 2 rings time out at the same time, amdgpu_device_gpu_recover
> will be re-entered. Then adev->gmc.xgmi.head will be grabbed by
> 2 local linked lists, which may cause a wild pointer issue while
> iterating.
>
> Lock the device early to prevent the node from being added to 2
> different lists.
>
> Also increase karma for the skipped job, since that job also
> timed out and should be guilty.
>
> Signed-off-by: Horace Chen <horace.chen@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 69 ++++++++++++++++++----
>  1 file changed, 59 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 82fc392e4296..702e577be5e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4459,6 +4459,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>  	up_write(&adev->reset_sem);
>  }
>  
> +/*
> + * Lock a list of amdgpu devices in a hive safely. If this is not a
> + * hive with multiple nodes, it is similar to amdgpu_device_lock_adev.
> + *
> + * Unlock won't require roll back.
> + */
> +static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> +	struct amdgpu_device *tmp_adev = NULL;
> +
> +	if (adev->gmc.xgmi.num_physical_nodes > 1) {
> +		if (!hive) {
> +			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
> +			return -ENODEV;
> +		}
> +		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> +			if (!amdgpu_device_lock_adev(tmp_adev, hive))
> +				goto roll_back;
> +		}
> +	} else if (!amdgpu_device_lock_adev(adev, hive))
> +		return -EAGAIN;
> +
> +	return 0;
> +roll_back:
> +	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
> +		/*
> +		 * If the lock iteration broke in the middle of a hive,
> +		 * there may be a race, or a hive device may have locked
> +		 * up independently. We may or may not be in trouble, so
> +		 * try to roll back the locks and give out a warning.
> +		 */
> +		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
> +		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> +			amdgpu_device_unlock_adev(tmp_adev);
> +		}
> +	}
> +	return -EAGAIN;
> +}
> +
>  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
>  {
>  	struct pci_dev *p = NULL;
> @@ -4572,11 +4612,29 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>  				 job ? job->base.id : -1, hive->hive_id);
>  			amdgpu_put_xgmi_hive(hive);
> +			if (job)
> +				drm_sched_increase_karma(&job->base);
>  			return 0;
>  		}
>  		mutex_lock(&hive->hive_lock);
>  	}
>  
> +	/*
> +	 * Lock the device before we try to operate on the linked list.
> +	 * If we didn't get the device lock, don't touch the linked list,
> +	 * since others may be iterating it.
> +	 */
> +	r = amdgpu_device_lock_hive_adev(adev, hive);
> +	if (r) {
> +		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> +			 job ? job->base.id : -1);
> +
> +		/* Even if we skipped this reset, we still need to mark the job guilty */
> +		if (job)
> +			drm_sched_increase_karma(&job->base);
> +		goto skip_recovery;
> +	}
> +
>  	/*
>  	 * Build list of devices to reset.
>  	 * In case we are in XGMI hive mode, resort the device list
> @@ -4584,8 +4642,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  	 */
>  	INIT_LIST_HEAD(&device_list);
>  	if (adev->gmc.xgmi.num_physical_nodes > 1) {
> -		if (!hive)
> -			return -ENODEV;
>  		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
>  			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
>  		device_list_handle = &hive->device_list;
> @@ -4596,13 +4652,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  
>  	/* block all schedulers and reset given job's ring */
>  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
> -			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> -				 job ? job->base.id : -1);
> -			r = 0;
> -			goto skip_recovery;
> -		}
> -
>  		/*
>  		 * Try to put the audio codec into suspend state
>  		 * before gpu reset started.
> @@ -4740,7 +4789,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  		amdgpu_put_xgmi_hive(hive);
>  	}
>  
> -	if (r)
> +	if (r && r != -EAGAIN)
>  		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
>  	return r;
>  }

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 2+ messages in thread