From: "Grodzovsky, Andrey" <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
To: "dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org"
	<dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>,
	"amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org"
	<amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>,
	"eric-WhKQ6XTQaPysTnJN9+BGXg@public.gmane.org"
	<eric-WhKQ6XTQaPysTnJN9+BGXg@public.gmane.org>,
	"etnaviv-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org"
	<etnaviv-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>,
	"ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org"
	<ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Cc: "Kazlauskas,
	Nicholas" <Nicholas.Kazlauskas-5C7GfCeVMHo@public.gmane.org>,
	"Koenig,
	Christian" <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.
Date: Mon, 22 Apr 2019 11:54:25 +0000
Message-ID: <7a09bdb5-0acb-3df1-6691-4940920411ec@amd.com>
In-Reply-To: <1555599624-12285-6-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>

Ping for patch 3, the new patch 5, and patch 6.

Andrey

On 4/18/19 11:00 AM, Andrey Grodzovsky wrote:
> Also reject TDRs if another one is already running.
>
> v2:
> Stop all schedulers across device and entire XGMI hive before
> force signaling HW fences.
> Avoid passing job_signaled to helper functions to keep all the decision
> making about skipping HW reset in one place.
>
> v3:
> Fix SW scheduler hang after a non-HW reset. sched.hw_rq_count has to be balanced
> against its decrement in drm_sched_stop in the non-HW-reset case.
> v4: rebase
> v5: Revert v3 as we do it now in scheduler code.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>   1 file changed, 95 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a0e165c..85f8792 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   		if (!ring || !ring->sched.thread)
>   			continue;
>   
> -		drm_sched_stop(&ring->sched, &job->base);
> -
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	if(job)
>   		drm_sched_increase_karma(&job->base);
>   
> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>   	if (!amdgpu_sriov_vf(adev)) {
>   
>   		if (!need_full_reset)
> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	return r;
>   }
>   
> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>   {
> -	int i;
> -
> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -		struct amdgpu_ring *ring = adev->rings[i];
> -
> -		if (!ring || !ring->sched.thread)
> -			continue;
> -
> -		if (!adev->asic_reset_res)
> -			drm_sched_resubmit_jobs(&ring->sched);
> +	if (trylock) {
> +		if (!mutex_trylock(&adev->lock_reset))
> +			return false;
> +	} else
> +		mutex_lock(&adev->lock_reset);
>   
> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
> -	}
> -
> -	if (!amdgpu_device_has_dc_support(adev)) {
> -		drm_helper_resume_force_mode(adev->ddev);
> -	}
> -
> -	adev->asic_reset_res = 0;
> -}
> -
> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
> -{
> -	mutex_lock(&adev->lock_reset);
>   	atomic_inc(&adev->gpu_reset_counter);
>   	adev->in_gpu_reset = 1;
>   	/* Block kfd: SRIOV would do it separately */
>   	if (!amdgpu_sriov_vf(adev))
>                   amdgpu_amdkfd_pre_reset(adev);
> +
> +	return true;
>   }
>   
>   static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
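The locking helper above reduces to a standard trylock-or-block guard: the
timeout handler trylocks so a second TDR bails out instead of stacking a
second reset, while callers that must serialize (the other devices in an
XGMI hive) block. A minimal kernel-style sketch of just that pattern, for
reference while reading the hunk (illustrative only; the function name is
hypothetical, but mutex_lock() and mutex_trylock() are the real kernel
primitives, mutex_trylock() returning 1 on success):

static bool example_lock_for_reset(struct mutex *reset_lock, bool trylock)
{
	if (trylock) {
		/* A reset is already in flight: bail out, don't queue. */
		if (!mutex_trylock(reset_lock))
			return false;
	} else {
		/* Serialize behind the reset currently running. */
		mutex_lock(reset_lock);
	}
	return true;
}

In the patch, trylock is !hive: on bare metal the per-device lock is
trylocked, while inside a hive the hive->reset_lock trylock has already
rejected concurrent handlers, so the per-device locks can block safely.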
> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			      struct amdgpu_job *job)
>   {
> -	int r;
> +	struct list_head device_list, *device_list_handle =  NULL;
> +	bool need_full_reset, job_signaled;
>   	struct amdgpu_hive_info *hive = NULL;
> -	bool need_full_reset = false;
>   	struct amdgpu_device *tmp_adev = NULL;
> -	struct list_head device_list, *device_list_handle =  NULL;
> +	int i, r = 0;
>   
> +	need_full_reset = job_signaled = false;
>   	INIT_LIST_HEAD(&device_list);
>   
>   	dev_info(adev->dev, "GPU reset begin!\n");
>   
> +	hive = amdgpu_get_xgmi_hive(adev, false);
> +
>   	/*
> -	 * In case of XGMI hive disallow concurrent resets to be triggered
> -	 * by different nodes. No point also since the one node already executing
> -	 * reset will also reset all the other nodes in the hive.
> +	 * Here we trylock to avoid a chain of resets executing, triggered
> +	 * either by jobs on different adevs in an XGMI hive or by jobs on
> +	 * different schedulers for the same device, while this TO handler is running.
> +	 * We always reset all schedulers for a device and all devices in an XGMI
> +	 * hive, so that should take care of them too.
>   	 */
> -	hive = amdgpu_get_xgmi_hive(adev, 0);
> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
> -	    !mutex_trylock(&hive->reset_lock))
> +
> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
> +			 job->base.id, hive->hive_id);
>   		return 0;
> +	}
>   
>   	/* Start with adev pre asic reset first for soft reset check.*/
> -	amdgpu_device_lock_adev(adev);
> -	r = amdgpu_device_pre_asic_reset(adev,
> -					 job,
> -					 &need_full_reset);
> -	if (r) {
> -		/*TODO Should we stop ?*/
> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
> -			  r, adev->ddev->unique);
> -		adev->asic_reset_res = r;
> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
> +					 job->base.id);
> +		return 0;
>   	}
>   
>   	/* Build list of devices to reset */
> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>   		if (!hive) {
>   			amdgpu_device_unlock_adev(adev);
>   			return -ENODEV;
> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		device_list_handle = &device_list;
>   	}
>   
> +	/* block all schedulers and reset given job's ring */
> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			drm_sched_stop(&ring->sched, &job->base);
> +		}
> +	}
> +
> +
> +	/*
> +	 * Must check guilty signal here since after this point all old
> +	 * HW fences are force signaled.
> +	 *
> +	 * job->base holds a reference to parent fence
> +	 */
> +	if (job && job->base.s_fence->parent &&
> +	    dma_fence_is_signaled(job->base.s_fence->parent))
> +		job_signaled = true;
> +
> +	if (!amdgpu_device_ip_need_full_reset(adev))
> +		device_list_handle = &device_list;
> +
> +	if (job_signaled) {
> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
> +		goto skip_hw_reset;
> +	}
> +
> +
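One subtlety worth flagging: this check must sit exactly here, after
drm_sched_stop() has parked the schedulers but before
amdgpu_device_pre_asic_reset() runs, because pre-reset force-completes the
HW fences and every old fence would then read as signaled. A hedged sketch
of the test (kernel-style; the function name is hypothetical,
dma_fence_is_signaled() is the real primitive):

static bool example_guilty_already_signaled(struct amdgpu_job *job)
{
	/* job->base holds a reference to the parent (HW) fence. */
	return job && job->base.s_fence->parent &&
	       dma_fence_is_signaled(job->base.s_fence->parent);
}

When this returns true the handler jumps to skip_hw_reset and only
restarts the schedulers, sparing a full ASIC reset for a job that
completed on its own.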
> +	/* Guilty job will be freed after this */
> +	r = amdgpu_device_pre_asic_reset(adev,
> +					 job,
> +					 &need_full_reset);
> +	if (r) {
> +		/*TODO Should we stop ?*/
> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
> +			  r, adev->ddev->unique);
> +		adev->asic_reset_res = r;
> +	}
> +
>   retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>   
>   		if (tmp_adev == adev)
>   			continue;
>   
> -		amdgpu_device_lock_adev(tmp_adev);
> +		amdgpu_device_lock_adev(tmp_adev, false);
>   		r = amdgpu_device_pre_asic_reset(tmp_adev,
>   						 NULL,
>   						 &need_full_reset);
> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			goto retry;
>   	}
>   
> +skip_hw_reset:
> +
>   	/* Post ASIC reset for all devs .*/
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -		amdgpu_device_post_asic_reset(tmp_adev);
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			/* No point in resubmitting jobs if we didn't HW reset */
> +			if (!tmp_adev->asic_reset_res && !job_signaled)
> +				drm_sched_resubmit_jobs(&ring->sched);
> +
> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
> +		}
> +
> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
> +			drm_helper_resume_force_mode(tmp_adev->ddev);
> +		}
> +
> +		tmp_adev->asic_reset_res = 0;
>   
>   		if (r) {
>   			/* bad news, how to tell it to userspace ? */
> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		amdgpu_device_unlock_adev(tmp_adev);
>   	}
>   
> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
> +	if (hive)
>   		mutex_unlock(&hive->reset_lock);
>   
>   	if (r)
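The "reject TDRs if another one is already running" rule from the commit
message can be reproduced outside the kernel. Below is a small userspace
pthreads analogue (an illustration under stated assumptions, not amdgpu
code; build with gcc -pthread tdr_demo.c). Two timeout handlers race for
one reset lock; whichever loses bails out, mirroring the "Bailing on
TDR ... another already in progress" path above:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reset_lock = PTHREAD_MUTEX_INITIALIZER;

static void *timeout_handler(void *arg)
{
	const char *name = arg;

	/* Non-blocking attempt, like mutex_trylock() in the patch. */
	if (pthread_mutex_trylock(&reset_lock) != 0) {
		printf("%s: reset already in progress, bailing\n", name);
		return NULL;
	}
	printf("%s: performing recovery\n", name);
	/* ... stop schedulers, reset HW, resubmit, restart ... */
	pthread_mutex_unlock(&reset_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, timeout_handler, (void *)"handler A");
	pthread_create(&b, NULL, timeout_handler, (void *)"handler B");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Depending on timing the second thread either bails or runs after the
first finishes; either way recovery never executes twice concurrently,
which is the invariant the hive and per-device locks enforce.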