From: "Christian König" <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
To: Monk Liu <Monk.Liu-5C7GfCeVMHo@public.gmane.org>,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: Re: [PATCH 1/6] amd/scheduler: implement job skip feature (v3)
Date: Mon, 30 Oct 2017 11:00:11 +0100
Message-ID: <de065e02-acb7-2891-70ca-5eab3ce3365d@gmail.com>
In-Reply-To: <1509336909-11455-2-git-send-email-Monk.Liu@amd.com>

On 30.10.2017 05:15, Monk Liu wrote:
> Jobs are skipped in two cases:
> 1) When the entity behind a job is marked guilty, the job
> popped from that entity's queue is dropped in the sched_main loop.
>
> 2) In job_recovery(), skip scheduling a job if its karma exceeds
> the limit, and likewise skip all other jobs sharing the same
> fence context. This approach is needed because job_recovery()
> cannot access job->entity, as the entity may already be dead.
>
> v2:
> some logic fixes
>
> v3:
> When an entity is detected as guilty, don't drop the job in the
> popping stage; instead set its fence error to -ECANCELED.
>
> In run_job(), skip scheduling if either: 1) fence->error < 0,
> or 2) VRAM was lost after the job was submitted.
> This way the job skipping logic is unified.
>
> With this feature we can introduce the new GPU recover feature.
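
To recap the unified path in one place (an illustrative sketch
condensed from the two hunks below, not part of the patch): any stage
that wants a job dropped tags the scheduler fence with an error, and
run_job() honors that tag in a single spot.

	/* producer side: mark the job instead of dropping it */
	dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);

	/* consumer side, in amdgpu_job_run(): one check covers both the
	 * guilty-entity case and the VRAM-lost case
	 */
	if (finished->error < 0)
		DRM_INFO("Skip scheduling IBs!\n");
	else
		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs,
				       job, &fence);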
>
> Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 13 +++++----
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 ++++++++++++++++-----------
>   2 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index f60662e..0a90c76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
>   
>   static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   {
> -	struct dma_fence *fence = NULL;
> +	struct dma_fence *fence = NULL, *finished;
>   	struct amdgpu_device *adev;
>   	struct amdgpu_job *job;
>   	int r;
> @@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   		return NULL;
>   	}
>   	job = to_amdgpu_job(sched_job);
> +	finished = &job->base.s_fence->finished;
>   	adev = job->adev;
>   
>   	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
>   
>   	trace_amdgpu_sched_run_job(job);
> -	/* skip ib schedule when vram is lost */
> -	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
> -		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
> -		DRM_ERROR("Skip scheduling IBs!\n");
> +
> +	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
> +		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
> +
> +	if (finished->error < 0) {
> +		DRM_INFO("Skip scheduling IBs!\n");
>   	} else {
>   		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
>   				       &fence);
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 903ef8b..3d8c994 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>   		if (amd_sched_entity_add_dependency_cb(entity))
>   			return NULL;
>   
> +	/* skip jobs from entity that marked guilty */
> +	if (entity->guilty && atomic_read(entity->guilty))
> +		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
> +
>   	spsc_queue_pop(&entity->job_queue);
>   	return sched_job;
>   }
> @@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
>   	job->sched->ops->timedout_job(job);
>   }
>   
> -static void amd_sched_set_guilty(struct amd_sched_job *s_job,
> -				 struct amd_sched_entity *s_entity)
> -{
> -	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
> -		if (s_entity->guilty)
> -			atomic_set(s_entity->guilty, 1);
> -}
> -
>   void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
>   {
>   	struct amd_sched_job *s_job;
> @@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
>   	spin_unlock(&sched->job_list_lock);
>   
>   	if (bad) {
> -		bool found = false;
> -
> -		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
> +		/* don't increase @bad's karma if it's from the KERNEL RQ,
> +		 * because a GPU hang can also corrupt kernel jobs (like VM
> +		 * updating jobs), but kernel jobs are always considered good.
> +		 */
> +		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
>   			struct amd_sched_rq *rq = &sched->sched_rq[i];
>   
>   			spin_lock(&rq->lock);
>   			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>   				if (bad->s_fence->scheduled.context == entity->fence_context) {
> -					found = true;
> -					amd_sched_set_guilty(bad, entity);
> +					if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
> +						if (entity->guilty)
> +							atomic_set(entity->guilty, 1);
>   					break;
>   				}
>   			}
>   			spin_unlock(&rq->lock);
> -			if (found)
> +			if (&entity->list == &rq->entities)

That needs to be "&entity->list != &rq->entities"; otherwise we only
check the first run queue.
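
The subtlety is the list_for_each_entry_safe() termination idiom: when
the inner loop runs to completion without hitting the break, the cursor
ends up aliasing the list head itself. A minimal sketch of the corrected
check (simplified from the hunk above, not part of the patch):

	list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
		if (bad->s_fence->scheduled.context == entity->fence_context)
			break;	/* found: cursor points at a real entity */
	}
	/* no match: "entity" aliases the head, so the test below is false */
	if (&entity->list != &rq->entities)
		break;	/* match found in this rq, stop scanning the others */

With "==" the outer loop instead stops as soon as the first run queue
comes up empty, and the lower-priority queues are never scanned.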

With that fixed, the patch is Reviewed-by: Christian König
<christian.koenig@amd.com>.

Nice work,
Christian.

>   				break;
>   		}
>   	}
> @@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
>   void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   {
>   	struct amd_sched_job *s_job, *tmp;
> +	bool found_guilty = false;
>   	int r;
>   
>   	spin_lock(&sched->job_list_lock);
> @@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>   		struct amd_sched_fence *s_fence = s_job->s_fence;
>   		struct dma_fence *fence;
> +		uint64_t guilty_context;
> +
> +		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
> +			found_guilty = true;
> +			guilty_context = s_job->s_fence->scheduled.context;
> +		}
> +
> +		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
> +			dma_fence_set_error(&s_fence->finished, -ECANCELED);
>   
>   		spin_unlock(&sched->job_list_lock);
>   		fence = sched->ops->run_job(s_job);
> @@ -525,7 +534,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   					  r);
>   			dma_fence_put(fence);
>   		} else {
> -			DRM_ERROR("Failed to run job!\n");
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   		spin_lock(&sched->job_list_lock);
> @@ -663,7 +671,6 @@ static int amd_sched_main(void *param)
>   					  r);
>   			dma_fence_put(fence);
>   		} else {
> -			DRM_ERROR("Failed to run job!\n");
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   

