* [PATCH 0/7] *** GPU recover V3 ***
From: Monk Liu @ 2017-10-30  4:15 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

*** job skipping logic in the scheduler is re-implemented ***
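
At a glance, the series converges every reset path on a single entry
point (condensed from patch 2; @job may be NULL when no hang job is
known, e.g. when triggered from debugfs or an IRQ handler):

	int amdgpu_gpu_recover(struct amdgpu_device *adev,
			       struct amdgpu_job *job);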

Monk Liu (7):
  amd/scheduler:implement job skip feature(v3)
  drm/amdgpu:implement new GPU recover(v3)
  drm/amdgpu:cleanup in_sriov_reset and lock_reset
  drm/amdgpu:cleanup ucode_init_bo
  drm/amdgpu:block kms open during gpu_reset
  drm/amdgpu/sriov:fix memory leak in psp_load_fw
  drm/amdgpu:fix random missing of FLR NOTIFY

 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   9 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 311 ++++++++++++--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     |  10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c       |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  18 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |   3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       |  22 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c     |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |   2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |   2 -
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   6 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   6 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  16 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   2 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  39 ++--
 15 files changed, 220 insertions(+), 232 deletions(-)

-- 
2.7.4


* [PATCH 1/6] amd/scheduler:implement job skip feature(v3)
From: Monk Liu @ 2017-10-30  4:15 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

Jobs are skipped in two cases:
1) when the entity behind the job is marked guilty, the job popped
from that entity's queue is dropped in the sched_main loop.

2) in job_recovery(), skip scheduling a job if its karma exceeds the
limit, and likewise skip all other jobs sharing the same fence
context. This approach is needed because job_recovery() cannot access
job->entity, since the entity may already be dead.

v2:
some logic fixes

v3:
when an entity is detected as guilty, don't drop its job at the
popping stage; instead set the job's fence error to -ECANCELED

in run_job(), skip scheduling if either: 1) fence->error < 0,
or 2) VRAM was lost while this job was pending.
This way the job-skipping logic is unified.

With this feature in place we can introduce the new gpu recover
feature.
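
A condensed view of the resulting skip path (both cases funnel into
the same fence-error check; taken from the hunks below):

	/* pop: don't drop a guilty entity's jobs, just poison the fence */
	if (entity->guilty && atomic_read(entity->guilty))
		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);

	/* run_job: anything with a fence error is fake signaled, no IB submit */
	if (finished->error < 0)
		DRM_INFO("Skip scheduling IBs!\n");
	else
		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs,
				       job, &fence);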

Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 13 +++++----
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 ++++++++++++++++-----------
 2 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index f60662e..0a90c76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
 
 static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 {
-	struct dma_fence *fence = NULL;
+	struct dma_fence *fence = NULL, *finished;
 	struct amdgpu_device *adev;
 	struct amdgpu_job *job;
 	int r;
@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 		return NULL;
 	}
 	job = to_amdgpu_job(sched_job);
+	finished = &job->base.s_fence->finished;
 	adev = job->adev;
 
 	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
 
 	trace_amdgpu_sched_run_job(job);
-	/* skip ib schedule when vram is lost */
-	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
-		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
-		DRM_ERROR("Skip scheduling IBs!\n");
+
+	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
+
+	if (finished->error < 0) {
+		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
 		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
 				       &fence);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 903ef8b..3d8c994 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;
 
+	/* skip jobs from entity that marked guilty */
+	if (entity->guilty && atomic_read(entity->guilty))
+		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
+
 	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
 }
@@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
 	job->sched->ops->timedout_job(job);
 }
 
-static void amd_sched_set_guilty(struct amd_sched_job *s_job,
-				 struct amd_sched_entity *s_entity)
-{
-	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
-		if (s_entity->guilty)
-			atomic_set(s_entity->guilty, 1);
-}
-
 void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
 	struct amd_sched_job *s_job;
@@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
 	spin_unlock(&sched->job_list_lock);
 
 	if (bad) {
-		bool found = false;
-
-		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
+		/* don't increase @bad's karma if it's from KERNEL RQ,
+		 * because sometimes a GPU hang corrupts kernel jobs (like VM
+		 * updating jobs), but kernel jobs are always considered good.
+		 */
+		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
 			struct amd_sched_rq *rq = &sched->sched_rq[i];
 
 			spin_lock(&rq->lock);
 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
 				if (bad->s_fence->scheduled.context == entity->fence_context) {
-					found = true;
-					amd_sched_set_guilty(bad, entity);
+				    if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
+						if (entity->guilty)
+							atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
 			spin_unlock(&rq->lock);
-			if (found)
+			if (&entity->list == &rq->entities)
 				break;
 		}
 	}
@@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 {
 	struct amd_sched_job *s_job, *tmp;
+	bool found_guilty = false;
 	int r;
 
 	spin_lock(&sched->job_list_lock);
@@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct amd_sched_fence *s_fence = s_job->s_fence;
 		struct dma_fence *fence;
+		uint64_t guilty_context;
+
+		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+			found_guilty = true;
+			guilty_context = s_job->s_fence->scheduled.context;
+		}
+
+		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		spin_unlock(&sched->job_list_lock);
 		fence = sched->ops->run_job(s_job);
@@ -525,7 +534,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 					  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 		spin_lock(&sched->job_list_lock);
@@ -663,7 +671,6 @@ static int amd_sched_main(void *param)
 					  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 
-- 
2.7.4


* [PATCH 2/6] drm/amdgpu:implement new GPU recover(v3)
From: Monk Liu @ 2017-10-30  4:15 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

1,the new implementation is named amdgpu_gpu_recover, which gives a
better hint of what it does compared with gpu_reset

2,gpu_recover unifies bare-metal and SR-IOV; only the asic reset part
is implemented differently

3,gpu_recover increases the hang job's karma and marks its
entity/context as guilty if it exceeds the limit

V2:

4,in the scheduler main routine a job from a guilty context is
immediately fake signaled after it is popped from the queue, and its
fence error is set to -ECANCELED

5,in the scheduler recovery routine all jobs from the guilty entity
are dropped

6,in the run_job() routine the real IB submission is skipped if the
@skip parameter equals true or VRAM was lost

V3:

7,replace the deprecated gpu reset with the new gpu recover
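
Condensed from the hunks below, the new entry point just dispatches on
the environment and then acts on what the reset reported; here
recover_vram_from_shadows() is a hypothetical name standing in for the
inline shadow-recovery loop in the actual patch:

	uint64_t reset_flags = 0;

	if (amdgpu_sriov_vf(adev))
		r = amdgpu_reset_sriov(adev, &reset_flags, job ? false : true);
	else
		r = amdgpu_reset(adev, &reset_flags);

	if (!r && (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) &&
		    !(adev->flags & AMD_IS_APU)) ||
		   (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)))
		recover_vram_from_shadows(adev);	/* hypothetical helper */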

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 310 +++++++++++++----------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |   5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 -
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |   2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |   2 +-
 8 files changed, 151 insertions(+), 187 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ba1ab97..335df11 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -178,6 +178,10 @@ extern int amdgpu_cik_support;
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
 
+/* GPU RESET flags */
+#define AMDGPU_RESET_INFO_VRAM_LOST  (1 << 0)
+#define AMDGPU_RESET_INFO_FULLRESET  (1 << 1)
+
 struct amdgpu_device;
 struct amdgpu_ib;
 struct amdgpu_cs_parser;
@@ -1840,7 +1844,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
 
 /* Common functions */
-int amdgpu_gpu_reset(struct amdgpu_device *adev);
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
 bool amdgpu_need_backup(struct amdgpu_device *adev);
 void amdgpu_pci_config_reset(struct amdgpu_device *adev);
 bool amdgpu_need_post(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 400dfaa..7bccd45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2826,163 +2826,154 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
 	return r;
 }
 
-/**
- * amdgpu_sriov_gpu_reset - reset the asic
- *
- * @adev: amdgpu device pointer
- * @job: which job trigger hang
- *
- * Attempt the reset the GPU if it has hung (all asics).
- * for SRIOV case.
- * Returns 0 for success or an error on failure.
- */
-int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
+static int amdgpu_reset(struct amdgpu_device *adev, uint64_t* reset_flags)
 {
-	int i, j, r = 0;
-	int resched;
-	struct amdgpu_bo *bo, *tmp;
-	struct amdgpu_ring *ring;
-	struct dma_fence *fence = NULL, *next = NULL;
+	int r;
+	bool need_full_reset, vram_lost = 0;
 
-	mutex_lock(&adev->virt.lock_reset);
-	atomic_inc(&adev->gpu_reset_counter);
-	adev->in_sriov_reset = true;
+	need_full_reset = amdgpu_need_full_reset(adev);
 
-	/* block TTM */
-	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+	if (!need_full_reset) {
+		amdgpu_pre_soft_reset(adev);
+		r = amdgpu_soft_reset(adev);
+		amdgpu_post_soft_reset(adev);
+		if (r || amdgpu_check_soft_reset(adev)) {
+			DRM_INFO("soft reset failed, will fallback to full reset!\n");
+			need_full_reset = true;
+		}
 
-	/* we start from the ring trigger GPU hang */
-	j = job ? job->ring->idx : 0;
+	}
 
-	/* block scheduler */
-	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
-		ring = adev->rings[i % AMDGPU_MAX_RINGS];
-		if (!ring || !ring->sched.thread)
-			continue;
+	if (need_full_reset) {
+		r = amdgpu_suspend(adev);
 
-		kthread_park(ring->sched.thread);
+retry:
+		amdgpu_atombios_scratch_regs_save(adev);
+		r = amdgpu_asic_reset(adev);
+		amdgpu_atombios_scratch_regs_restore(adev);
+		/* post card */
+		amdgpu_atom_asic_init(adev->mode_info.atom_context);
 
-		if (job && j != i)
-			continue;
+		if (!r) {
+			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
+			r = amdgpu_resume_phase1(adev);
+			if (r)
+				goto out;
 
-		/* here give the last chance to check if job removed from mirror-list
-		 * since we already pay some time on kthread_park */
-		if (job && list_empty(&job->base.node)) {
-			kthread_unpark(ring->sched.thread);
-			goto give_up_reset;
+			vram_lost = amdgpu_check_vram_lost(adev);
+			if (vram_lost) {
+				DRM_ERROR("VRAM is lost!\n");
+				atomic_inc(&adev->vram_lost_counter);
+			}
+
+			r = amdgpu_ttm_recover_gart(adev);
+			if (r)
+				goto out;
+
+			r = amdgpu_resume_phase2(adev);
+			if (r)
+				goto out;
+
+			if (vram_lost)
+				amdgpu_fill_reset_magic(adev);
 		}
+	}
 
-		if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit))
-			amd_sched_job_kickout(&job->base);
+out:
+	if (!r) {
+		amdgpu_irq_gpu_reset_resume_helper(adev);
+		r = amdgpu_ib_ring_tests(adev);
+		if (r) {
+			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
+			r = amdgpu_suspend(adev);
+			need_full_reset = true;
+			goto retry;
+		}
+	}
 
-		/* only do job_reset on the hang ring if @job not NULL */
-		amd_sched_hw_job_reset(&ring->sched, NULL);
+	if (reset_flags) {
+		if (vram_lost)
+			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
 
-		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
-		amdgpu_fence_driver_force_completion(ring);
+		if (need_full_reset)
+			(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
 	}
 
-	/* request to take full control of GPU before re-initialization  */
-	if (job)
-		amdgpu_virt_reset_gpu(adev);
-	else
-		amdgpu_virt_request_full_gpu(adev, true);
+	return r;
+}
+
+static int amdgpu_reset_sriov(struct amdgpu_device *adev, uint64_t *reset_flags, bool from_hypervisor)
+{
+	int r;
 
+	if (from_hypervisor)
+		r = amdgpu_virt_request_full_gpu(adev, true);
+	else
+		r = amdgpu_virt_reset_gpu(adev);
+	if (r)
+		return r;
 
 	/* Resume IP prior to SMC */
-	amdgpu_sriov_reinit_early(adev);
+	r = amdgpu_sriov_reinit_early(adev);
+	if (r)
+		goto error;
 
 	/* we need recover gart prior to run SMC/CP/SDMA resume */
 	amdgpu_ttm_recover_gart(adev);
 
 	/* now we are okay to resume SMC/CP/SDMA */
-	amdgpu_sriov_reinit_late(adev);
+	r = amdgpu_sriov_reinit_late(adev);
+	if (r)
+		goto error;
 
 	amdgpu_irq_gpu_reset_resume_helper(adev);
-
-	if (amdgpu_ib_ring_tests(adev))
+	r = amdgpu_ib_ring_tests(adev);
+	if (r)
 		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
 
+error:
 	/* release full control of GPU after ib test */
 	amdgpu_virt_release_full_gpu(adev, true);
 
-	DRM_INFO("recover vram bo from shadow\n");
-
-	ring = adev->mman.buffer_funcs_ring;
-	mutex_lock(&adev->shadow_list_lock);
-	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
-		next = NULL;
-		amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
-		if (fence) {
-			r = dma_fence_wait(fence, false);
-			if (r) {
-				WARN(r, "recovery from shadow isn't completed\n");
-				break;
-			}
-		}
-
-		dma_fence_put(fence);
-		fence = next;
-	}
-	mutex_unlock(&adev->shadow_list_lock);
-
-	if (fence) {
-		r = dma_fence_wait(fence, false);
-		if (r)
-			WARN(r, "recovery from shadow isn't completed\n");
-	}
-	dma_fence_put(fence);
-
-	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
-		ring = adev->rings[i % AMDGPU_MAX_RINGS];
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		if (job && j != i) {
-			kthread_unpark(ring->sched.thread);
-			continue;
-		}
-
-		amd_sched_job_recovery(&ring->sched);
-		kthread_unpark(ring->sched.thread);
-	}
+	if (reset_flags) {
+		/* will get vram_lost from GIM in future, now all
+		 * reset request considered VRAM LOST
+		 */
+		(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
+		atomic_inc(&adev->vram_lost_counter);
 
-	drm_helper_resume_force_mode(adev->ddev);
-give_up_reset:
-	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
-	if (r) {
-		/* bad news, how to tell it to userspace ? */
-		dev_info(adev->dev, "GPU reset failed\n");
-	} else {
-		dev_info(adev->dev, "GPU reset successed!\n");
+		/* VF FLR or hotlink reset is always full-reset */
+		(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
 	}
 
-	adev->in_sriov_reset = false;
-	mutex_unlock(&adev->virt.lock_reset);
 	return r;
 }
 
 /**
- * amdgpu_gpu_reset - reset the asic
+ * amdgpu_gpu_recover - reset the asic and recover scheduler
  *
  * @adev: amdgpu device pointer
+ * @job: which job trigger hang
  *
- * Attempt the reset the GPU if it has hung (all asics).
+ * Attempt to reset the GPU if it has hung (all asics).
  * Returns 0 for success or an error on failure.
  */
-int amdgpu_gpu_reset(struct amdgpu_device *adev)
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
 {
 	struct drm_atomic_state *state = NULL;
-	int i, r;
-	int resched;
-	bool need_full_reset, vram_lost = false;
+	uint64_t reset_flags = 0;
+	int i, r, resched;
 
 	if (!amdgpu_check_soft_reset(adev)) {
 		DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
 		return 0;
 	}
 
+	dev_info(adev->dev, "GPU reset begin!\n");
+
+	mutex_lock(&adev->virt.lock_reset);
 	atomic_inc(&adev->gpu_reset_counter);
+	adev->in_sriov_reset = 1;
 
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
@@ -2996,69 +2987,26 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 
 		if (!ring || !ring->sched.thread)
 			continue;
+
+		/* only focus on the ring that hit the timeout if @job is not NULL */
+		if (job && job->ring->idx != i)
+			continue;
+
 		kthread_park(ring->sched.thread);
-		amd_sched_hw_job_reset(&ring->sched, NULL);
+		amd_sched_hw_job_reset(&ring->sched, &job->base);
+
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
 	}
 
-	need_full_reset = amdgpu_need_full_reset(adev);
-
-	if (!need_full_reset) {
-		amdgpu_pre_soft_reset(adev);
-		r = amdgpu_soft_reset(adev);
-		amdgpu_post_soft_reset(adev);
-		if (r || amdgpu_check_soft_reset(adev)) {
-			DRM_INFO("soft reset failed, will fallback to full reset!\n");
-			need_full_reset = true;
-		}
-	}
-
-	if (need_full_reset) {
-		r = amdgpu_suspend(adev);
-
-retry:
-		amdgpu_atombios_scratch_regs_save(adev);
-		r = amdgpu_asic_reset(adev);
-		amdgpu_atombios_scratch_regs_restore(adev);
-		/* post card */
-		amdgpu_atom_asic_init(adev->mode_info.atom_context);
+	if (amdgpu_sriov_vf(adev))
+		r = amdgpu_reset_sriov(adev, &reset_flags, job ? false : true);
+	else
+		r = amdgpu_reset(adev, &reset_flags);
 
-		if (!r) {
-			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
-			r = amdgpu_resume_phase1(adev);
-			if (r)
-				goto out;
-			vram_lost = amdgpu_check_vram_lost(adev);
-			if (vram_lost) {
-				DRM_ERROR("VRAM is lost!\n");
-				atomic_inc(&adev->vram_lost_counter);
-			}
-			r = amdgpu_ttm_recover_gart(adev);
-			if (r)
-				goto out;
-			r = amdgpu_resume_phase2(adev);
-			if (r)
-				goto out;
-			if (vram_lost)
-				amdgpu_fill_reset_magic(adev);
-		}
-	}
-out:
 	if (!r) {
-		amdgpu_irq_gpu_reset_resume_helper(adev);
-		r = amdgpu_ib_ring_tests(adev);
-		if (r) {
-			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
-			r = amdgpu_suspend(adev);
-			need_full_reset = true;
-			goto retry;
-		}
-		/**
-		 * recovery vm page tables, since we cannot depend on VRAM is
-		 * consistent after gpu full reset.
-		 */
-		if (need_full_reset && amdgpu_need_backup(adev)) {
+		if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
+			(reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
 			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
 			struct amdgpu_bo *bo, *tmp;
 			struct dma_fence *fence = NULL, *next = NULL;
@@ -3087,40 +3035,56 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 			}
 			dma_fence_put(fence);
 		}
+
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = adev->rings[i];
 
 			if (!ring || !ring->sched.thread)
 				continue;
 
+			/* only focus on the ring that hit the timeout if @job is not NULL */
+			if (job && job->ring->idx != i)
+				continue;
+
 			amd_sched_job_recovery(&ring->sched);
 			kthread_unpark(ring->sched.thread);
 		}
 	} else {
-		dev_err(adev->dev, "asic resume failed (%d).\n", r);
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-			if (adev->rings[i] && adev->rings[i]->sched.thread) {
-				kthread_unpark(adev->rings[i]->sched.thread);
-			}
+			struct amdgpu_ring *ring = adev->rings[i];
+
+			if (!ring || !ring->sched.thread)
+				continue;
+
+			/* only focus on the ring that hit the timeout if @job is not NULL */
+			if (job && job->ring->idx != i)
+				continue;
+
+			kthread_unpark(adev->rings[i]->sched.thread);
 		}
 	}
 
 	if (amdgpu_device_has_dc_support(adev)) {
-		r = drm_atomic_helper_resume(adev->ddev, state);
+		if (drm_atomic_helper_resume(adev->ddev, state))
+			dev_info(adev->dev, "drm resume failed:%d\n", r);
 		amdgpu_dm_display_resume(adev);
-	} else
+	} else {
 		drm_helper_resume_force_mode(adev->ddev);
+	}
 
 	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+
 	if (r) {
 		/* bad news, how to tell it to userspace ? */
-		dev_info(adev->dev, "GPU reset failed\n");
-	}
-	else {
-		dev_info(adev->dev, "GPU reset successed!\n");
+		dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+	} else {
+		dev_info(adev->dev, "GPU reset(%d) successed!\n",atomic_read(&adev->gpu_reset_counter));
 	}
 
 	amdgpu_vf_error_trans_all(adev);
+	adev->in_sriov_reset = 0;
+	mutex_unlock(&adev->virt.lock_reset);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 80ee1c1..d0e5aeb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -694,25 +694,25 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
 }
 
 /**
- * amdgpu_debugfs_gpu_reset - manually trigger a gpu reset
+ * amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover
  *
  * Manually trigger a gpu reset at the next fence wait.
  */
-static int amdgpu_debugfs_gpu_reset(struct seq_file *m, void *data)
+static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
 {
 	struct drm_info_node *node = (struct drm_info_node *) m->private;
 	struct drm_device *dev = node->minor->dev;
 	struct amdgpu_device *adev = dev->dev_private;
 
-	seq_printf(m, "gpu reset\n");
-	amdgpu_gpu_reset(adev);
+	seq_printf(m, "gpu recover\n");
+	amdgpu_gpu_recover(adev, NULL);
 
 	return 0;
 }
 
 static const struct drm_info_list amdgpu_debugfs_fence_list[] = {
 	{"amdgpu_fence_info", &amdgpu_debugfs_fence_info, 0, NULL},
-	{"amdgpu_gpu_reset", &amdgpu_debugfs_gpu_reset, 0, NULL}
+	{"amdgpu_gpu_recover", &amdgpu_debugfs_gpu_recover, 0, NULL}
 };
 
 static const struct drm_info_list amdgpu_debugfs_fence_list_sriov[] = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 32590e4..c340774 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
 						  reset_work);
 
 	if (!amdgpu_sriov_vf(adev))
-		amdgpu_gpu_reset(adev);
+		amdgpu_gpu_recover(adev, NULL);
 }
 
 /* Disable *all* interrupts */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 0a90c76..18770a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,10 +37,7 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
 		  atomic_read(&job->ring->fence_drv.last_seq),
 		  job->ring->fence_drv.sync_seq);
 
-	if (amdgpu_sriov_vf(job->adev))
-		amdgpu_sriov_gpu_reset(job->adev, job);
-	else
-		amdgpu_gpu_reset(job->adev);
+	amdgpu_gpu_recover(job->adev, job);
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index d149aca..20bdb8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -288,7 +288,6 @@ int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
 int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
-int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job);
 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
 int amdgpu_virt_fw_reserve_get_checksum(void *obj, unsigned long obj_size,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f91aab3..c32d0b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -254,7 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	}
 
 	/* Trigger recovery due to world switch failure */
-	amdgpu_sriov_gpu_reset(adev, NULL);
+	amdgpu_gpu_recover(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 27b03c7..818ec0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -519,7 +519,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 	}
 
 	/* Trigger recovery due to world switch failure */
-	amdgpu_sriov_gpu_reset(adev, NULL);
+	amdgpu_gpu_recover(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
-- 
2.7.4


* [PATCH 3/6] drm/amdgpu:cleanup in_sriov_reset and lock_reset
From: Monk Liu @ 2017-10-30  4:15 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

Since gpu reset is now unified in gpu_recover for both
bare-metal and SR-IOV:

1) rename in_sriov_reset to in_gpu_reset
2) move lock_reset from adev->virt to adev
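
The recover path then brackets the whole operation with the new
device-level state (as in the amdgpu_device.c hunk below):

	mutex_lock(&adev->lock_reset);
	atomic_inc(&adev->gpu_reset_counter);
	adev->in_gpu_reset = 1;

	/* ... reset and recovery ... */

	adev->in_gpu_reset = 0;
	mutex_unlock(&adev->lock_reset);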

Change-Id: I9f4dbab9a4c916fbc156f669824d15ddcd0f2322
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   | 1 -
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c      | 6 +++---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 6 +++---
 8 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 335df11..6e89be5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1650,7 +1650,8 @@ struct amdgpu_device {
 
 	/* record last mm index being written through WREG32*/
 	unsigned long last_mm_index;
-	bool                            in_sriov_reset;
+	bool                            in_gpu_reset;
+	struct mutex  lock_reset;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7bccd45..a144578 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2161,6 +2161,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->mn_lock);
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
+	mutex_init(&adev->lock_reset);
 
 	amdgpu_check_arguments(adev);
 
@@ -2971,9 +2972,9 @@ int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
 
 	dev_info(adev->dev, "GPU reset begin!\n");
 
-	mutex_lock(&adev->virt.lock_reset);
+	mutex_lock(&adev->lock_reset);
 	atomic_inc(&adev->gpu_reset_counter);
-	adev->in_sriov_reset = 1;
+	adev->in_gpu_reset = 1;
 
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
@@ -3083,8 +3084,8 @@ int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
 	}
 
 	amdgpu_vf_error_trans_all(adev);
-	adev->in_sriov_reset = 0;
-	mutex_unlock(&adev->virt.lock_reset);
+	adev->in_gpu_reset = 0;
+	mutex_unlock(&adev->lock_reset);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 447d446..76f531b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -264,7 +264,7 @@ static int psp_hw_start(struct psp_context *psp)
 	struct amdgpu_device *adev = psp->adev;
 	int ret;
 
-	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
+	if (!amdgpu_sriov_vf(adev) || !adev->in_gpu_reset) {
 		ret = psp_bootloader_load_sysdrv(psp);
 		if (ret)
 			return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
index 6564902..edc37cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
@@ -370,7 +370,7 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
 		return 0;
 	}
 
-	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
+	if (!amdgpu_sriov_vf(adev) || !adev->in_gpu_reset) {
 		err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
 					amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
 					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index fee08af..f791518 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -115,8 +115,6 @@ void amdgpu_virt_init_setting(struct amdgpu_device *adev)
 	adev->enable_virtual_display = true;
 	adev->cg_flags = 0;
 	adev->pg_flags = 0;
-
-	mutex_init(&adev->virt.lock_reset);
 }
 
 uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 20bdb8f..e3f78f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -239,7 +239,6 @@ struct amdgpu_virt {
 	uint64_t			csa_vmid0_addr;
 	bool chained_ib_support;
 	uint32_t			reg_val_offs;
-	struct mutex                    lock_reset;
 	struct amdgpu_irq_src		ack_irq;
 	struct amdgpu_irq_src		rcv_irq;
 	struct work_struct		flr_work;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index e0b7876..a74515a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -4815,7 +4815,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
 
 	gfx_v8_0_kiq_setting(ring);
 
-	if (adev->in_sriov_reset) { /* for GPU_RESET case */
+	if (adev->in_gpu_reset) { /* for GPU_RESET case */
 		/* reset MQD to a clean status */
 		if (adev->gfx.mec.mqd_backup[mqd_idx])
 			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
@@ -4852,7 +4852,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
 	struct vi_mqd *mqd = ring->mqd_ptr;
 	int mqd_idx = ring - &adev->gfx.compute_ring[0];
 
-	if (!adev->in_sriov_reset && !adev->gfx.in_suspend) {
+	if (!adev->in_gpu_reset && !adev->gfx.in_suspend) {
 		memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
 		((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
 		((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
@@ -4864,7 +4864,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
 
 		if (adev->gfx.mec.mqd_backup[mqd_idx])
 			memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation));
-	} else if (adev->in_sriov_reset) { /* for GPU_RESET case */
+	} else if (adev->in_gpu_reset) { /* for GPU_RESET case */
 		/* reset MQD to a clean status */
 		if (adev->gfx.mec.mqd_backup[mqd_idx])
 			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 39b02e9..9855dc0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2740,7 +2740,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
 
 	gfx_v9_0_kiq_setting(ring);
 
-	if (adev->in_sriov_reset) { /* for GPU_RESET case */
+	if (adev->in_gpu_reset) { /* for GPU_RESET case */
 		/* reset MQD to a clean status */
 		if (adev->gfx.mec.mqd_backup[mqd_idx])
 			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
@@ -2778,7 +2778,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
 	struct v9_mqd *mqd = ring->mqd_ptr;
 	int mqd_idx = ring - &adev->gfx.compute_ring[0];
 
-	if (!adev->in_sriov_reset && !adev->gfx.in_suspend) {
+	if (!adev->in_gpu_reset && !adev->gfx.in_suspend) {
 		memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
 		((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
 		((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
@@ -2790,7 +2790,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
 
 		if (adev->gfx.mec.mqd_backup[mqd_idx])
 			memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation));
-	} else if (adev->in_sriov_reset) { /* for GPU_RESET case */
+	} else if (adev->in_gpu_reset) { /* for GPU_RESET case */
 		/* reset MQD to a clean status */
 		if (adev->gfx.mec.mqd_backup[mqd_idx])
 			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
-- 
2.7.4


* [PATCH 4/6] drm/amdgpu:cleanup ucode_init_bo
From: Monk Liu @ 2017-10-30  4:15 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

1,drop the sriov check since gpu recover is now unified
2,the CPU_ACCESS_REQUIRED flag is needed for VRAM under SRIOV,
because otherwise, after the following pin, the first allocated
VRAM bo is wasted for a TTM manager-related reason.

Change-Id: I4d029f2da8bb463942c7861d3e52f309bdba9576
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
index edc37cc..ab9b2d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
@@ -370,10 +370,10 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
 		return 0;
 	}
 
-	if (!amdgpu_sriov_vf(adev) || !adev->in_gpu_reset) {
+	if (!adev->in_gpu_reset) {
 		err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
 					amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
-					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
+					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS|AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
 					NULL, NULL, 0, bo);
 		if (err) {
 			dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", err);
-- 
2.7.4


* [PATCH 5/6] drm/amdgpu/sriov:fix memory leak in psp_load_fw
From: Monk Liu @ 2017-10-30  4:15 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

For SR-IOV, this routine shouldn't allocate resources again when it
runs during a gpu reset, otherwise the earlier allocations are leaked.
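
The shape of the fix (condensed from the hunk below): allocations are
only made on first load, while a reset jumps straight to hardware
start and reuses the buffers that are still around:

	if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset != 0)
		goto skip_memalloc;

	/* first load only: psp->cmd kzalloc + fw_pri/fence bo creation */

skip_memalloc:
	ret = psp_hw_start(psp);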

Change-Id: I25da3a5b475196c75c7e639adc40751754625968
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 76f531b..2157d45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -334,23 +334,26 @@ static int psp_load_fw(struct amdgpu_device *adev)
 	int ret;
 	struct psp_context *psp = &adev->psp;
 
+	if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset != 0)
+		goto skip_memalloc;
+
 	psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
 	if (!psp->cmd)
 		return -ENOMEM;
 
 	ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
-				      AMDGPU_GEM_DOMAIN_GTT,
-				      &psp->fw_pri_bo,
-				      &psp->fw_pri_mc_addr,
-				      &psp->fw_pri_buf);
+					AMDGPU_GEM_DOMAIN_GTT,
+					&psp->fw_pri_bo,
+					&psp->fw_pri_mc_addr,
+					&psp->fw_pri_buf);
 	if (ret)
 		goto failed;
 
 	ret = amdgpu_bo_create_kernel(adev, PSP_FENCE_BUFFER_SIZE, PAGE_SIZE,
-				      AMDGPU_GEM_DOMAIN_VRAM,
-				      &psp->fence_buf_bo,
-				      &psp->fence_buf_mc_addr,
-				      &psp->fence_buf);
+					AMDGPU_GEM_DOMAIN_VRAM,
+					&psp->fence_buf_bo,
+					&psp->fence_buf_mc_addr,
+					&psp->fence_buf);
 	if (ret)
 		goto failed_mem2;
 
@@ -375,6 +378,7 @@ static int psp_load_fw(struct amdgpu_device *adev)
 	if (ret)
 		goto failed_mem;
 
+skip_memalloc:
 	ret = psp_hw_start(psp);
 	if (ret)
 		goto failed_mem;
-- 
2.7.4


* [PATCH 6/6] drm/amdgpu:fix random missing of FLR NOTIFY
From: Monk Liu @ 2017-10-30  4:15 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

The FLR notification interrupt can be delayed on its way into the VM;
in that case IDH_FLR_NOTIFICATION is overwritten by the VF FLR from
the GIM side and the mailbox receive fails, so the FLR work must be
scheduled anyway.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index c32d0b0..d31259e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -282,9 +282,17 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
 		/* see what event we get */
 		r = xgpu_ai_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
-		/* only handle FLR_NOTIFY now */
-		if (!r)
-			schedule_work(&adev->virt.flr_work);
+		/* sometimes injection of the interrupt into the VM is delayed;
+		 * in that case IDH_FLR_NOTIFICATION is overwritten by the VF
+		 * FLR from the GIM side and the receive above fails, so we
+		 * should schedule the flr_work anyway
+		 */
+		if (r) {
+			DRM_ERROR("FLR_NOTIFICATION is missed\n");
+			xgpu_ai_mailbox_send_ack(adev);
+		}
+
+		schedule_work(&adev->virt.flr_work);
 	}
 
 	return 0;
-- 
2.7.4


* Re: [PATCH 1/6] amd/scheduler:implement job skip feature(v3)
From: Christian König @ 2017-10-30 10:00 UTC
  To: Monk Liu, amd-gfx@lists.freedesktop.org

Am 30.10.2017 um 05:15 schrieb Monk Liu:
> Jobs are skipped in two cases:
> 1) when the entity behind the job is marked guilty, the job popped
> from that entity's queue is dropped in the sched_main loop.
>
> 2) in job_recovery(), skip scheduling a job if its karma exceeds the
> limit, and likewise skip all other jobs sharing the same fence
> context. This approach is needed because job_recovery() cannot access
> job->entity, since the entity may already be dead.
>
> v2:
> some logic fixes
>
> v3:
> when an entity is detected as guilty, don't drop its job at the
> popping stage; instead set the job's fence error to -ECANCELED
>
> in run_job(), skip scheduling if either: 1) fence->error < 0,
> or 2) VRAM was lost while this job was pending.
> This way the job-skipping logic is unified.
>
> With this feature in place we can introduce the new gpu recover
> feature.
>
> Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 13 +++++----
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 ++++++++++++++++-----------
>   2 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index f60662e..0a90c76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
>   
>   static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   {
> -	struct dma_fence *fence = NULL;
> +	struct dma_fence *fence = NULL, *finished;
>   	struct amdgpu_device *adev;
>   	struct amdgpu_job *job;
>   	int r;
> @@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   		return NULL;
>   	}
>   	job = to_amdgpu_job(sched_job);
> +	finished = &job->base.s_fence->finished;
>   	adev = job->adev;
>   
>   	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
>   
>   	trace_amdgpu_sched_run_job(job);
> -	/* skip ib schedule when vram is lost */
> -	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
> -		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
> -		DRM_ERROR("Skip scheduling IBs!\n");
> +
> +	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
> +		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
> +
> +	if (finished->error < 0) {
> +		DRM_INFO("Skip scheduling IBs!\n");
>   	} else {
>   		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
>   				       &fence);
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 903ef8b..3d8c994 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>   		if (amd_sched_entity_add_dependency_cb(entity))
>   			return NULL;
>   
> +	/* skip jobs from entity that marked guilty */
> +	if (entity->guilty && atomic_read(entity->guilty))
> +		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
> +
>   	spsc_queue_pop(&entity->job_queue);
>   	return sched_job;
>   }
> @@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
>   	job->sched->ops->timedout_job(job);
>   }
>   
> -static void amd_sched_set_guilty(struct amd_sched_job *s_job,
> -				 struct amd_sched_entity *s_entity)
> -{
> -	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
> -		if (s_entity->guilty)
> -			atomic_set(s_entity->guilty, 1);
> -}
> -
>   void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
>   {
>   	struct amd_sched_job *s_job;
> @@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
>   	spin_unlock(&sched->job_list_lock);
>   
>   	if (bad) {
> -		bool found = false;
> -
> -		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
> +		/* don't increase @bad's karma if it's from KERNEL RQ,
> +		 * because sometimes a GPU hang corrupts kernel jobs (like VM
> +		 * updating jobs), but kernel jobs are always considered good.
> +		 */
> +		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
>   			struct amd_sched_rq *rq = &sched->sched_rq[i];
>   
>   			spin_lock(&rq->lock);
>   			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>   				if (bad->s_fence->scheduled.context == entity->fence_context) {
> -					found = true;
> -					amd_sched_set_guilty(bad, entity);
> +				    if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
> +						if (entity->guilty)
> +							atomic_set(entity->guilty, 1);
>   					break;
>   				}
>   			}
>   			spin_unlock(&rq->lock);
> -			if (found)
> +			if (&entity->list == &rq->entities)

That needs to be "&entity->list != &rq->entities", or otherwise we only 
check on the first round.

With that fixed the patch is Reviewed-by: Christian König 
<christian.koenig@amd.com>.

Nice work,
Christian.

>   				break;
>   		}
>   	}
> @@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
>   void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   {
>   	struct amd_sched_job *s_job, *tmp;
> +	bool found_guilty = false;
>   	int r;
>   
>   	spin_lock(&sched->job_list_lock);
> @@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>   		struct amd_sched_fence *s_fence = s_job->s_fence;
>   		struct dma_fence *fence;
> +		uint64_t guilty_context;
> +
> +		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
> +			found_guilty = true;
> +			guilty_context = s_job->s_fence->scheduled.context;
> +		}
> +
> +		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
> +			dma_fence_set_error(&s_fence->finished, -ECANCELED);
>   
>   		spin_unlock(&sched->job_list_lock);
>   		fence = sched->ops->run_job(s_job);
> @@ -525,7 +534,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   					  r);
>   			dma_fence_put(fence);
>   		} else {
> -			DRM_ERROR("Failed to run job!\n");
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   		spin_lock(&sched->job_list_lock);
> @@ -663,7 +671,6 @@ static int amd_sched_main(void *param)
>   					  r);
>   			dma_fence_put(fence);
>   		} else {
> -			DRM_ERROR("Failed to run job!\n");
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   



* Re: [PATCH 2/6] drm/amdgpu:implement new GPU recover(v3)
From: Christian König @ 2017-10-30 10:06 UTC
  To: Monk Liu, amd-gfx@lists.freedesktop.org

Am 30.10.2017 um 05:15 schrieb Monk Liu:
> 1,the new implementation is named amdgpu_gpu_recover, which gives a
> better hint of what it does compared with gpu_reset
>
> 2,gpu_recover unifies bare-metal and SR-IOV; only the asic reset part
> is implemented differently
>
> 3,gpu_recover increases the hang job's karma and marks its
> entity/context as guilty if it exceeds the limit
>
> V2:
>
> 4,in the scheduler main routine a job from a guilty context is
> immediately fake signaled after it is popped from the queue, and its
> fence error is set to -ECANCELED
>
> 5,in the scheduler recovery routine all jobs from the guilty entity
> are dropped
>
> 6,in the run_job() routine the real IB submission is skipped if the
> @skip parameter equals true or VRAM was lost
>
> V3:
>
> 7,replace the deprecated gpu reset with the new gpu recover
>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   6 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 310 +++++++++++++----------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  10 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |   5 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 -
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |   2 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |   2 +-
>   8 files changed, 151 insertions(+), 187 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index ba1ab97..335df11 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -178,6 +178,10 @@ extern int amdgpu_cik_support;
>   #define CIK_CURSOR_WIDTH 128
>   #define CIK_CURSOR_HEIGHT 128
>   
> +/* GPU RESET flags */
> +#define AMDGPU_RESET_INFO_VRAM_LOST  (1 << 0)
> +#define AMDGPU_RESET_INFO_FULLRESET  (1 << 1)
> +
>   struct amdgpu_device;
>   struct amdgpu_ib;
>   struct amdgpu_cs_parser;
> @@ -1840,7 +1844,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   
>   /* Common functions */
> -int amdgpu_gpu_reset(struct amdgpu_device *adev);
> +int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>   void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_post(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 400dfaa..7bccd45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2826,163 +2826,154 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> -/**
> - * amdgpu_sriov_gpu_reset - reset the asic
> - *
> - * @adev: amdgpu device pointer
> - * @job: which job trigger hang
> - *
> - * Attempt the reset the GPU if it has hung (all asics).
> - * for SRIOV case.
> - * Returns 0 for success or an error on failure.
> - */
> -int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
> +static int amdgpu_reset(struct amdgpu_device *adev, uint64_t* reset_flags)

Keeping a description here of what the function does exactly would
be nice to have.

>   {
> -	int i, j, r = 0;
> -	int resched;
> -	struct amdgpu_bo *bo, *tmp;
> -	struct amdgpu_ring *ring;
> -	struct dma_fence *fence = NULL, *next = NULL;
> +	int r;
> +	bool need_full_reset, vram_lost = 0;

Style nit pick: reverse tree order coding style please.

Apart from that the patch looks good to me and is Reviewed-by: Christian 
König <christian.koenig@amd.com>

Regards,
Christian.

>
> -	mutex_lock(&adev->virt.lock_reset);
> -	atomic_inc(&adev->gpu_reset_counter);
> -	adev->in_sriov_reset = true;
> +	need_full_reset = amdgpu_need_full_reset(adev);
>   
> -	/* block TTM */
> -	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> +	if (!need_full_reset) {
> +		amdgpu_pre_soft_reset(adev);
> +		r = amdgpu_soft_reset(adev);
> +		amdgpu_post_soft_reset(adev);
> +		if (r || amdgpu_check_soft_reset(adev)) {
> +			DRM_INFO("soft reset failed, will fallback to full reset!\n");
> +			need_full_reset = true;
> +		}
>   
> -	/* we start from the ring trigger GPU hang */
> -	j = job ? job->ring->idx : 0;
> +	}
>   
> -	/* block scheduler */
> -	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
> -		ring = adev->rings[i % AMDGPU_MAX_RINGS];
> -		if (!ring || !ring->sched.thread)
> -			continue;
> +	if (need_full_reset) {
> +		r = amdgpu_suspend(adev);
>   
> -		kthread_park(ring->sched.thread);
> +retry:
> +		amdgpu_atombios_scratch_regs_save(adev);
> +		r = amdgpu_asic_reset(adev);
> +		amdgpu_atombios_scratch_regs_restore(adev);
> +		/* post card */
> +		amdgpu_atom_asic_init(adev->mode_info.atom_context);
>   
> -		if (job && j != i)
> -			continue;
> +		if (!r) {
> +			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
> +			r = amdgpu_resume_phase1(adev);
> +			if (r)
> +				goto out;
>   
> -		/* here give the last chance to check if job removed from mirror-list
> -		 * since we already pay some time on kthread_park */
> -		if (job && list_empty(&job->base.node)) {
> -			kthread_unpark(ring->sched.thread);
> -			goto give_up_reset;
> +			vram_lost = amdgpu_check_vram_lost(adev);
> +			if (vram_lost) {
> +				DRM_ERROR("VRAM is lost!\n");
> +				atomic_inc(&adev->vram_lost_counter);
> +			}
> +
> +			r = amdgpu_ttm_recover_gart(adev);
> +			if (r)
> +				goto out;
> +
> +			r = amdgpu_resume_phase2(adev);
> +			if (r)
> +				goto out;
> +
> +			if (vram_lost)
> +				amdgpu_fill_reset_magic(adev);
>   		}
> +	}
>   
> -		if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit))
> -			amd_sched_job_kickout(&job->base);
> +out:
> +	if (!r) {
> +		amdgpu_irq_gpu_reset_resume_helper(adev);
> +		r = amdgpu_ib_ring_tests(adev);
> +		if (r) {
> +			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
> +			r = amdgpu_suspend(adev);
> +			need_full_reset = true;
> +			goto retry;
> +		}
> +	}
>   
> -		/* only do job_reset on the hang ring if @job not NULL */
> -		amd_sched_hw_job_reset(&ring->sched, NULL);
> +	if (reset_flags) {
> +		if (vram_lost)
> +			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
>   
> -		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
> -		amdgpu_fence_driver_force_completion(ring);
> +		if (need_full_reset)
> +			(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
>   	}
>   
> -	/* request to take full control of GPU before re-initialization  */
> -	if (job)
> -		amdgpu_virt_reset_gpu(adev);
> -	else
> -		amdgpu_virt_request_full_gpu(adev, true);
> +	return r;
> +}
> +
> +static int amdgpu_reset_sriov(struct amdgpu_device *adev, uint64_t *reset_flags, bool from_hypervisor)
> +{
> +	int r;
>   
> +	if (from_hypervisor)
> +		r = amdgpu_virt_request_full_gpu(adev, true);
> +	else
> +		r = amdgpu_virt_reset_gpu(adev);
> +	if (r)
> +		return r;
>   
>   	/* Resume IP prior to SMC */
> -	amdgpu_sriov_reinit_early(adev);
> +	r = amdgpu_sriov_reinit_early(adev);
> +	if (r)
> +		goto error;
>   
>   	/* we need recover gart prior to run SMC/CP/SDMA resume */
>   	amdgpu_ttm_recover_gart(adev);
>   
>   	/* now we are okay to resume SMC/CP/SDMA */
> -	amdgpu_sriov_reinit_late(adev);
> +	r = amdgpu_sriov_reinit_late(adev);
> +	if (r)
> +		goto error;
>   
>   	amdgpu_irq_gpu_reset_resume_helper(adev);
> -
> -	if (amdgpu_ib_ring_tests(adev))
> +	r = amdgpu_ib_ring_tests(adev);
> +	if (r)
>   		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
>   
> +error:
>   	/* release full control of GPU after ib test */
>   	amdgpu_virt_release_full_gpu(adev, true);
>   
> -	DRM_INFO("recover vram bo from shadow\n");
> -
> -	ring = adev->mman.buffer_funcs_ring;
> -	mutex_lock(&adev->shadow_list_lock);
> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> -		next = NULL;
> -		amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
> -		if (fence) {
> -			r = dma_fence_wait(fence, false);
> -			if (r) {
> -				WARN(r, "recovery from shadow isn't completed\n");
> -				break;
> -			}
> -		}
> -
> -		dma_fence_put(fence);
> -		fence = next;
> -	}
> -	mutex_unlock(&adev->shadow_list_lock);
> -
> -	if (fence) {
> -		r = dma_fence_wait(fence, false);
> -		if (r)
> -			WARN(r, "recovery from shadow isn't completed\n");
> -	}
> -	dma_fence_put(fence);
> -
> -	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
> -		ring = adev->rings[i % AMDGPU_MAX_RINGS];
> -		if (!ring || !ring->sched.thread)
> -			continue;
> -
> -		if (job && j != i) {
> -			kthread_unpark(ring->sched.thread);
> -			continue;
> -		}
> -
> -		amd_sched_job_recovery(&ring->sched);
> -		kthread_unpark(ring->sched.thread);
> -	}
> +	if (reset_flags) {
> +		/* we will get vram_lost from the GIM in the future; for now
> +		 * every reset request is considered a VRAM LOST
> +		 */
> +		(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
> +		atomic_inc(&adev->vram_lost_counter);
>   
> -	drm_helper_resume_force_mode(adev->ddev);
> -give_up_reset:
> -	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
> -	if (r) {
> -		/* bad news, how to tell it to userspace ? */
> -		dev_info(adev->dev, "GPU reset failed\n");
> -	} else {
> -		dev_info(adev->dev, "GPU reset successed!\n");
> +		/* VF FLR or hotlink reset is always full-reset */
> +		(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
>   	}
>   
> -	adev->in_sriov_reset = false;
> -	mutex_unlock(&adev->virt.lock_reset);
>   	return r;
>   }
>   
>   /**
> - * amdgpu_gpu_reset - reset the asic
> + * amdgpu_gpu_recover - reset the asic and recover scheduler
>    *
>    * @adev: amdgpu device pointer
> + * @job: which job trigger hang
>    *
> - * Attempt the reset the GPU if it has hung (all asics).
> + * Attempt to reset the GPU if it has hung (all asics).
>    * Returns 0 for success or an error on failure.
>    */
> -int amdgpu_gpu_reset(struct amdgpu_device *adev)
> +int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
>   {
>   	struct drm_atomic_state *state = NULL;
> -	int i, r;
> -	int resched;
> -	bool need_full_reset, vram_lost = false;
> +	uint64_t reset_flags = 0;
> +	int i, r, resched;
>   
>   	if (!amdgpu_check_soft_reset(adev)) {
>   		DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
>   		return 0;
>   	}
>   
> +	dev_info(adev->dev, "GPU reset begin!\n");
> +
> +	mutex_lock(&adev->virt.lock_reset);
>   	atomic_inc(&adev->gpu_reset_counter);
> +	adev->in_sriov_reset = 1;
>   
>   	/* block TTM */
>   	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> @@ -2996,69 +2987,26 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   
>   		if (!ring || !ring->sched.thread)
>   			continue;
> +
> +		/* only focus on the ring that hit the timeout if @job is not NULL */
> +		if (job && job->ring->idx != i)
> +			continue;
> +
>   		kthread_park(ring->sched.thread);
> -		amd_sched_hw_job_reset(&ring->sched, NULL);
> +		amd_sched_hw_job_reset(&ring->sched, &job->base);
> +
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
>   
> -	need_full_reset = amdgpu_need_full_reset(adev);
> -
> -	if (!need_full_reset) {
> -		amdgpu_pre_soft_reset(adev);
> -		r = amdgpu_soft_reset(adev);
> -		amdgpu_post_soft_reset(adev);
> -		if (r || amdgpu_check_soft_reset(adev)) {
> -			DRM_INFO("soft reset failed, will fallback to full reset!\n");
> -			need_full_reset = true;
> -		}
> -	}
> -
> -	if (need_full_reset) {
> -		r = amdgpu_suspend(adev);
> -
> -retry:
> -		amdgpu_atombios_scratch_regs_save(adev);
> -		r = amdgpu_asic_reset(adev);
> -		amdgpu_atombios_scratch_regs_restore(adev);
> -		/* post card */
> -		amdgpu_atom_asic_init(adev->mode_info.atom_context);
> +	if (amdgpu_sriov_vf(adev))
> +		r = amdgpu_reset_sriov(adev, &reset_flags, job ? false : true);
> +	else
> +		r = amdgpu_reset(adev, &reset_flags);
>   
> -		if (!r) {
> -			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
> -			r = amdgpu_resume_phase1(adev);
> -			if (r)
> -				goto out;
> -			vram_lost = amdgpu_check_vram_lost(adev);
> -			if (vram_lost) {
> -				DRM_ERROR("VRAM is lost!\n");
> -				atomic_inc(&adev->vram_lost_counter);
> -			}
> -			r = amdgpu_ttm_recover_gart(adev);
> -			if (r)
> -				goto out;
> -			r = amdgpu_resume_phase2(adev);
> -			if (r)
> -				goto out;
> -			if (vram_lost)
> -				amdgpu_fill_reset_magic(adev);
> -		}
> -	}
> -out:
>   	if (!r) {
> -		amdgpu_irq_gpu_reset_resume_helper(adev);
> -		r = amdgpu_ib_ring_tests(adev);
> -		if (r) {
> -			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
> -			r = amdgpu_suspend(adev);
> -			need_full_reset = true;
> -			goto retry;
> -		}
> -		/**
> -		 * recovery vm page tables, since we cannot depend on VRAM is
> -		 * consistent after gpu full reset.
> -		 */
> -		if (need_full_reset && amdgpu_need_backup(adev)) {
> +		if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
> +			(reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
>   			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>   			struct amdgpu_bo *bo, *tmp;
>   			struct dma_fence *fence = NULL, *next = NULL;
> @@ -3087,40 +3035,56 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   			}
>   			dma_fence_put(fence);
>   		}
> +
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = adev->rings[i];
>   
>   			if (!ring || !ring->sched.thread)
>   				continue;
>   
> +			/* only focus on the ring that hit the timeout if @job is not NULL */
> +			if (job && job->ring->idx != i)
> +				continue;
> +
>   			amd_sched_job_recovery(&ring->sched);
>   			kthread_unpark(ring->sched.thread);
>   		}
>   	} else {
> -		dev_err(adev->dev, "asic resume failed (%d).\n", r);
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -			if (adev->rings[i] && adev->rings[i]->sched.thread) {
> -				kthread_unpark(adev->rings[i]->sched.thread);
> -			}
> +			struct amdgpu_ring *ring = adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			/* only focus on the ring that hit the timeout if @job is not NULL */
> +			if (job && job->ring->idx != i)
> +				continue;
> +
> +			kthread_unpark(adev->rings[i]->sched.thread);
>   		}
>   	}
>   
>   	if (amdgpu_device_has_dc_support(adev)) {
> -		r = drm_atomic_helper_resume(adev->ddev, state);
> +		if (drm_atomic_helper_resume(adev->ddev, state))
> +			dev_info(adev->dev, "drm resume failed:%d\n", r);
>   		amdgpu_dm_display_resume(adev);
> -	} else
> +	} else {
>   		drm_helper_resume_force_mode(adev->ddev);
> +	}
>   
>   	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
> +
>   	if (r) {
>   		/* bad news, how to tell it to userspace ? */
> -		dev_info(adev->dev, "GPU reset failed\n");
> -	}
> -	else {
> -		dev_info(adev->dev, "GPU reset successed!\n");
> +		dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
> +		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
> +	} else {
> +		dev_info(adev->dev, "GPU reset(%d) successed!\n",atomic_read(&adev->gpu_reset_counter));
>   	}
>   
>   	amdgpu_vf_error_trans_all(adev);
> +	adev->in_sriov_reset = 0;
> +	mutex_unlock(&adev->virt.lock_reset);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 80ee1c1..d0e5aeb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -694,25 +694,25 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
>   }
>   
>   /**
> - * amdgpu_debugfs_gpu_reset - manually trigger a gpu reset
> + * amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover
>    *
>    * Manually trigger a gpu reset at the next fence wait.
>    */
> -static int amdgpu_debugfs_gpu_reset(struct seq_file *m, void *data)
> +static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
>   {
>   	struct drm_info_node *node = (struct drm_info_node *) m->private;
>   	struct drm_device *dev = node->minor->dev;
>   	struct amdgpu_device *adev = dev->dev_private;
>   
> -	seq_printf(m, "gpu reset\n");
> -	amdgpu_gpu_reset(adev);
> +	seq_printf(m, "gpu recover\n");
> +	amdgpu_gpu_recover(adev, NULL);
>   
>   	return 0;
>   }
>   
>   static const struct drm_info_list amdgpu_debugfs_fence_list[] = {
>   	{"amdgpu_fence_info", &amdgpu_debugfs_fence_info, 0, NULL},
> -	{"amdgpu_gpu_reset", &amdgpu_debugfs_gpu_reset, 0, NULL}
> +	{"amdgpu_gpu_recover", &amdgpu_debugfs_gpu_recover, 0, NULL}
>   };
>   
>   static const struct drm_info_list amdgpu_debugfs_fence_list_sriov[] = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 32590e4..c340774 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
>   						  reset_work);
>   
>   	if (!amdgpu_sriov_vf(adev))
> -		amdgpu_gpu_reset(adev);
> +		amdgpu_gpu_recover(adev, NULL);
>   }
>   
>   /* Disable *all* interrupts */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 0a90c76..18770a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,10 +37,7 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
>   		  atomic_read(&job->ring->fence_drv.last_seq),
>   		  job->ring->fence_drv.sync_seq);
>   
> -	if (amdgpu_sriov_vf(job->adev))
> -		amdgpu_sriov_gpu_reset(job->adev, job);
> -	else
> -		amdgpu_gpu_reset(job->adev);
> +	amdgpu_gpu_recover(job->adev, job);
>   }
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index d149aca..20bdb8f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -288,7 +288,6 @@ int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>   int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
> -int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job);
>   int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
>   void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
>   int amdgpu_virt_fw_reserve_get_checksum(void *obj, unsigned long obj_size,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index f91aab3..c32d0b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -254,7 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	}
>   
>   	/* Trigger recovery due to world switch failure */
> -	amdgpu_sriov_gpu_reset(adev, NULL);
> +	amdgpu_gpu_recover(adev, NULL);
>   }
>   
>   static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 27b03c7..818ec0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -519,7 +519,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>   	}
>   
>   	/* Trigger recovery due to world switch failure */
> -	amdgpu_sriov_gpu_reset(adev, NULL);
> +	amdgpu_gpu_recover(adev, NULL);
>   }
>   
>   static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,



* RE: [PATCH 1/6] amd/scheduler:imple job skip feature(v3)
       [not found]         ` <de065e02-acb7-2891-70ca-5eab3ce3365d-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-10-30 10:33           ` Liu, Monk
  0 siblings, 0 replies; 20+ messages in thread
From: Liu, Monk @ 2017-10-30 10:33 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Yeah, my typo, thanks!

-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 
Sent: 30 October 2017 18:00
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/6] amd/scheduler:imple job skip feature(v3)

On 30.10.2017 at 05:15, Monk Liu wrote:
> jobs are skipped in two cases:
> 1) when the entity behind this job is marked guilty, the job popped from
> this entity's queue will be dropped in the sched_main loop.
>
> 2) in job_recovery(), skip scheduling a job if its karma is detected
> above the limit, and likewise skip the other jobs sharing the same
> fence context. this approach is needed because job_recovery() cannot
> access job->entity since the entity may already be dead.
>
> v2:
> some logic fix
>
> v3:
> when the entity is detected guilty, don't drop the job in the popping
> stage; instead set its fence error to -ECANCELED
>
> in run_job(), skip the scheduling if either: 1) fence->error < 0, or
> 2) a VRAM LOST occurred on this job.
> this way we can unify the job skipping logic.
>
> with this feature we can introduce the new gpu recover feature.
>
> Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 13 +++++----
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 ++++++++++++++++-----------
>   2 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index f60662e..0a90c76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -180,7 +180,7 @@ static struct dma_fence 
> *amdgpu_job_dependency(struct amd_sched_job *sched_job,
>   
>   static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   {
> -	struct dma_fence *fence = NULL;
> +	struct dma_fence *fence = NULL, *finished;
>   	struct amdgpu_device *adev;
>   	struct amdgpu_job *job;
>   	int r;
> @@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   		return NULL;
>   	}
>   	job = to_amdgpu_job(sched_job);
> +	finished = &job->base.s_fence->finished;
>   	adev = job->adev;
>   
>   	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
>   
>   	trace_amdgpu_sched_run_job(job);
> -	/* skip ib schedule when vram is lost */
> -	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
> -		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
> -		DRM_ERROR("Skip scheduling IBs!\n");
> +
> +	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
> +		dma_fence_set_error(finished, -ECANCELED); /* skip IB as well if VRAM lost */
> +
> +	if (finished->error < 0) {
> +		DRM_INFO("Skip scheduling IBs!\n");
>   	} else {
>   		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
>   				       &fence);
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 903ef8b..3d8c994 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>   		if (amd_sched_entity_add_dependency_cb(entity))
>   			return NULL;
>   
> +	/* skip jobs from entity that marked guilty */
> +	if (entity->guilty && atomic_read(entity->guilty))
> +		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
> +
>   	spsc_queue_pop(&entity->job_queue);
>   	return sched_job;
>   }
> @@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
>   	job->sched->ops->timedout_job(job);
>   }
>   
> -static void amd_sched_set_guilty(struct amd_sched_job *s_job,
> -				 struct amd_sched_entity *s_entity)
> -{
> -	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
> -		if (s_entity->guilty)
> -			atomic_set(s_entity->guilty, 1);
> -}
> -
>   void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
>   {
>   	struct amd_sched_job *s_job;
> @@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
>   	spin_unlock(&sched->job_list_lock);
>   
>   	if (bad) {
> -		bool found = false;
> -
> -		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
> +		/* don't increase @bad's karma if it's from KERNEL RQ,
> +		 * because sometimes a GPU hang would cause kernel jobs (like VM updating jobs)
> +		 * to be corrupted, but keep in mind that kernel jobs are always considered good.
> +		 */
> +		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++) {
>   			struct amd_sched_rq *rq = &sched->sched_rq[i];
>   
>   			spin_lock(&rq->lock);
>   			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>   				if (bad->s_fence->scheduled.context == entity->fence_context) {
> -					found = true;
> -					amd_sched_set_guilty(bad, entity);
> +				    if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
> +						if (entity->guilty)
> +							atomic_set(entity->guilty, 1);
>   					break;
>   				}
>   			}
>   			spin_unlock(&rq->lock);
> -			if (found)
> +			if (&entity->list == &rq->entities)

That needs to be "&entity->list != &rq->entities", otherwise we only ever check the first run queue.
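
For reference, with list_for_each_entry_safe() the cursor only ends up
pointing at the list head itself when the loop runs to completion, so
the usual "did we break out early?" test looks like this (a generic
sketch; matches() is a made-up predicate):

	list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
		if (matches(entity))
			break;
	}
	if (&entity->list != &rq->entities)
		break;	/* broke out of the inner loop: entity is the match */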

With that fixed the patch is Reviewed-by: Christian König <christian.koenig@amd.com>.

Nice work,
Christian.

>   				break;
>   		}
>   	}
> @@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
>   void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   {
>   	struct amd_sched_job *s_job, *tmp;
> +	bool found_guilty = false;
>   	int r;
>   
>   	spin_lock(&sched->job_list_lock);
> @@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>   		struct amd_sched_fence *s_fence = s_job->s_fence;
>   		struct dma_fence *fence;
> +		uint64_t guilty_context;
> +
> +		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
> +			found_guilty = true;
> +			guilty_context = s_job->s_fence->scheduled.context;
> +		}
> +
> +		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
> +			dma_fence_set_error(&s_fence->finished, -ECANCELED);
>   
>   		spin_unlock(&sched->job_list_lock);
>   		fence = sched->ops->run_job(s_job);
> @@ -525,7 +534,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   					  r);
>   			dma_fence_put(fence);
>   		} else {
> -			DRM_ERROR("Failed to run job!\n");
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   		spin_lock(&sched->job_list_lock);
> @@ -663,7 +671,6 @@ static int amd_sched_main(void *param)
>   					  r);
>   			dma_fence_put(fence);
>   		} else {
> -			DRM_ERROR("Failed to run job!\n");
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   



* Re: [PATCH 3/6] drm/amdgpu:cleanup in_sriov_reset and lock_reset
       [not found]     ` <1509336909-11455-4-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-30 10:36       ` Christian König
  0 siblings, 0 replies; 20+ messages in thread
From: Christian König @ 2017-10-30 10:36 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 30.10.2017 at 05:15, Monk Liu wrote:
> since gpu reset is now unified with gpu_recover
> for both bare-metal and SR-IOV:
>
> 1) rename in_sriov_reset to in_gpu_reset
> 2) move lock_reset from adev->virt to adev
>
> Change-Id: I9f4dbab9a4c916fbc156f669824d15ddcd0f2322
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c  | 2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 2 --
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   | 1 -
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c      | 6 +++---
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 6 +++---
>   8 files changed, 15 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 335df11..6e89be5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1650,7 +1650,8 @@ struct amdgpu_device {
>   
>   	/* record last mm index being written through WREG32*/
>   	unsigned long last_mm_index;
> -	bool                            in_sriov_reset;
> +	bool                            in_gpu_reset;
> +	struct mutex  lock_reset;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7bccd45..a144578 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2161,6 +2161,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->mn_lock);
>   	mutex_init(&adev->virt.vf_errors.lock);
>   	hash_init(adev->mn_hash);
> +	mutex_init(&adev->lock_reset);
>   
>   	amdgpu_check_arguments(adev);
>   
> @@ -2971,9 +2972,9 @@ int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
>   
>   	dev_info(adev->dev, "GPU reset begin!\n");
>   
> -	mutex_lock(&adev->virt.lock_reset);
> +	mutex_lock(&adev->lock_reset);
>   	atomic_inc(&adev->gpu_reset_counter);
> -	adev->in_sriov_reset = 1;
> +	adev->in_gpu_reset = 1;
>   
>   	/* block TTM */
>   	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> @@ -3083,8 +3084,8 @@ int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
>   	}
>   
>   	amdgpu_vf_error_trans_all(adev);
> -	adev->in_sriov_reset = 0;
> -	mutex_unlock(&adev->virt.lock_reset);
> +	adev->in_gpu_reset = 0;
> +	mutex_unlock(&adev->lock_reset);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 447d446..76f531b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -264,7 +264,7 @@ static int psp_hw_start(struct psp_context *psp)
>   	struct amdgpu_device *adev = psp->adev;
>   	int ret;
>   
> -	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
> +	if (!amdgpu_sriov_vf(adev) || !adev->in_gpu_reset) {
>   		ret = psp_bootloader_load_sysdrv(psp);
>   		if (ret)
>   			return ret;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> index 6564902..edc37cc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> @@ -370,7 +370,7 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   		return 0;
>   	}
>   
> -	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
> +	if (!amdgpu_sriov_vf(adev) || !adev->in_gpu_reset) {
>   		err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
>   					amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
>   					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index fee08af..f791518 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -115,8 +115,6 @@ void amdgpu_virt_init_setting(struct amdgpu_device *adev)
>   	adev->enable_virtual_display = true;
>   	adev->cg_flags = 0;
>   	adev->pg_flags = 0;
> -
> -	mutex_init(&adev->virt.lock_reset);
>   }
>   
>   uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 20bdb8f..e3f78f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -239,7 +239,6 @@ struct amdgpu_virt {
>   	uint64_t			csa_vmid0_addr;
>   	bool chained_ib_support;
>   	uint32_t			reg_val_offs;
> -	struct mutex                    lock_reset;
>   	struct amdgpu_irq_src		ack_irq;
>   	struct amdgpu_irq_src		rcv_irq;
>   	struct work_struct		flr_work;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index e0b7876..a74515a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4815,7 +4815,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
>   
>   	gfx_v8_0_kiq_setting(ring);
>   
> -	if (adev->in_sriov_reset) { /* for GPU_RESET case */
> +	if (adev->in_gpu_reset) { /* for GPU_RESET case */
>   		/* reset MQD to a clean status */
>   		if (adev->gfx.mec.mqd_backup[mqd_idx])
>   			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
> @@ -4852,7 +4852,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
>   	struct vi_mqd *mqd = ring->mqd_ptr;
>   	int mqd_idx = ring - &adev->gfx.compute_ring[0];
>   
> -	if (!adev->in_sriov_reset && !adev->gfx.in_suspend) {
> +	if (!adev->in_gpu_reset && !adev->gfx.in_suspend) {
>   		memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
>   		((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
>   		((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
> @@ -4864,7 +4864,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
>   
>   		if (adev->gfx.mec.mqd_backup[mqd_idx])
>   			memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation));
> -	} else if (adev->in_sriov_reset) { /* for GPU_RESET case */
> +	} else if (adev->in_gpu_reset) { /* for GPU_RESET case */
>   		/* reset MQD to a clean status */
>   		if (adev->gfx.mec.mqd_backup[mqd_idx])
>   			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 39b02e9..9855dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2740,7 +2740,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
>   
>   	gfx_v9_0_kiq_setting(ring);
>   
> -	if (adev->in_sriov_reset) { /* for GPU_RESET case */
> +	if (adev->in_gpu_reset) { /* for GPU_RESET case */
>   		/* reset MQD to a clean status */
>   		if (adev->gfx.mec.mqd_backup[mqd_idx])
>   			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
> @@ -2778,7 +2778,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
>   	struct v9_mqd *mqd = ring->mqd_ptr;
>   	int mqd_idx = ring - &adev->gfx.compute_ring[0];
>   
> -	if (!adev->in_sriov_reset && !adev->gfx.in_suspend) {
> +	if (!adev->in_gpu_reset && !adev->gfx.in_suspend) {
>   		memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
>   		((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
>   		((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
> @@ -2790,7 +2790,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
>   
>   		if (adev->gfx.mec.mqd_backup[mqd_idx])
>   			memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation));
> -	} else if (adev->in_sriov_reset) { /* for GPU_RESET case */
> +	} else if (adev->in_gpu_reset) { /* for GPU_RESET case */
>   		/* reset MQD to a clean status */
>   		if (adev->gfx.mec.mqd_backup[mqd_idx])
>   			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));



* Re: [PATCH 4/6] drm/amdgpu:cleanup ucode_init_bo
       [not found]     ` <1509336909-11455-5-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-30 10:36       ` Christian König
  0 siblings, 0 replies; 20+ messages in thread
From: Christian König @ 2017-10-30 10:36 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 30.10.2017 at 05:15, Monk Liu wrote:
> 1) no sriov check since gpu recover is unified
> 2) need the CPU_ACCESS_REQUIRED flag for VRAM if SRIOV,
> because otherwise, after the following pin, the first allocated
> VRAM bo is wasted due to a TTM manager quirk.
>
> Change-Id: I4d029f2da8bb463942c7861d3e52f309bdba9576
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> index edc37cc..ab9b2d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> @@ -370,10 +370,10 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   		return 0;
>   	}
>   
> -	if (!amdgpu_sriov_vf(adev) || !adev->in_gpu_reset) {
> +	if (!adev->in_gpu_reset) {
>   		err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
>   					amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> -					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> +					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS|AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
>   					NULL, NULL, 0, bo);
>   		if (err) {
>   			dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", err);



* Re: [PATCH 5/6] drm/amdgpu/sriov:fix memory leak in psp_load_fw
       [not found]     ` <1509336909-11455-6-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-30 10:37       ` Christian König
  0 siblings, 0 replies; 20+ messages in thread
From: Christian König @ 2017-10-30 10:37 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 30.10.2017 at 05:15, Monk Liu wrote:
> for SR-IOV, this routine shouldn't allocate resources again
> when doing a gpu reset, otherwise the memory is leaked
>
> Change-Id: I25da3a5b475196c75c7e639adc40751754625968
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Acked-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 20 ++++++++++++--------
>   1 file changed, 12 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 76f531b..2157d45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -334,23 +334,26 @@ static int psp_load_fw(struct amdgpu_device *adev)
>   	int ret;
>   	struct psp_context *psp = &adev->psp;
>   
> +	if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset != 0)
> +		goto skip_memalloc;
> +
>   	psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
>   	if (!psp->cmd)
>   		return -ENOMEM;
>   
>   	ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
> -				      AMDGPU_GEM_DOMAIN_GTT,
> -				      &psp->fw_pri_bo,
> -				      &psp->fw_pri_mc_addr,
> -				      &psp->fw_pri_buf);
> +					AMDGPU_GEM_DOMAIN_GTT,
> +					&psp->fw_pri_bo,
> +					&psp->fw_pri_mc_addr,
> +					&psp->fw_pri_buf);
>   	if (ret)
>   		goto failed;
>   
>   	ret = amdgpu_bo_create_kernel(adev, PSP_FENCE_BUFFER_SIZE, PAGE_SIZE,
> -				      AMDGPU_GEM_DOMAIN_VRAM,
> -				      &psp->fence_buf_bo,
> -				      &psp->fence_buf_mc_addr,
> -				      &psp->fence_buf);
> +					AMDGPU_GEM_DOMAIN_VRAM,
> +					&psp->fence_buf_bo,
> +					&psp->fence_buf_mc_addr,
> +					&psp->fence_buf);
>   	if (ret)
>   		goto failed_mem2;
>   
> @@ -375,6 +378,7 @@ static int psp_load_fw(struct amdgpu_device *adev)
>   	if (ret)
>   		goto failed_mem;
>   
> +skip_memalloc:
>   	ret = psp_hw_start(psp);
>   	if (ret)
>   		goto failed_mem;



* Re: [PATCH 6/6] drm/amdgpu:fix random missing of FLR NOTIFY
       [not found]     ` <1509336909-11455-7-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-30 10:37       ` Christian König
  0 siblings, 0 replies; 20+ messages in thread
From: Christian König @ 2017-10-30 10:37 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 30.10.2017 at 05:15, Monk Liu wrote:
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Acked-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 14 +++++++++++---
>   1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index c32d0b0..d31259e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -282,9 +282,17 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
>   		/* see what event we get */
>   		r = xgpu_ai_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
>   
> -		/* only handle FLR_NOTIFY now */
> -		if (!r)
> -			schedule_work(&adev->virt.flr_work);
> +		/* sometimes the interrupt is delayed before being injected into the VM;
> +		 * in such a case the IDH_FLR_NOTIFICATION is overwritten by the VF FLR
> +		 * from the GIM side, so the receive above can fail, and we should
> +		 * schedule the flr_work anyway
> +		 */
> +		if (r) {
> +			DRM_ERROR("FLR_NOTIFICATION is missed\n");
> +			xgpu_ai_mailbox_send_ack(adev);
> +		}
> +
> +		schedule_work(&adev->virt.flr_work);
>   	}
>   
>   	return 0;



* Re: [PATCH 0/7] *** GPU recover V3 ***
       [not found] ` <1509336909-11455-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (5 preceding siblings ...)
  2017-10-30  4:15   ` [PATCH 6/6] drm/amdgpu:fix random missing of FLR NOTIFY Monk Liu
@ 2017-11-09  9:35   ` Julien Isorce
       [not found]     ` <CAHWPjbU=47WZzbdWYPbP360v4qa6t=_pCtkYcxO2J4bSwA8jVA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  6 siblings, 1 reply; 20+ messages in thread
From: Julien Isorce @ 2017-11-09  9:35 UTC (permalink / raw)
  To: Monk Liu; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



Hi Monk.

I am interested in this. Currently, when a "ring X stalled for more than N
sec" happens, it usually goes into the gpu reset routine.
Does it always cause the vram to be lost? Could you explain what happens
if the vram remains lost?

I am asking this because I experienced some recurrent gpu resets that are
marked as succeeded in the log but fail in the "resume" step.
I would not be interested in this if it always left the user a chance to
cleanly reboot the machine.

The issue is that it can require a hard reboot, without a kernel panic and
without the keyboard still responding to magic keys.
Are those patches trying to address this issue?

Note that here "issue" refers neither to the root cause of a ring X stall
nor to why the "resume" step fails.

Thx a lot
Julien


On 30 October 2017 at 04:15, Monk Liu <Monk.Liu-5C7GfCeVMHo@public.gmane.org> wrote:

> *** job skipping logic in scheduler part is re-implemented  ***
>
> Monk Liu (7):
>   amd/scheduler:imple job skip feature(v3)
>   drm/amdgpu:implement new GPU recover(v3)
>   drm/amdgpu:cleanup in_sriov_reset and lock_reset
>   drm/amdgpu:cleanup ucode_init_bo
>   drm/amdgpu:block kms open during gpu_reset
>   drm/amdgpu/sriov:fix memory leak in psp_load_fw
>   drm/amdgpu:fix random missing of FLR NOTIFY
>
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   9 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 311 ++++++++++++--------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     |  10 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c       |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  18 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |   3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       |  22 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c     |   4 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |   2 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |   2 -
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   6 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   6 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  16 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   2 +-
>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  39 ++--
>  15 files changed, 220 insertions(+), 232 deletions(-)
>
> --
> 2.7.4
>

* Re: [PATCH 0/7] *** GPU recover V3 ***
       [not found]     ` <CAHWPjbU=47WZzbdWYPbP360v4qa6t=_pCtkYcxO2J4bSwA8jVA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-11-09 18:08       ` Alex Deucher
       [not found]         ` <CADnq5_Otuz78P8YuoSYZNNx+iy2Pi4bBqLxvt4NrhwYDWXtjLw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-11-10  7:51       ` Liu, Monk
  1 sibling, 1 reply; 20+ messages in thread
From: Alex Deucher @ 2017-11-09 18:08 UTC (permalink / raw)
  To: Julien Isorce; +Cc: amd-gfx list, Monk Liu

On Thu, Nov 9, 2017 at 4:35 AM, Julien Isorce <julien.isorce@gmail.com> wrote:
> Hi Monk.
>
> I am interested on this. Currently when a "ring X stalled for more than N
> sec" happens it usually goes into the gpu reset routine.
> Does it always cause the vram to be lost ? Could you explain what happens if
> the vram remains lost ?

It means the contents of vram are gone or unreliable.  In that case
applications need to re-initialize all of their buffers before
submitting any work.  You really need to add GL_robustness support to
any applications you care about.  Whether vram is lost or not depends
on the reset method and the asic.  E.g., soft reset of a specific
engine won't cause a loss of vram, but a full adapter reset or an FLR
may.
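
E.g. the application side then looks roughly like this (a sketch only;
it assumes the context was created with a reset notification strategy,
and reinit_gl_resources() is a made-up application hook):

	/* GL_ARB_robustness: poll the reset status after a hang */
	GLenum status = glGetGraphicsResetStatusARB();

	if (status != GL_NO_ERROR) {
		/* GL_GUILTY_CONTEXT_RESET_ARB, GL_INNOCENT_CONTEXT_RESET_ARB
		 * or GL_UNKNOWN_CONTEXT_RESET_ARB: the context and whatever
		 * it had in vram are gone.  Create a new context and
		 * re-upload all buffers/textures before submitting work. */
		reinit_gl_resources();
	}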

>
> I am asking this because I experienced some recurrent gpu reset that are
> marked succeeded from the log but fail in the "resume" step.
> I would not be interested in this if it would always leave a chance to the
> user to cleanly reboot the machine.
>
> The issue is that it can require a hard reboot without kernel panic and
> without keeping the keyboard responding to magic keys.
> Are those patches trying to address this issue ?
>
> Note that here "issue" is not referring to the root cause of a ring X
> stalled and it is also not referring to why "resume" step fails.

There were a few issues that caused problems with GPU reset.  The
biggest was that the GPU scheduler deadlocked in certain cases so if
you got a GPU hang, the driver locked up.  That should mostly be
straightened out at this point.  I think there may still be some
deadlocks in the modesetting code after a reset.  Once that is sorted,
it will come down to fine tuning the actual reset sequences.  Full
adapter resets are the easiest to get working reliably (and are
already implemented in the driver), but also the most destructive.

Alex

>
> Thx a lot
> Julien
>
>
> On 30 October 2017 at 04:15, Monk Liu <Monk.Liu@amd.com> wrote:
>>
>> *** job skipping logic in scheduler part is re-implemented  ***
>>
>> Monk Liu (7):
>>   amd/scheduler:imple job skip feature(v3)
>>   drm/amdgpu:implement new GPU recover(v3)
>>   drm/amdgpu:cleanup in_sriov_reset and lock_reset
>>   drm/amdgpu:cleanup ucode_init_bo
>>   drm/amdgpu:block kms open during gpu_reset
>>   drm/amdgpu/sriov:fix memory leak in psp_load_fw
>>   drm/amdgpu:fix random missing of FLR NOTIFY
>>
>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   9 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 311 ++++++++++++--------------
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     |  10 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c       |   2 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  18 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |   3 +
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       |  22 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c     |   4 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |   2 -
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |   2 -
>>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   6 +-
>>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   6 +-
>>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  16 +-
>>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   2 +-
>>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  39 ++--
>>  15 files changed, 220 insertions(+), 232 deletions(-)
>>
>> --
>> 2.7.4
>>

* RE: [PATCH 0/7] *** GPU recover V3 ***
       [not found]     ` <CAHWPjbU=47WZzbdWYPbP360v4qa6t=_pCtkYcxO2J4bSwA8jVA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-11-09 18:08       ` Alex Deucher
@ 2017-11-10  7:51       ` Liu, Monk
       [not found]         ` <BY1PR12MB0456E48B02CFAE654F1F145484540-PicGAnIBOoZZHrdT0E092gdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  1 sibling, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2017-11-10  7:51 UTC (permalink / raw)
  To: Julien Isorce; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



Please share the dmesg log, and what chip are you using?

From: Julien Isorce [mailto:julien.isorce@gmail.com]
Sent: 9 November 2017 17:35
To: Liu, Monk <Monk.Liu@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 0/7] *** GPU recover V3 ***

Hi Monk.

I am interested on this. Currently when a "ring X stalled for more than N sec" happens it usually goes into the gpu reset routine.
Does it always cause the vram to be lost ? Could you explain what happens if the vram remains lost ?

I am asking this because I experienced some recurrent gpu reset that are marked succeeded from the log but fail in the "resume" step.
I would not be interested in this if it would always leave a chance to the user to cleanly reboot the machine.

The issue is that it can require a hard reboot without kernel panic and without keeping the keyboard responding to magic keys.
Are those patches trying to address this issue ?

Note that here "issue" is not referring to the root cause of a ring X stalled and it is also not referring to why "resume" step fails.

Thx a lot
Julien


On 30 October 2017 at 04:15, Monk Liu <Monk.Liu@amd.com<mailto:Monk.Liu@amd.com>> wrote:
*** job skipping logic in scheduler part is re-implemented  ***

Monk Liu (7):
  amd/scheduler:imple job skip feature(v3)
  drm/amdgpu:implement new GPU recover(v3)
  drm/amdgpu:cleanup in_sriov_reset and lock_reset
  drm/amdgpu:cleanup ucode_init_bo
  drm/amdgpu:block kms open during gpu_reset
  drm/amdgpu/sriov:fix memory leak in psp_load_fw
  drm/amdgpu:fix random missing of FLR NOTIFY

 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   9 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 311 ++++++++++++--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     |  10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c       |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  18 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |   3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       |  22 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c     |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |   2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |   2 -
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   6 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   6 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  16 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   2 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  39 ++--
 15 files changed, 220 insertions(+), 232 deletions(-)

--
2.7.4


* Re: [PATCH 0/7] *** GPU recover V3 ***
       [not found]         ` <CADnq5_Otuz78P8YuoSYZNNx+iy2Pi4bBqLxvt4NrhwYDWXtjLw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-11-13 11:24           ` Julien Isorce
       [not found]             ` <CAHWPjbX+gqRLMgr896kzSXPeQnUU8X3_1jPfZtfzLB_VsuEa3A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Julien Isorce @ 2017-11-13 11:24 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx list, Monk Liu



Hi Alex,

Thx for your reply, but in all of the cases you mentioned, the user would
still be able to reboot properly (i.e. by typing reboot or using a magic
keyboard key), or would at least get a trace of a kernel panic if one
happens, is that correct?

Thx
Julien

On 9 November 2017 at 18:08, Alex Deucher <alexdeucher-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> On Thu, Nov 9, 2017 at 4:35 AM, Julien Isorce <julien.isorce-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> wrote:
> > Hi Monk.
> >
> > I am interested on this. Currently when a "ring X stalled for more than N
> > sec" happens it usually goes into the gpu reset routine.
> > Does it always cause the vram to be lost ? Could you explain what
> happens if
> > the vram remains lost ?
>
> It means the contents of vram are gone or unreliable.  In that case
> applications need to re-initialize all of their buffers before
> submitting any work.  You really need to add GL_robustness support to
> any applications you care about.  Whether vram is lost or not depends
> on the reset method and the asic.  E.g., soft reset of a specific
> engine won't cause a loss of vram, but a full adapter reset or an FLR
> may.
>
> >
> > I am asking this because I experienced some recurrent gpu reset that are
> > marked succeeded from the log but fail in the "resume" step.
> > I would not be interested in this if it would always leave a chance to
> the
> > user to cleanly reboot the machine.
> >
> > The issue is that it can require a hard reboot without kernel panic and
> > without keeping the keyboard responding to magic keys.
> > Are those patches trying to address this issue ?
> >
> > Note that here "issue" is not referring to the root cause of a ring X
> > stalled and it is also not referring to why "resume" step fails.
>
> There were a few issues that caused problems with GPU reset.  The
> biggest was that the GPU scheduler deadlocked in certain cases so if
> you got a GPU hang, the driver locked up.  That should mostly be
> straightened out at this point.  I think there may still be some
> deadlocks in the modesetting code after a reset.  Once that is sorted,
> it will come down to fine tuning the actual reset sequences.  Full
> adapter resets are the easiest to get working reliably (and are
> already implemented in the driver), but also the most destructive.
>
> Alex
>
> >
> > Thx a lot
> > Julien
> >
> >
> > On 30 October 2017 at 04:15, Monk Liu <Monk.Liu-5C7GfCeVMHo@public.gmane.org> wrote:
> >>
> >> *** job skipping logic in scheduler part is re-implemented  ***
> >>
> >> Monk Liu (7):
> >>   amd/scheduler:imple job skip feature(v3)
> >>   drm/amdgpu:implement new GPU recover(v3)
> >>   drm/amdgpu:cleanup in_sriov_reset and lock_reset
> >>   drm/amdgpu:cleanup ucode_init_bo
> >>   drm/amdgpu:block kms open during gpu_reset
> >>   drm/amdgpu/sriov:fix memory leak in psp_load_fw
> >>   drm/amdgpu:fix random missing of FLR NOTIFY
> >>
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   9 +-
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 311 ++++++++++++--------------
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     |  10 +-
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c       |   2 +-
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  18 +-
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |   3 +
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       |  22 +-
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c     |   4 +-
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |   2 -
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |   2 -
> >>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   6 +-
> >>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   6 +-
> >>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  16 +-
> >>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   2 +-
> >>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  39 ++--
> >>  15 files changed, 220 insertions(+), 232 deletions(-)
> >>
> >> --
> >> 2.7.4
> >>

* Re: [PATCH 0/7] *** GPU recover V3 ***
       [not found]         ` <BY1PR12MB0456E48B02CFAE654F1F145484540-PicGAnIBOoZZHrdT0E092gdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-11-13 11:26           ` Julien Isorce
  0 siblings, 0 replies; 20+ messages in thread
From: Julien Isorce @ 2017-11-13 11:26 UTC (permalink / raw)
  To: Liu, Monk; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



Hi Monk,

It was more of a general question. So you never need to do a hard
(electrical) reboot when a gpu reset fails?

Thx
Julien

On 10 November 2017 at 07:51, Liu, Monk <Monk.Liu-5C7GfCeVMHo@public.gmane.org> wrote:

> Please share the dmesg log, and what’s the chip are you using ?
>
>
>
> *From:* Julien Isorce [mailto:julien.isorce-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org]
> *Sent:* 2017年11月9日 17:35
> *To:* Liu, Monk <Monk.Liu-5C7GfCeVMHo@public.gmane.org>
> *Cc:* amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> *Subject:* Re: [PATCH 0/7] *** GPU recover V3 ***
>
>
>
> Hi Monk.
>
>
>
> I am interested on this. Currently when a "ring X stalled for more than N
> sec" happens it usually goes into the gpu reset routine.
>
> Does it always cause the vram to be lost ? Could you explain what happens
> if the vram remains lost ?
>
>
>
> I am asking this because I experienced some recurrent gpu reset that are
> marked succeeded from the log but fail in the "resume" step.
>
> I would not be interested in this if it would always leave a chance to the
> user to cleanly reboot the machine.
>
>
>
> The issue is that it can require a hard reboot without kernel panic and
> without keeping the keyboard responding to magic keys.
>
> Are those patches trying to address this issue ?
>
>
>
> Note that here "issue" is not referring to the root cause of a ring X
> stalled and it is also not referring to why "resume" step fails.
>
>
>
> Thx a lot
>
> Julien
>
>
>
>
>
> On 30 October 2017 at 04:15, Monk Liu <Monk.Liu-5C7GfCeVMHo@public.gmane.org> wrote:
>
> *** job skipping logic in scheduler part is re-implemented  ***
>
> Monk Liu (7):
>   amd/scheduler:imple job skip feature(v3)
>   drm/amdgpu:implement new GPU recover(v3)
>   drm/amdgpu:cleanup in_sriov_reset and lock_reset
>   drm/amdgpu:cleanup ucode_init_bo
>   drm/amdgpu:block kms open during gpu_reset
>   drm/amdgpu/sriov:fix memory leak in psp_load_fw
>   drm/amdgpu:fix random missing of FLR NOTIFY
>
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   9 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 311 ++++++++++++--------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     |  10 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c       |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  18 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |   3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       |  22 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c     |   4 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |   2 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |   2 -
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   6 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   6 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  16 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   2 +-
>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  39 ++--
>  15 files changed, 220 insertions(+), 232 deletions(-)
>
> --
> 2.7.4
>


* Re: [PATCH 0/7] *** GPU recover V3 ***
       [not found]             ` <CAHWPjbX+gqRLMgr896kzSXPeQnUU8X3_1jPfZtfzLB_VsuEa3A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-11-13 15:23               ` Alex Deucher
  0 siblings, 0 replies; 20+ messages in thread
From: Alex Deucher @ 2017-11-13 15:23 UTC (permalink / raw)
  To: Julien Isorce; +Cc: amd-gfx list, Monk Liu

On Mon, Nov 13, 2017 at 6:24 AM, Julien Isorce <julien.isorce@gmail.com> wrote:
> Hi Alex,
>
> Thx for your reply, but in all of the cases you mentioned, the user
> would still be able to reboot properly (i.e. by typing reboot or using
> a magic SysRq key), or at least get a kernel panic trace if one
> happens. Is that correct?

Yes, the deadlock in the GPU scheduler was the issue preventing that
from working properly.

Alex
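
The "job skip" feature named in patch 1 of this series gives a sense of
how such a deadlock can be avoided: rather than resubmitting a job whose
context was judged guilty after a hang, the scheduler can fail the job's
finished fence and signal it directly, so waiters are released instead
of blocking behind work that will never complete. A minimal sketch of
that idea, using struct and helper names in the style of
drivers/gpu/drm/amd/scheduler from this era; this is an illustration,
not the actual patch:

    static void skip_guilty_job(struct amd_sched_job *s_job)
    {
            /* -ECANCELED marks the job as deliberately dropped */
            dma_fence_set_error(&s_job->s_fence->finished, -ECANCELED);

            /* signal completion without touching the hardware ring */
            amd_sched_fence_finished(s_job->s_fence);
    }

Consumers then see a signalled fence with an error attached, rather than
a fence that never signals.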

>
> Thx
> Julien
>
> On 9 November 2017 at 18:08, Alex Deucher <alexdeucher@gmail.com> wrote:
>>
>> On Thu, Nov 9, 2017 at 4:35 AM, Julien Isorce <julien.isorce@gmail.com>
>> wrote:
>> > Hi Monk.
>> >
>> > I am interested in this. Currently, when a "ring X stalled for more
>> > than N sec" happens, it usually goes into the GPU reset routine.
>> > Does it always cause the VRAM to be lost? Could you explain what
>> > happens if the VRAM remains lost?
>>
>> It means the contents of vram are gone or unreliable.  In that case
>> applications need to re-initialize all of their buffers before
>> submitting any work.  You really need to add GL_robustness support to
>> any applications you care about.  Whether vram is lost or not depends
>> on the reset method and the asic.  E.g., soft reset of a specific
>> engine won't cause a loss of vram, but a full adapter reset or an FLR
>> may.
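
To make the GL robustness point concrete: a client can poll the reset
status each frame and rebuild everything when a reset is reported. A
minimal sketch, assuming a context created with the ARB_robustness
LOSE_CONTEXT_ON_RESET strategy; the entry point is assumed to be
resolved via glXGetProcAddress()/eglGetProcAddress(), and
recreate_context()/reupload_resources() are hypothetical application
helpers:

    #include <GL/gl.h>
    #include <GL/glext.h>

    /* assumed resolved at startup via glXGetProcAddress() */
    extern PFNGLGETGRAPHICSRESETSTATUSARBPROC glGetGraphicsResetStatusARB;

    /* hypothetical application helpers */
    extern void recreate_context(void);
    extern void reupload_resources(void);

    static int handle_possible_gpu_reset(void)
    {
            GLenum status = glGetGraphicsResetStatusARB();

            if (status == GL_NO_ERROR)
                    return 0;       /* no reset observed */

            /*
             * GUILTY/INNOCENT/UNKNOWN_CONTEXT_RESET_ARB all mean this
             * context is lost and vram contents are unreliable: drop
             * the context, create a new one and re-upload every
             * buffer and texture before submitting more work.
             */
            recreate_context();
            reupload_resources();
            return 1;
    }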
>>
>> >
>> > I am asking because I have experienced some recurrent GPU resets that
>> > are marked as succeeded in the log but fail in the "resume" step.
>> > I would not be concerned if they always left the user a chance to
>> > cleanly reboot the machine.
>> >
>> > The issue is that it can require a hard reboot, with no kernel panic
>> > and with the keyboard no longer responding to magic SysRq keys.
>> > Are those patches trying to address this issue?
>> >
>> > Note that "issue" here refers neither to the root cause of a ring X
>> > stall nor to why the "resume" step fails.
>>
>> There were a few issues that caused problems with GPU reset.  The
>> biggest was that the GPU scheduler deadlocked in certain cases so if
>> you got a GPU hang, the driver locked up.  That should mostly be
>> straightened out at this point.  I think there may still be some
>> deadlocks in the modesetting code after a reset.  Once that is sorted,
>> it will come down to fine tuning the actual reset sequences.  Full
>> adapter resets are the easiest to get working reliably (and are
>> already implemented in the driver), but also the most destructive.
>>
>> Alex
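
Below GL, the same detection is available through libdrm, which can help
an application decide whether its buffers must be treated as lost. A
minimal sketch, assuming an already-created amdgpu_context_handle and
libdrm's usual include path (-I/usr/include/libdrm); device and context
setup are elided:

    #include <stdio.h>
    #include <stdint.h>
    #include <amdgpu.h>
    #include <amdgpu_drm.h>

    static void report_reset_state(amdgpu_context_handle ctx)
    {
            uint32_t state = 0, hangs = 0;

            if (amdgpu_cs_query_reset_state(ctx, &state, &hangs))
                    return; /* the query itself failed */

            switch (state) {
            case AMDGPU_CTX_GUILTY_RESET:
                    printf("reset happened, this context was the offender\n");
                    break;
            case AMDGPU_CTX_INNOCENT_RESET:
                    printf("reset happened, caused by another context\n");
                    break;
            case AMDGPU_CTX_UNKNOWN_RESET:
                    printf("reset happened, cause unknown\n");
                    break;
            default:        /* AMDGPU_CTX_NO_RESET */
                    break;
            }
    }

Whatever the verdict, after a reset the application should assume its
vram contents are gone and re-upload them, as described above.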
>>
>> >
>> > Thx a lot
>> > Julien
>> >
>> >
>> > On 30 October 2017 at 04:15, Monk Liu <Monk.Liu@amd.com> wrote:
>> >>
>> >> *** job skipping logic in scheduler part is re-implemented  ***
>> >>
>> >> Monk Liu (7):
>> >>   amd/scheduler:imple job skip feature(v3)
>> >>   drm/amdgpu:implement new GPU recover(v3)
>> >>   drm/amdgpu:cleanup in_sriov_reset and lock_reset
>> >>   drm/amdgpu:cleanup ucode_init_bo
>> >>   drm/amdgpu:block kms open during gpu_reset
>> >>   drm/amdgpu/sriov:fix memory leak in psp_load_fw
>> >>   drm/amdgpu:fix random missing of FLR NOTIFY
>> >>
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   9 +-
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 311 ++++++++++++--------------
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     |  10 +-
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c       |   2 +-
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  18 +-
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |   3 +
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       |  22 +-
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c     |   4 +-
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |   2 -
>> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |   2 -
>> >>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   6 +-
>> >>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   6 +-
>> >>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  16 +-
>> >>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   2 +-
>> >>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  39 ++--
>> >>  15 files changed, 220 insertions(+), 232 deletions(-)
>> >>
>> >> --
>> >> 2.7.4
>> >>


end of thread

Thread overview: 20+ messages
2017-10-30  4:15 [PATCH 0/7] *** GPU recover V3 *** Monk Liu
     [not found] ` <1509336909-11455-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-30  4:15   ` [PATCH 1/6] amd/scheduler:imple job skip feature(v3) Monk Liu
     [not found]     ` <1509336909-11455-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-30 10:00       ` Christian König
     [not found]         ` <de065e02-acb7-2891-70ca-5eab3ce3365d-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-30 10:33           ` Liu, Monk
2017-10-30  4:15   ` [PATCH 2/6] drm/amdgpu:implement new GPU recover(v3) Monk Liu
     [not found]     ` <1509336909-11455-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-30 10:06       ` Christian König
2017-10-30  4:15   ` [PATCH 3/6] drm/amdgpu:cleanup in_sriov_reset and lock_reset Monk Liu
     [not found]     ` <1509336909-11455-4-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-30 10:36       ` Christian König
2017-10-30  4:15   ` [PATCH 4/6] drm/amdgpu:cleanup ucode_init_bo Monk Liu
     [not found]     ` <1509336909-11455-5-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-30 10:36       ` Christian König
2017-10-30  4:15   ` [PATCH 5/6] drm/amdgpu/sriov:fix memory leak in psp_load_fw Monk Liu
     [not found]     ` <1509336909-11455-6-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-30 10:37       ` Christian König
2017-10-30  4:15   ` [PATCH 6/6] drm/amdgpu:fix random missing of FLR NOTIFY Monk Liu
     [not found]     ` <1509336909-11455-7-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-30 10:37       ` Christian König
2017-11-09  9:35   ` [PATCH 0/7] *** GPU recover V3 *** Julien Isorce
     [not found]     ` <CAHWPjbU=47WZzbdWYPbP360v4qa6t=_pCtkYcxO2J4bSwA8jVA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-11-09 18:08       ` Alex Deucher
     [not found]         ` <CADnq5_Otuz78P8YuoSYZNNx+iy2Pi4bBqLxvt4NrhwYDWXtjLw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-11-13 11:24           ` Julien Isorce
     [not found]             ` <CAHWPjbX+gqRLMgr896kzSXPeQnUU8X3_1jPfZtfzLB_VsuEa3A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-11-13 15:23               ` Alex Deucher
2017-11-10  7:51       ` Liu, Monk
     [not found]         ` <BY1PR12MB0456E48B02CFAE654F1F145484540-PicGAnIBOoZZHrdT0E092gdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-11-13 11:26           ` Julien Isorce
