All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu: rename explicit to need_pipe_sync for better understanding
@ 2018-11-03  5:33 Monk Liu
       [not found] ` <1541223223-27020-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Monk Liu @ 2018-11-03  5:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

and differentiate it with explicit_bo flag

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 7 +++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e0af44f..1d71f8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -179,11 +179,11 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
 	struct amdgpu_job *job = to_amdgpu_job(sched_job);
 	struct amdgpu_vm *vm = job->vm;
 	struct dma_fence *fence;
-	bool explicit = false;
+	bool need_pipe_sync = false;
 	int r;
 
-	fence = amdgpu_sync_get_fence(&job->sync, &explicit);
-	if (fence && explicit) {
+	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
+	if (fence && need_pipe_sync) {
 		if (drm_sched_dependency_optimized(fence, s_entity)) {
 			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
 					      fence, false);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index 2d6f5ec..a7e1ea8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -36,7 +36,7 @@
 struct amdgpu_sync_entry {
 	struct hlist_node	node;
 	struct dma_fence	*fence;
-	bool	explicit;
+	bool   explicit;
 };
 
 static struct kmem_cache *amdgpu_sync_slab;
@@ -126,6 +126,7 @@ static void amdgpu_sync_keep_later(struct dma_fence **keep,
  *
  * @sync: sync object to add the fence to
  * @f: fence to add
+ * @explicit: this fence is an explicit dependency
  *
  * Tries to add the fence to an existing hash entry. Returns true when an entry
  * was found, false otherwise.
@@ -153,6 +154,8 @@ static bool amdgpu_sync_add_later(struct amdgpu_sync *sync, struct dma_fence *f,
  *
  * @sync: sync object to add fence to
  * @fence: fence to sync to
+ * @explicit: True to indicate that the given @f needs a pipeline sync in the
+ *            case that the job of @sync runs on the same ring as it
  *
  */
 int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync,
@@ -295,7 +298,7 @@ struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync,
  * amdgpu_sync_get_fence - get the next fence from the sync object
  *
  * @sync: sync object to use
- * @explicit: true if the next fence is explicit
+ * @explicit: true if the next fence is explicitly defined, e.g. dependency, syncobj, etc...
  *
  * Get and removes the next fence from the sync object not signaled yet.
  */
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found] ` <1541223223-27020-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2018-11-03  5:33   ` Monk Liu
       [not found]     ` <1541223223-27020-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2018-11-03  5:33   ` [PATCH 3/3] drm/amdgpu: drop need_vm_flush/need_pipe_sync Monk Liu
  1 sibling, 1 reply; 19+ messages in thread
From: Monk Liu @ 2018-11-03  5:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Reasons to drop it:

1) simplify the code: just introducing the field member "need_pipe_sync"
for the job is good enough to tell if the explicit dependency fence
needs to be followed by a pipeline sync.

2) after GPU_recover the explicit fence from sched_sync will not
come back, so the required pipeline_sync following it is missed;
consider scenario below:
>now on ring buffer:
Job-A -> pipe_sync -> Job-B
>TDR occurred on Job-A, and after GPU recover:
>now on ring buffer:
Job-A -> Job-B

because the fence from sched_sync is used and freed after ib_schedule
in first time, it will never come back, with this patch this issue
could be avoided.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
 3 files changed, 10 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index c48207b3..ac7d2da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_ib *ib = &ibs[0];
-	struct dma_fence *tmp = NULL;
 	bool skip_preamble, need_ctx_switch;
 	unsigned patch_offset = ~0;
 	struct amdgpu_vm *vm;
@@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	}
 
 	need_ctx_switch = ring->current_ctx != fence_ctx;
-	if (ring->funcs->emit_pipeline_sync && job &&
-	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
-	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
-	     amdgpu_vm_need_pipeline_sync(ring, job))) {
-		need_pipe_sync = true;
 
-		if (tmp)
-			trace_amdgpu_ib_pipe_sync(job, tmp);
-
-		dma_fence_put(tmp);
+	if (ring->funcs->emit_pipeline_sync && job) {
+		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
+			amdgpu_vm_need_pipeline_sync(ring, job))
+			need_pipe_sync = true;
+		else if (job->need_pipe_sync)
+			need_pipe_sync = true;
 	}
 
 	if (ring->funcs->insert_start)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 1d71f8c..dae997d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
 	(*job)->num_ibs = num_ibs;
 
 	amdgpu_sync_create(&(*job)->sync);
-	amdgpu_sync_create(&(*job)->sched_sync);
 	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
 	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
 
@@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
 	amdgpu_ring_priority_put(ring, s_job->s_priority);
 	dma_fence_put(job->fence);
 	amdgpu_sync_free(&job->sync);
-	amdgpu_sync_free(&job->sched_sync);
 	kfree(job);
 }
 
@@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
 
 	dma_fence_put(job->fence);
 	amdgpu_sync_free(&job->sync);
-	amdgpu_sync_free(&job->sched_sync);
 	kfree(job);
 }
 
@@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
 	bool need_pipe_sync = false;
 	int r;
 
-	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
-	if (fence && need_pipe_sync) {
-		if (drm_sched_dependency_optimized(fence, s_entity)) {
-			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
-					      fence, false);
-			if (r)
-				DRM_ERROR("Error adding fence (%d)\n", r);
-		}
+	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
+		trace_amdgpu_ib_pipe_sync(job, fence);
+		job->need_pipe_sync = true;
 	}
 
 	while (fence == NULL && vm && !job->vmid) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index e1b46a6..c1d00f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -41,7 +41,6 @@ struct amdgpu_job {
 	struct drm_sched_job    base;
 	struct amdgpu_vm	*vm;
 	struct amdgpu_sync	sync;
-	struct amdgpu_sync	sched_sync;
 	struct amdgpu_ib	*ibs;
 	struct dma_fence	*fence; /* the hw fence */
 	uint32_t		preamble_status;
@@ -59,7 +58,7 @@ struct amdgpu_job {
 	/* user fence handling */
 	uint64_t		uf_addr;
 	uint64_t		uf_sequence;
-
+	bool            need_pipe_sync; /* require a pipeline sync for this job */
 };
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 3/3] drm/amdgpu: drop need_vm_flush/need_pipe_sync
       [not found] ` <1541223223-27020-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2018-11-03  5:33   ` [PATCH 2/3] drm/amdgpu: drop the sched_sync Monk Liu
@ 2018-11-03  5:33   ` Monk Liu
       [not found]     ` <1541223223-27020-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 19+ messages in thread
From: Monk Liu @ 2018-11-03  5:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

use a flag to hold need_flush and need_pipe_sync

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c    |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c   | 14 +++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h   |  6 ++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  4 ++--
 7 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index ac7d2da..3231a3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -170,7 +170,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
 			amdgpu_vm_need_pipeline_sync(ring, job))
 			need_pipe_sync = true;
-		else if (job->need_pipe_sync)
+		else if (job->flags & JOB_NEED_PIPE_SYNC)
 			need_pipe_sync = true;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
index df9b173..ed9b6c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
@@ -311,7 +311,9 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,
 		dma_fence_put((*id)->flushed_updates);
 		(*id)->flushed_updates = dma_fence_get(updates);
 	}
-	job->vm_needs_flush = needs_flush;
+
+	if (needs_flush)
+		job->flags |= JOB_NEED_VM_FLUSH;
 	return 0;
 }
 
@@ -341,7 +343,8 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
 	struct dma_fence *updates = sync->last_vm_update;
 	int r;
 
-	job->vm_needs_flush = vm->use_cpu_for_update;
+	if (vm->use_cpu_for_update)
+		job->flags |= JOB_NEED_VM_FLUSH;
 
 	/* Check if we can use a VMID already assigned to this VM */
 	list_for_each_entry_reverse((*id), &id_mgr->ids_lru, list) {
@@ -380,7 +383,8 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
 			(*id)->flushed_updates = dma_fence_get(updates);
 		}
 
-		job->vm_needs_flush |= needs_flush;
+		if (needs_flush)
+			job->flags |= JOB_NEED_VM_FLUSH;
 		return 0;
 	}
 
@@ -438,7 +442,7 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
 
 			dma_fence_put(id->flushed_updates);
 			id->flushed_updates = dma_fence_get(updates);
-			job->vm_needs_flush = true;
+			job->flags |= JOB_NEED_VM_FLUSH;
 		}
 
 		list_move_tail(&id->list, &id_mgr->ids_lru);
@@ -447,7 +451,7 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
 	id->pd_gpu_addr = job->vm_pd_addr;
 	id->owner = vm->entity.fence_context;
 
-	if (job->vm_needs_flush) {
+	if (job->flags & JOB_NEED_VM_FLUSH) {
 		dma_fence_put(id->last_flush);
 		id->last_flush = NULL;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index dae997d..82e190d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -181,7 +181,7 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
 
 	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
 		trace_amdgpu_ib_pipe_sync(job, fence);
-		job->need_pipe_sync = true;
+		job->flags |= JOB_NEED_PIPE_SYNC;
 	}
 
 	while (fence == NULL && vm && !job->vmid) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index c1d00f0..f9e8ecd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -37,6 +37,9 @@
 
 struct amdgpu_fence;
 
+#define JOB_NEED_VM_FLUSH       1   /* require a vm flush for this job */
+#define JOB_NEED_PIPE_SYNC      2   /* require a pipeline sync for this job */
+
 struct amdgpu_job {
 	struct drm_sched_job    base;
 	struct amdgpu_vm	*vm;
@@ -46,7 +49,6 @@ struct amdgpu_job {
 	uint32_t		preamble_status;
 	uint32_t		num_ibs;
 	void			*owner;
-	bool                    vm_needs_flush;
 	uint64_t		vm_pd_addr;
 	unsigned		vmid;
 	unsigned		pasid;
@@ -58,7 +60,7 @@ struct amdgpu_job {
 	/* user fence handling */
 	uint64_t		uf_addr;
 	uint64_t		uf_sequence;
-	bool            need_pipe_sync; /* require a pipeline sync for this job */
+	uint64_t		flags;
 };
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 626abca..4318d57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -232,7 +232,7 @@ TRACE_EVENT(amdgpu_vm_grab_id,
 			   __entry->vmid = job->vmid;
 			   __entry->vm_hub = ring->funcs->vmhub,
 			   __entry->pd_addr = job->vm_pd_addr;
-			   __entry->needs_flush = job->vm_needs_flush;
+			   __entry->needs_flush = job->flags & JOB_NEED_VM_FLUSH;
 			   ),
 	    TP_printk("pasid=%d, ring=%s, id=%u, hub=%u, pd_addr=%010Lx needs_flush=%u",
 		      __entry->pasid, __get_str(ring), __entry->vmid,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index c91ec31..9ea2861 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1989,7 +1989,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
 
 	if (vm_needs_flush) {
 		job->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gart.bo);
-		job->vm_needs_flush = true;
+		job->flags |= JOB_NEED_VM_FLUSH;
 	}
 	if (resv) {
 		r = amdgpu_sync_resv(adev, &job->sync, resv,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 352b304..dcdf9d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1038,7 +1038,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
 	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
 	struct amdgpu_vmid *id;
 	bool gds_switch_needed;
-	bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;
+	bool vm_flush_needed = (job->flags & JOB_NEED_VM_FLUSH) || ring->has_compute_vm_bug;
 
 	if (job->vmid == 0)
 		return false;
@@ -1082,7 +1082,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
 		id->gws_size != job->gws_size ||
 		id->oa_base != job->oa_base ||
 		id->oa_size != job->oa_size);
-	bool vm_flush_needed = job->vm_needs_flush;
+	bool vm_flush_needed = job->flags & JOB_NEED_VM_FLUSH;
 	bool pasid_mapping_needed = id->pasid != job->pasid ||
 		!id->pasid_mapping ||
 		!dma_fence_is_signaled(id->pasid_mapping);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: drop need_vm_flush/need_pipe_sync
       [not found]     ` <1541223223-27020-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2018-11-03 19:48       ` Christian König
  0 siblings, 0 replies; 19+ messages in thread
From: Christian König @ 2018-11-03 19:48 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 03.11.18 um 06:33 schrieb Monk Liu:
> use a flag to hold need_flush and need_pipe_sync

NAK, if you want to save space then use "bool variable:1" instead.

Flags in structures are not as self explaining as a member variable.

Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c    |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c   | 14 +++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h   |  6 ++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  4 ++--
>   7 files changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index ac7d2da..3231a3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -170,7 +170,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>   			amdgpu_vm_need_pipeline_sync(ring, job))
>   			need_pipe_sync = true;
> -		else if (job->need_pipe_sync)
> +		else if (job->flags & JOB_NEED_PIPE_SYNC)
>   			need_pipe_sync = true;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> index df9b173..ed9b6c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> @@ -311,7 +311,9 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,
>   		dma_fence_put((*id)->flushed_updates);
>   		(*id)->flushed_updates = dma_fence_get(updates);
>   	}
> -	job->vm_needs_flush = needs_flush;
> +
> +	if (needs_flush)
> +		job->flags |= JOB_NEED_VM_FLUSH;
>   	return 0;
>   }
>   
> @@ -341,7 +343,8 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
>   	struct dma_fence *updates = sync->last_vm_update;
>   	int r;
>   
> -	job->vm_needs_flush = vm->use_cpu_for_update;
> +	if (vm->use_cpu_for_update)
> +		job->flags |= JOB_NEED_VM_FLUSH;
>   
>   	/* Check if we can use a VMID already assigned to this VM */
>   	list_for_each_entry_reverse((*id), &id_mgr->ids_lru, list) {
> @@ -380,7 +383,8 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
>   			(*id)->flushed_updates = dma_fence_get(updates);
>   		}
>   
> -		job->vm_needs_flush |= needs_flush;
> +		if (needs_flush)
> +			job->flags |= JOB_NEED_VM_FLUSH;
>   		return 0;
>   	}
>   
> @@ -438,7 +442,7 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>   
>   			dma_fence_put(id->flushed_updates);
>   			id->flushed_updates = dma_fence_get(updates);
> -			job->vm_needs_flush = true;
> +			job->flags |= JOB_NEED_VM_FLUSH;
>   		}
>   
>   		list_move_tail(&id->list, &id_mgr->ids_lru);
> @@ -447,7 +451,7 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>   	id->pd_gpu_addr = job->vm_pd_addr;
>   	id->owner = vm->entity.fence_context;
>   
> -	if (job->vm_needs_flush) {
> +	if (job->flags & JOB_NEED_VM_FLUSH) {
>   		dma_fence_put(id->last_flush);
>   		id->last_flush = NULL;
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index dae997d..82e190d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -181,7 +181,7 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>   
>   	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>   		trace_amdgpu_ib_pipe_sync(job, fence);
> -		job->need_pipe_sync = true;
> +		job->flags |= JOB_NEED_PIPE_SYNC;
>   	}
>   
>   	while (fence == NULL && vm && !job->vmid) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index c1d00f0..f9e8ecd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -37,6 +37,9 @@
>   
>   struct amdgpu_fence;
>   
> +#define JOB_NEED_VM_FLUSH       1   /* require a vm flush for this job */
> +#define JOB_NEED_PIPE_SYNC      2   /* require a pipeline sync for this job */
> +
>   struct amdgpu_job {
>   	struct drm_sched_job    base;
>   	struct amdgpu_vm	*vm;
> @@ -46,7 +49,6 @@ struct amdgpu_job {
>   	uint32_t		preamble_status;
>   	uint32_t		num_ibs;
>   	void			*owner;
> -	bool                    vm_needs_flush;
>   	uint64_t		vm_pd_addr;
>   	unsigned		vmid;
>   	unsigned		pasid;
> @@ -58,7 +60,7 @@ struct amdgpu_job {
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
> -	bool            need_pipe_sync; /* require a pipeline sync for this job */
> +	uint64_t		flags;
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> index 626abca..4318d57 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> @@ -232,7 +232,7 @@ TRACE_EVENT(amdgpu_vm_grab_id,
>   			   __entry->vmid = job->vmid;
>   			   __entry->vm_hub = ring->funcs->vmhub,
>   			   __entry->pd_addr = job->vm_pd_addr;
> -			   __entry->needs_flush = job->vm_needs_flush;
> +			   __entry->needs_flush = job->flags & JOB_NEED_VM_FLUSH;
>   			   ),
>   	    TP_printk("pasid=%d, ring=%s, id=%u, hub=%u, pd_addr=%010Lx needs_flush=%u",
>   		      __entry->pasid, __get_str(ring), __entry->vmid,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index c91ec31..9ea2861 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -1989,7 +1989,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
>   
>   	if (vm_needs_flush) {
>   		job->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gart.bo);
> -		job->vm_needs_flush = true;
> +		job->flags |= JOB_NEED_VM_FLUSH;
>   	}
>   	if (resv) {
>   		r = amdgpu_sync_resv(adev, &job->sync, resv,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 352b304..dcdf9d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1038,7 +1038,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>   	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>   	struct amdgpu_vmid *id;
>   	bool gds_switch_needed;
> -	bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;
> +	bool vm_flush_needed = (job->flags & JOB_NEED_VM_FLUSH) || ring->has_compute_vm_bug;
>   
>   	if (job->vmid == 0)
>   		return false;
> @@ -1082,7 +1082,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
>   		id->gws_size != job->gws_size ||
>   		id->oa_base != job->oa_base ||
>   		id->oa_size != job->oa_size);
> -	bool vm_flush_needed = job->vm_needs_flush;
> +	bool vm_flush_needed = job->flags & JOB_NEED_VM_FLUSH;
>   	bool pasid_mapping_needed = id->pasid != job->pasid ||
>   		!id->pasid_mapping ||
>   		!dma_fence_is_signaled(id->pasid_mapping);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]     ` <1541223223-27020-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2018-11-03 19:49       ` Christian König
       [not found]         ` <251ea9fd-910f-24aa-343c-a87b5039b4dd-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Christian König @ 2018-11-03 19:49 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 03.11.18 um 06:33 schrieb Monk Liu:
> Reasons to drop it:
>
> 1) simplify the code: just introduce field member "need_pipe_sync"
> for job is good enough to tell if the explicit dependency fence
> need followed by a pipeline sync.
>
> 2) after GPU_recover the explicit fence from sched_syn will not
> come back so the required pipeline_sync following it is missed,
> consider scenario below:
>> now on ring buffer:
> Job-A -> pipe_sync -> Job-B
>> TDR occured on Job-A, and after GPU recover:
>> now on ring buffer:
> Job-A -> Job-B
>
> because the fence from sched_sync is used and freed after ib_schedule
> in first time, it will never come back, with this patch this issue
> could be avoided.

NAK, that would result in a severe performance drop.

We need the fence here to determine if we actually need to do the 
pipeline sync or not.

E.g. the explicit requested fence could already be signaled.

Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>   3 files changed, 10 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index c48207b3..ac7d2da 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	struct amdgpu_ib *ib = &ibs[0];
> -	struct dma_fence *tmp = NULL;
>   	bool skip_preamble, need_ctx_switch;
>   	unsigned patch_offset = ~0;
>   	struct amdgpu_vm *vm;
> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	}
>   
>   	need_ctx_switch = ring->current_ctx != fence_ctx;
> -	if (ring->funcs->emit_pipeline_sync && job &&
> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
> -		need_pipe_sync = true;
>   
> -		if (tmp)
> -			trace_amdgpu_ib_pipe_sync(job, tmp);
> -
> -		dma_fence_put(tmp);
> +	if (ring->funcs->emit_pipeline_sync && job) {
> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
> +			amdgpu_vm_need_pipeline_sync(ring, job))
> +			need_pipe_sync = true;
> +		else if (job->need_pipe_sync)
> +			need_pipe_sync = true;
>   	}
>   
>   	if (ring->funcs->insert_start)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 1d71f8c..dae997d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>   	(*job)->num_ibs = num_ibs;
>   
>   	amdgpu_sync_create(&(*job)->sync);
> -	amdgpu_sync_create(&(*job)->sched_sync);
>   	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>   	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>   
> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>   	amdgpu_ring_priority_put(ring, s_job->s_priority);
>   	dma_fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
> -	amdgpu_sync_free(&job->sched_sync);
>   	kfree(job);
>   }
>   
> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>   
>   	dma_fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
> -	amdgpu_sync_free(&job->sched_sync);
>   	kfree(job);
>   }
>   
> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>   	bool need_pipe_sync = false;
>   	int r;
>   
> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
> -	if (fence && need_pipe_sync) {
> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
> -					      fence, false);
> -			if (r)
> -				DRM_ERROR("Error adding fence (%d)\n", r);
> -		}
> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
> +		trace_amdgpu_ib_pipe_sync(job, fence);
> +		job->need_pipe_sync = true;
>   	}
>   
>   	while (fence == NULL && vm && !job->vmid) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index e1b46a6..c1d00f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -41,7 +41,6 @@ struct amdgpu_job {
>   	struct drm_sched_job    base;
>   	struct amdgpu_vm	*vm;
>   	struct amdgpu_sync	sync;
> -	struct amdgpu_sync	sched_sync;
>   	struct amdgpu_ib	*ibs;
>   	struct dma_fence	*fence; /* the hw fence */
>   	uint32_t		preamble_status;
> @@ -59,7 +58,7 @@ struct amdgpu_job {
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
> -
> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]         ` <251ea9fd-910f-24aa-343c-a87b5039b4dd-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-11-04  0:21           ` Liu, Monk
  2018-11-04  0:48           ` Liu, Monk
  1 sibling, 0 replies; 19+ messages in thread
From: Liu, Monk @ 2018-11-04  0:21 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

How do you solve the missing pipe-sync bug I illustrated? Actually the original logic is loose;
I found this corner case during a massive TDR test a couple of weeks ago, and it is very hard to catch ...

/Monk

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Sunday, November 4, 2018 3:50 AM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync

Am 03.11.18 um 06:33 schrieb Monk Liu:
> Reasons to drop it:
>
> 1) simplify the code: just introduce field member "need_pipe_sync"
> for job is good enough to tell if the explicit dependency fence need 
> followed by a pipeline sync.
>
> 2) after GPU_recover the explicit fence from sched_syn will not come 
> back so the required pipeline_sync following it is missed, consider 
> scenario below:
>> now on ring buffer:
> Job-A -> pipe_sync -> Job-B
>> TDR occured on Job-A, and after GPU recover:
>> now on ring buffer:
> Job-A -> Job-B
>
> because the fence from sched_sync is used and freed after ib_schedule 
> in first time, it will never come back, with this patch this issue 
> could be avoided.

NAK, that would result in a severe performance drop.

We need the fence here to determine if we actually need to do the pipeline sync or not.

E.g. the explicit requested fence could already be signaled.

Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>   3 files changed, 10 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index c48207b3..ac7d2da 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	struct amdgpu_ib *ib = &ibs[0];
> -	struct dma_fence *tmp = NULL;
>   	bool skip_preamble, need_ctx_switch;
>   	unsigned patch_offset = ~0;
>   	struct amdgpu_vm *vm;
> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	}
>   
>   	need_ctx_switch = ring->current_ctx != fence_ctx;
> -	if (ring->funcs->emit_pipeline_sync && job &&
> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
> -		need_pipe_sync = true;
>   
> -		if (tmp)
> -			trace_amdgpu_ib_pipe_sync(job, tmp);
> -
> -		dma_fence_put(tmp);
> +	if (ring->funcs->emit_pipeline_sync && job) {
> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
> +			amdgpu_vm_need_pipeline_sync(ring, job))
> +			need_pipe_sync = true;
> +		else if (job->need_pipe_sync)
> +			need_pipe_sync = true;
>   	}
>   
>   	if (ring->funcs->insert_start)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 1d71f8c..dae997d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>   	(*job)->num_ibs = num_ibs;
>   
>   	amdgpu_sync_create(&(*job)->sync);
> -	amdgpu_sync_create(&(*job)->sched_sync);
>   	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>   	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>   
> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>   	amdgpu_ring_priority_put(ring, s_job->s_priority);
>   	dma_fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
> -	amdgpu_sync_free(&job->sched_sync);
>   	kfree(job);
>   }
>   
> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>   
>   	dma_fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
> -	amdgpu_sync_free(&job->sched_sync);
>   	kfree(job);
>   }
>   
> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>   	bool need_pipe_sync = false;
>   	int r;
>   
> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
> -	if (fence && need_pipe_sync) {
> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
> -					      fence, false);
> -			if (r)
> -				DRM_ERROR("Error adding fence (%d)\n", r);
> -		}
> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
> +		trace_amdgpu_ib_pipe_sync(job, fence);
> +		job->need_pipe_sync = true;
>   	}
>   
>   	while (fence == NULL && vm && !job->vmid) { diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index e1b46a6..c1d00f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -41,7 +41,6 @@ struct amdgpu_job {
>   	struct drm_sched_job    base;
>   	struct amdgpu_vm	*vm;
>   	struct amdgpu_sync	sync;
> -	struct amdgpu_sync	sched_sync;
>   	struct amdgpu_ib	*ibs;
>   	struct dma_fence	*fence; /* the hw fence */
>   	uint32_t		preamble_status;
> @@ -59,7 +58,7 @@ struct amdgpu_job {
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
> -
> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]         ` <251ea9fd-910f-24aa-343c-a87b5039b4dd-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2018-11-04  0:21           ` Liu, Monk
@ 2018-11-04  0:48           ` Liu, Monk
       [not found]             ` <BN6PR1201MB024149B0B50D889C4CEF655984C90-6iU6OBHu2P+jgkzTOKc/hWrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  1 sibling, 1 reply; 19+ messages in thread
From: Liu, Monk @ 2018-11-04  0:48 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> NAK, that would result in a severe performance drop.
> We need the fence here to determine if we actually need to do the pipeline sync or not.
>E.g. the explicit requested fence could already be signaled.

For the performance issue, only inserting a WAIT_REG_MEM on the GFX/compute ring *doesn't* give the "severe" drop (it's minimal in fact) ...  At least I didn't observe any performance drop with the 3dmark benchmark (also tested Vulkan CTS). Can you tell me which game/benchmark will have a performance drop with this fix, by your understanding? Let me check it.

The problem I hit is during the massive stress test against multi-process + quark: if the quark process hangs the engine while there are another two jobs following the bad job,
after the TDR these two jobs will lose the explicit dependency and the pipeline sync is also lost.


BTW: for the original logic, the pipeline sync has another corner case:
Assume JobC depends on JobA with the explicit flag, and there is a JobB inserted in the ring:

jobA -> jobB -> (pipe sync)JobC

if JobA really costs a lot of time to finish, then in the amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency, which is JobA,
but there is a JobB between A and C, and the pipeline sync before JobC will wrongly wait on JobB ...

while it is not a big issue, it is obviously not necessary: C has no relation with B

/Monk



-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Sunday, November 4, 2018 3:50 AM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync

Am 03.11.18 um 06:33 schrieb Monk Liu:
> Reasons to drop it:
>
> 1) simplify the code: just introduce field member "need_pipe_sync"
> for job is good enough to tell if the explicit dependency fence need 
> followed by a pipeline sync.
>
> 2) after GPU_recover the explicit fence from sched_syn will not come 
> back so the required pipeline_sync following it is missed, consider 
> scenario below:
>> now on ring buffer:
> Job-A -> pipe_sync -> Job-B
>> TDR occured on Job-A, and after GPU recover:
>> now on ring buffer:
> Job-A -> Job-B
>
> because the fence from sched_sync is used and freed after ib_schedule 
> in first time, it will never come back, with this patch this issue 
> could be avoided.

NAK, that would result in a severe performance drop.

We need the fence here to determine if we actually need to do the pipeline sync or not.

E.g. the explicit requested fence could already be signaled.

Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>   3 files changed, 10 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index c48207b3..ac7d2da 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	struct amdgpu_ib *ib = &ibs[0];
> -	struct dma_fence *tmp = NULL;
>   	bool skip_preamble, need_ctx_switch;
>   	unsigned patch_offset = ~0;
>   	struct amdgpu_vm *vm;
> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	}
>   
>   	need_ctx_switch = ring->current_ctx != fence_ctx;
> -	if (ring->funcs->emit_pipeline_sync && job &&
> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
> -		need_pipe_sync = true;
>   
> -		if (tmp)
> -			trace_amdgpu_ib_pipe_sync(job, tmp);
> -
> -		dma_fence_put(tmp);
> +	if (ring->funcs->emit_pipeline_sync && job) {
> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
> +			amdgpu_vm_need_pipeline_sync(ring, job))
> +			need_pipe_sync = true;
> +		else if (job->need_pipe_sync)
> +			need_pipe_sync = true;
>   	}
>   
>   	if (ring->funcs->insert_start)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 1d71f8c..dae997d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>   	(*job)->num_ibs = num_ibs;
>   
>   	amdgpu_sync_create(&(*job)->sync);
> -	amdgpu_sync_create(&(*job)->sched_sync);
>   	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>   	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>   
> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>   	amdgpu_ring_priority_put(ring, s_job->s_priority);
>   	dma_fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
> -	amdgpu_sync_free(&job->sched_sync);
>   	kfree(job);
>   }
>   
> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>   
>   	dma_fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
> -	amdgpu_sync_free(&job->sched_sync);
>   	kfree(job);
>   }
>   
> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>   	bool need_pipe_sync = false;
>   	int r;
>   
> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
> -	if (fence && need_pipe_sync) {
> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
> -					      fence, false);
> -			if (r)
> -				DRM_ERROR("Error adding fence (%d)\n", r);
> -		}
> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
> +		trace_amdgpu_ib_pipe_sync(job, fence);
> +		job->need_pipe_sync = true;
>   	}
>   
>   	while (fence == NULL && vm && !job->vmid) { diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index e1b46a6..c1d00f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -41,7 +41,6 @@ struct amdgpu_job {
>   	struct drm_sched_job    base;
>   	struct amdgpu_vm	*vm;
>   	struct amdgpu_sync	sync;
> -	struct amdgpu_sync	sched_sync;
>   	struct amdgpu_ib	*ibs;
>   	struct dma_fence	*fence; /* the hw fence */
>   	uint32_t		preamble_status;
> @@ -59,7 +58,7 @@ struct amdgpu_job {
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
> -
> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]             ` <BN6PR1201MB024149B0B50D889C4CEF655984C90-6iU6OBHu2P+jgkzTOKc/hWrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2018-11-04 19:02               ` Koenig, Christian
       [not found]                 ` <0abab90d-dd54-290d-a1cf-38047ec66b9f-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Koenig, Christian @ 2018-11-04 19:02 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
When you sync between submissions, things like composing X windows are
slowed down massively.

David Zhou had a use case which saw a >10% performance drop the last
time he tried it.

> The problem I hit is during the massive stress test against multi-process + quark , if the quark process hang the engine while there is another two job following the bad job,
> After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
Well that is really strange. This workaround is only for a very specific 
Vulkan CTS test which we are still not 100% sure is actually valid.

When a reset happens we flush the VMIDs when re-submitting the jobs to 
the rings and while doing so we also always do a pipeline sync.

So you should never ever run into any issues in quark with that, even 
when we completely disable this workaround.

Regards,
Christian.

Am 04.11.18 um 01:48 schrieb Liu, Monk:
>> NAK, that would result in a severe performance drop.
>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>> E.g. the explicit requested fence could already be signaled.
> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>
> The problem I hit is during the massive stress test against multi-process + quark , if the quark process hang the engine while there is another two job following the bad job,
> After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>
>
> BTW: for original logic, the pipeline sync have another corner case:
> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>
> jobA -> jobB -> (pipe sync)JobC
>
> if JobA really cost a lot of time to finish, in the amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA,
> but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>
> while it is not a big issue but obviously not necessary: C have no relation with B
>
> /Monk
>
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Sunday, November 4, 2018 3:50 AM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
> Am 03.11.18 um 06:33 schrieb Monk Liu:
>> Reasons to drop it:
>>
>> 1) simplify the code: just introduce field member "need_pipe_sync"
>> for job is good enough to tell if the explicit dependency fence need
>> followed by a pipeline sync.
>>
>> 2) after GPU_recover the explicit fence from sched_syn will not come
>> back so the required pipeline_sync following it is missed, consider
>> scenario below:
>>> now on ring buffer:
>> Job-A -> pipe_sync -> Job-B
>>> TDR occured on Job-A, and after GPU recover:
>>> now on ring buffer:
>> Job-A -> Job-B
>>
>> because the fence from sched_sync is used and freed after ib_schedule
>> in first time, it will never come back, with this patch this issue
>> could be avoided.
> NAK, that would result in a severe performance drop.
>
> We need the fence here to determine if we actually need to do the pipeline sync or not.
>
> E.g. the explicit requested fence could already be signaled.
>
> Christian.
>
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>    3 files changed, 10 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index c48207b3..ac7d2da 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>>    	struct amdgpu_ib *ib = &ibs[0];
>> -	struct dma_fence *tmp = NULL;
>>    	bool skip_preamble, need_ctx_switch;
>>    	unsigned patch_offset = ~0;
>>    	struct amdgpu_vm *vm;
>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>    	}
>>    
>>    	need_ctx_switch = ring->current_ctx != fence_ctx;
>> -	if (ring->funcs->emit_pipeline_sync && job &&
>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>> -		need_pipe_sync = true;
>>    
>> -		if (tmp)
>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>> -
>> -		dma_fence_put(tmp);
>> +	if (ring->funcs->emit_pipeline_sync && job) {
>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>> +			need_pipe_sync = true;
>> +		else if (job->need_pipe_sync)
>> +			need_pipe_sync = true;
>>    	}
>>    
>>    	if (ring->funcs->insert_start)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 1d71f8c..dae997d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>    	(*job)->num_ibs = num_ibs;
>>    
>>    	amdgpu_sync_create(&(*job)->sync);
>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>    	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>    	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>    
>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>    	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>    	dma_fence_put(job->fence);
>>    	amdgpu_sync_free(&job->sync);
>> -	amdgpu_sync_free(&job->sched_sync);
>>    	kfree(job);
>>    }
>>    
>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>    
>>    	dma_fence_put(job->fence);
>>    	amdgpu_sync_free(&job->sync);
>> -	amdgpu_sync_free(&job->sched_sync);
>>    	kfree(job);
>>    }
>>    
>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>    	bool need_pipe_sync = false;
>>    	int r;
>>    
>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>> -	if (fence && need_pipe_sync) {
>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>> -					      fence, false);
>> -			if (r)
>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>> -		}
>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>> +		job->need_pipe_sync = true;
>>    	}
>>    
>>    	while (fence == NULL && vm && !job->vmid) { diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index e1b46a6..c1d00f0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>    	struct drm_sched_job    base;
>>    	struct amdgpu_vm	*vm;
>>    	struct amdgpu_sync	sync;
>> -	struct amdgpu_sync	sched_sync;
>>    	struct amdgpu_ib	*ibs;
>>    	struct dma_fence	*fence; /* the hw fence */
>>    	uint32_t		preamble_status;
>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>    	/* user fence handling */
>>    	uint64_t		uf_addr;
>>    	uint64_t		uf_sequence;
>> -
>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>    };
>>    
>>    int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                 ` <0abab90d-dd54-290d-a1cf-38047ec66b9f-5C7GfCeVMHo@public.gmane.org>
@ 2018-11-05  7:24                   ` Liu, Monk
       [not found]                     ` <CY4PR1201MB02455363976262BBDC58415484CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Liu, Monk @ 2018-11-05  7:24 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> David Zhou had an use case which saw a >10% performance drop the last time he tried it.

I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only costs the GPU a couple of clocks to move on, right? There is no reason to slow down by up to 10% ... With the 3dmark Vulkan version test, the performance is barely different ... with my patch applied ...

> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.

I will check that point in my branch; I didn't use drm-next, so maybe there is a gap in this part.

/Monk
-----Original Message-----
From: Koenig, Christian 
Sent: Monday, November 5, 2018 3:02 AM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync

> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
When you sync between submission things like composing X windows are slowed down massively.

David Zhou had an use case which saw a >10% performance drop the last time he tried it.

> The problem I hit is during the massive stress test against 
> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.

When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.

So you should never ever run into any issues in quark with that, even when we completely disable this workaround.

Regards,
Christian.

Am 04.11.18 um 01:48 schrieb Liu, Monk:
>> NAK, that would result in a severe performance drop.
>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>> E.g. the explicit requested fence could already be signaled.
> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>
> The problem I hit is during the massive stress test against 
> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>
>
> BTW: for original logic, the pipeline sync have another corner case:
> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>
> jobA -> jobB -> (pipe sync)JobC
>
> if JobA really cost a lot of time to finish, in the 
> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>
> while it is not a big issue but obviously not necessary: C have no 
> relation with B
>
> /Monk
>
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Sunday, November 4, 2018 3:50 AM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
> Am 03.11.18 um 06:33 schrieb Monk Liu:
>> Reasons to drop it:
>>
>> 1) simplify the code: just introduce field member "need_pipe_sync"
>> for job is good enough to tell if the explicit dependency fence need 
>> followed by a pipeline sync.
>>
>> 2) after GPU_recover the explicit fence from sched_syn will not come 
>> back so the required pipeline_sync following it is missed, consider 
>> scenario below:
>>> now on ring buffer:
>> Job-A -> pipe_sync -> Job-B
>>> TDR occured on Job-A, and after GPU recover:
>>> now on ring buffer:
>> Job-A -> Job-B
>>
>> because the fence from sched_sync is used and freed after ib_schedule 
>> in first time, it will never come back, with this patch this issue 
>> could be avoided.
> NAK, that would result in a severe performance drop.
>
> We need the fence here to determine if we actually need to do the pipeline sync or not.
>
> E.g. the explicit requested fence could already be signaled.
>
> Christian.
>
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>    3 files changed, 10 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index c48207b3..ac7d2da 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>>    	struct amdgpu_ib *ib = &ibs[0];
>> -	struct dma_fence *tmp = NULL;
>>    	bool skip_preamble, need_ctx_switch;
>>    	unsigned patch_offset = ~0;
>>    	struct amdgpu_vm *vm;
>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>    	}
>>    
>>    	need_ctx_switch = ring->current_ctx != fence_ctx;
>> -	if (ring->funcs->emit_pipeline_sync && job &&
>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>> -		need_pipe_sync = true;
>>    
>> -		if (tmp)
>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>> -
>> -		dma_fence_put(tmp);
>> +	if (ring->funcs->emit_pipeline_sync && job) {
>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>> +			need_pipe_sync = true;
>> +		else if (job->need_pipe_sync)
>> +			need_pipe_sync = true;
>>    	}
>>    
>>    	if (ring->funcs->insert_start)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 1d71f8c..dae997d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>    	(*job)->num_ibs = num_ibs;
>>    
>>    	amdgpu_sync_create(&(*job)->sync);
>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>    	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>    	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>    
>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>    	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>    	dma_fence_put(job->fence);
>>    	amdgpu_sync_free(&job->sync);
>> -	amdgpu_sync_free(&job->sched_sync);
>>    	kfree(job);
>>    }
>>    
>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>    
>>    	dma_fence_put(job->fence);
>>    	amdgpu_sync_free(&job->sync);
>> -	amdgpu_sync_free(&job->sched_sync);
>>    	kfree(job);
>>    }
>>    
>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>    	bool need_pipe_sync = false;
>>    	int r;
>>    
>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>> -	if (fence && need_pipe_sync) {
>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>> -					      fence, false);
>> -			if (r)
>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>> -		}
>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>> +		job->need_pipe_sync = true;
>>    	}
>>    
>>    	while (fence == NULL && vm && !job->vmid) { diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index e1b46a6..c1d00f0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>    	struct drm_sched_job    base;
>>    	struct amdgpu_vm	*vm;
>>    	struct amdgpu_sync	sync;
>> -	struct amdgpu_sync	sched_sync;
>>    	struct amdgpu_ib	*ibs;
>>    	struct dma_fence	*fence; /* the hw fence */
>>    	uint32_t		preamble_status;
>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>    	/* user fence handling */
>>    	uint64_t		uf_addr;
>>    	uint64_t		uf_sequence;
>> -
>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>    };
>>    
>>    int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                     ` <CY4PR1201MB02455363976262BBDC58415484CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2018-11-05  7:47                       ` Koenig, Christian
       [not found]                         ` <9e5d6f91-9e20-8dcc-953f-e1e980b9e0d3-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Koenig, Christian @ 2018-11-05  7:47 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhou,
	David(ChunMing)

Am 05.11.18 um 08:24 schrieb Liu, Monk:
>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
> I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only cost GPU couple clocks to move  on, right ? no reason to slow down up to 10% ... with 3dmark vulkan version test, the performance is barely different ... with my patch applied ...

Why do you think that we insert a WAIT_MEM on an already signaled fence? 
The pipeline sync always waits for the last fence value (because we can't 
handle wraparounds in PM4).

So you have a pipeline sync when you don't need one and that is really 
really bad for things shared between processes, e.g. X/Wayland and its 
clients.

I also expect that this doesn't affect 3dmark at all, but everything 
which runs in a window which is composed by X could be slowed down 
massively.

David do you remember which use case was affected when you tried to drop 
this optimization?

>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
> I will check that point in my branch, I didn't use drm-next, maybe there is gap in this part

We had that logic for a very long time now, but we recently simplified 
it. Could be that there was a bug introduced doing so.

Maybe we should add a specific flag to run_job() to note that we are 
re-running a job and then always add VM flushes/pipeline syncs?

But my main question is why do you see any issues with quark? That is a 
workaround for an issue for Vulkan sync handling and should only surface 
when a specific test is run many many times.

Regards,
Christian.

>
> /Monk
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 5, 2018 3:02 AM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
>> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
> When you sync between submission things like composing X windows are slowed down massively.
>
> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>
>> The problem I hit is during the massive stress test against
>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
> Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.
>
> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>
> So you should never ever run into any issues in quark with that, even when we completely disable this workaround.
>
> Regards,
> Christian.
>
> Am 04.11.18 um 01:48 schrieb Liu, Monk:
>>> NAK, that would result in a severe performance drop.
>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>> E.g. the explicit requested fence could already be signaled.
>> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>>
>> The problem I hit is during the massive stress test against
>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>>
>>
>> BTW: for original logic, the pipeline sync have another corner case:
>> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>>
>> jobA -> jobB -> (pipe sync)JobC
>>
>> if JobA really cost a lot of time to finish, in the
>> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>>
>> while it is not a big issue but obviously not necessary: C have no
>> relation with B
>>
>> /Monk
>>
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Sunday, November 4, 2018 3:50 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>
>> Am 03.11.18 um 06:33 schrieb Monk Liu:
>>> Reasons to drop it:
>>>
>>> 1) simplify the code: just introduce field member "need_pipe_sync"
>>> for job is good enough to tell if the explicit dependency fence need
>>> followed by a pipeline sync.
>>>
>>> 2) after GPU_recover the explicit fence from sched_syn will not come
>>> back so the required pipeline_sync following it is missed, consider
>>> scenario below:
>>>> now on ring buffer:
>>> Job-A -> pipe_sync -> Job-B
>>>> TDR occured on Job-A, and after GPU recover:
>>>> now on ring buffer:
>>> Job-A -> Job-B
>>>
>>> because the fence from sched_sync is used and freed after ib_schedule
>>> in first time, it will never come back, with this patch this issue
>>> could be avoided.
>> NAK, that would result in a severe performance drop.
>>
>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>
>> E.g. the explicit requested fence could already be signaled.
>>
>> Christian.
>>
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>>     3 files changed, 10 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index c48207b3..ac7d2da 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>>     	struct amdgpu_ib *ib = &ibs[0];
>>> -	struct dma_fence *tmp = NULL;
>>>     	bool skip_preamble, need_ctx_switch;
>>>     	unsigned patch_offset = ~0;
>>>     	struct amdgpu_vm *vm;
>>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>     	}
>>>     
>>>     	need_ctx_switch = ring->current_ctx != fence_ctx;
>>> -	if (ring->funcs->emit_pipeline_sync && job &&
>>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>>> -		need_pipe_sync = true;
>>>     
>>> -		if (tmp)
>>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>>> -
>>> -		dma_fence_put(tmp);
>>> +	if (ring->funcs->emit_pipeline_sync && job) {
>>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>>> +			need_pipe_sync = true;
>>> +		else if (job->need_pipe_sync)
>>> +			need_pipe_sync = true;
>>>     	}
>>>     
>>>     	if (ring->funcs->insert_start)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 1d71f8c..dae997d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>     	(*job)->num_ibs = num_ibs;
>>>     
>>>     	amdgpu_sync_create(&(*job)->sync);
>>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>>     	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>     	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>>     
>>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>     	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>     	dma_fence_put(job->fence);
>>>     	amdgpu_sync_free(&job->sync);
>>> -	amdgpu_sync_free(&job->sched_sync);
>>>     	kfree(job);
>>>     }
>>>     
>>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>>     
>>>     	dma_fence_put(job->fence);
>>>     	amdgpu_sync_free(&job->sync);
>>> -	amdgpu_sync_free(&job->sched_sync);
>>>     	kfree(job);
>>>     }
>>>     
>>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>>     	bool need_pipe_sync = false;
>>>     	int r;
>>>     
>>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>>> -	if (fence && need_pipe_sync) {
>>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>>> -					      fence, false);
>>> -			if (r)
>>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>>> -		}
>>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>>> +		job->need_pipe_sync = true;
>>>     	}
>>>     
>>>     	while (fence == NULL && vm && !job->vmid) { diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index e1b46a6..c1d00f0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>>     	struct drm_sched_job    base;
>>>     	struct amdgpu_vm	*vm;
>>>     	struct amdgpu_sync	sync;
>>> -	struct amdgpu_sync	sched_sync;
>>>     	struct amdgpu_ib	*ibs;
>>>     	struct dma_fence	*fence; /* the hw fence */
>>>     	uint32_t		preamble_status;
>>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>>     	/* user fence handling */
>>>     	uint64_t		uf_addr;
>>>     	uint64_t		uf_sequence;
>>> -
>>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>>     };
>>>     
>>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                         ` <9e5d6f91-9e20-8dcc-953f-e1e980b9e0d3-5C7GfCeVMHo@public.gmane.org>
@ 2018-11-05  7:50                           ` Zhou, David(ChunMing)
       [not found]                             ` <BY1PR12MB0502C523B2FA785240D12E36B4CA0-PicGAnIBOobrCwm+z9iKNgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2018-11-05  8:58                           ` Liu, Monk
  2018-11-05 13:41                           ` Liu, Monk
  2 siblings, 1 reply; 19+ messages in thread
From: Zhou, David(ChunMing) @ 2018-11-05  7:50 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 05, 2018 3:48 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou,
> David(ChunMing) <David1.Zhou@amd.com>
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
> 
> Am 05.11.18 um 08:24 schrieb Liu, Monk:
> >> David Zhou had an use case which saw a >10% performance drop the last
> time he tried it.
> > I really don't believe that, because if you insert a WAIT_MEM on an already
> signaled fence, it only cost GPU couple clocks to move  on, right ? no reason
> to slow down up to 10% ... with 3dmark vulkan version test, the performance
> is barely different ... with my patch applied ...
> 
> Why do you think that we insert a WAIT_MEM on an already signaled fence?
> The pipeline sync always wait for the last fence value (because we can't
> handle wraparounds in PM4).
> 
> So you have a pipeline sync when you don't need one and that is really really
> bad for things shared between processes, e.g. X/Wayland and it's clients.
> 
> I also expects that this doesn't effect 3dmark at all, but everything which runs
> in a window which is composed by X could be slowed down massively.
> 
> David do you remember which use case was affected when you tried to drop
> this optimization?
That was a long time ago; I remember Andrey also tried to remove sched_sync before, but he eventually kept it, right?
From Monk's patch, it seems he doesn't change the main logic; he just moved the sched_sync logic to job->need_pipe_sync.
But at least I can see a slight effect: e.g. the job processing adds the fence to sched_sync, but the fence could already be signaled by the time amdgpu_ib_schedule runs, so no pipeline sync needs to be inserted.

Anyway, this is a sensitive path; we should change it carefully and give it wide testing.

Regards,
David Zhou
> 
> >> When a reset happens we flush the VMIDs when re-submitting the jobs
> to the rings and while doing so we also always do a pipeline sync.
> > I will check that point in my branch, I didn't use drm-next, maybe
> > there is gap in this part
> 
> We had that logic for a very long time now, but we recently simplified it.
> Could be that there was a bug introduced doing so.
> 
> Maybe we should add a specific flag to run_job() to note that we are re-
> running a job and then always add VM flushes/pipeline syncs?
> 
> But my main question is why do you see any issues with quark? That is a
> workaround for an issue for Vulkan sync handling and should only surface
> when a specific test is run many many times.
> 
> Regards,
> Christian.
> 
> >
> > /Monk
> > -----Original Message-----
> > From: Koenig, Christian
> > Sent: Monday, November 5, 2018 3:02 AM
> > To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> > Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
> >
> >> Can you tell me which game/benchmark will have performance drop with
> this fix by your understanding ?
> > When you sync between submission things like composing X windows are
> slowed down massively.
> >
> > David Zhou had an use case which saw a >10% performance drop the last
> time he tried it.
> >
> >> The problem I hit is during the massive stress test against
> >> multi-process + quark , if the quark process hang the engine while there is
> another two job following the bad job, After the TDR these two job will lose
> the explicit and the pipeline-sync was also lost.
> > Well that is really strange. This workaround is only for a very specific Vulkan
> CTS test which we are still not 100% sure is actually valid.
> >
> > When a reset happens we flush the VMIDs when re-submitting the jobs to
> the rings and while doing so we also always do a pipeline sync.
> >
> > So you should never ever run into any issues in quark with that, even when
> we completely disable this workaround.
> >
> > Regards,
> > Christian.
> >
> > Am 04.11.18 um 01:48 schrieb Liu, Monk:
> >>> NAK, that would result in a severe performance drop.
> >>> We need the fence here to determine if we actually need to do the
> pipeline sync or not.
> >>> E.g. the explicit requested fence could already be signaled.
> >> For the performance issue, only insert a WAIT_REG_MEM on
> GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At
> least I didn't observe any performance drop with 3dmark benchmark (also
> tested vulkan CTS), Can you tell me which game/benchmark will have
> performance drop with this fix by your understanding ? let me check it .
> >>
> >> The problem I hit is during the massive stress test against
> >> multi-process + quark , if the quark process hang the engine while there is
> another two job following the bad job, After the TDR these two job will lose
> the explicit and the pipeline-sync was also lost.
> >>
> >>
> >> BTW: for original logic, the pipeline sync have another corner case:
> >> Assume JobC depend on JobA with explicit flag, and there is jobB inserted
> in ring:
> >>
> >> jobA -> jobB -> (pipe sync)JobC
> >>
> >> if JobA really cost a lot of time to finish, in the
> >> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC
> against its explicit dependency which is JobA, but there is a JobB between A
> and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
> >>
> >> while it is not a big issue but obviously not necessary: C have no
> >> relation with B
> >>
> >> /Monk
> >>
> >>
> >>
> >> -----Original Message-----
> >> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> >> Sent: Sunday, November 4, 2018 3:50 AM
> >> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> >> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
> >>
> >> Am 03.11.18 um 06:33 schrieb Monk Liu:
> >>> Reasons to drop it:
> >>>
> >>> 1) simplify the code: just introduce field member "need_pipe_sync"
> >>> for job is good enough to tell if the explicit dependency fence need
> >>> followed by a pipeline sync.
> >>>
> >>> 2) after GPU_recover the explicit fence from sched_syn will not come
> >>> back so the required pipeline_sync following it is missed, consider
> >>> scenario below:
> >>>> now on ring buffer:
> >>> Job-A -> pipe_sync -> Job-B
> >>>> TDR occured on Job-A, and after GPU recover:
> >>>> now on ring buffer:
> >>> Job-A -> Job-B
> >>>
> >>> because the fence from sched_sync is used and freed after
> >>> ib_schedule in first time, it will never come back, with this patch
> >>> this issue could be avoided.
> >> NAK, that would result in a severe performance drop.
> >>
> >> We need the fence here to determine if we actually need to do the
> pipeline sync or not.
> >>
> >> E.g. the explicit requested fence could already be signaled.
> >>
> >> Christian.
> >>
> >>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> >>> ---
> >>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
> >>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
> >>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
> >>>     3 files changed, 10 insertions(+), 23 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >>> index c48207b3..ac7d2da 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring
> *ring, unsigned num_ibs,
> >>>     {
> >>>     	struct amdgpu_device *adev = ring->adev;
> >>>     	struct amdgpu_ib *ib = &ibs[0];
> >>> -	struct dma_fence *tmp = NULL;
> >>>     	bool skip_preamble, need_ctx_switch;
> >>>     	unsigned patch_offset = ~0;
> >>>     	struct amdgpu_vm *vm;
> >>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring
> *ring, unsigned num_ibs,
> >>>     	}
> >>>
> >>>     	need_ctx_switch = ring->current_ctx != fence_ctx;
> >>> -	if (ring->funcs->emit_pipeline_sync && job &&
> >>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
> >>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
> >>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
> >>> -		need_pipe_sync = true;
> >>>
> >>> -		if (tmp)
> >>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
> >>> -
> >>> -		dma_fence_put(tmp);
> >>> +	if (ring->funcs->emit_pipeline_sync && job) {
> >>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
> >>> +			amdgpu_vm_need_pipeline_sync(ring, job))
> >>> +			need_pipe_sync = true;
> >>> +		else if (job->need_pipe_sync)
> >>> +			need_pipe_sync = true;
> >>>     	}
> >>>
> >>>     	if (ring->funcs->insert_start)
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> index 1d71f8c..dae997d 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev,
> unsigned num_ibs,
> >>>     	(*job)->num_ibs = num_ibs;
> >>>
> >>>     	amdgpu_sync_create(&(*job)->sync);
> >>> -	amdgpu_sync_create(&(*job)->sched_sync);
> >>>     	(*job)->vram_lost_counter = atomic_read(&adev-
> >vram_lost_counter);
> >>>     	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
> >>>
> >>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct
> drm_sched_job *s_job)
> >>>     	amdgpu_ring_priority_put(ring, s_job->s_priority);
> >>>     	dma_fence_put(job->fence);
> >>>     	amdgpu_sync_free(&job->sync);
> >>> -	amdgpu_sync_free(&job->sched_sync);
> >>>     	kfree(job);
> >>>     }
> >>>
> >>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
> >>>
> >>>     	dma_fence_put(job->fence);
> >>>     	amdgpu_sync_free(&job->sync);
> >>> -	amdgpu_sync_free(&job->sched_sync);
> >>>     	kfree(job);
> >>>     }
> >>>
> >>> @@ -182,14 +179,9 @@ static struct dma_fence
> *amdgpu_job_dependency(struct drm_sched_job *sched_job,
> >>>     	bool need_pipe_sync = false;
> >>>     	int r;
> >>>
> >>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
> >>> -	if (fence && need_pipe_sync) {
> >>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
> >>> -			r = amdgpu_sync_fence(ring->adev, &job-
> >sched_sync,
> >>> -					      fence, false);
> >>> -			if (r)
> >>> -				DRM_ERROR("Error adding fence (%d)\n", r);
> >>> -		}
> >>> +	if (fence && need_pipe_sync &&
> drm_sched_dependency_optimized(fence, s_entity)) {
> >>> +		trace_amdgpu_ib_pipe_sync(job, fence);
> >>> +		job->need_pipe_sync = true;
> >>>     	}
> >>>
> >>>     	while (fence == NULL && vm && !job->vmid) { diff --git
> >>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> index e1b46a6..c1d00f0 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> @@ -41,7 +41,6 @@ struct amdgpu_job {
> >>>     	struct drm_sched_job    base;
> >>>     	struct amdgpu_vm	*vm;
> >>>     	struct amdgpu_sync	sync;
> >>> -	struct amdgpu_sync	sched_sync;
> >>>     	struct amdgpu_ib	*ibs;
> >>>     	struct dma_fence	*fence; /* the hw fence */
> >>>     	uint32_t		preamble_status;
> >>> @@ -59,7 +58,7 @@ struct amdgpu_job {
> >>>     	/* user fence handling */
> >>>     	uint64_t		uf_addr;
> >>>     	uint64_t		uf_sequence;
> >>> -
> >>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
> >>>     };
> >>>
> >>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned
> >>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                             ` <BY1PR12MB0502C523B2FA785240D12E36B4CA0-PicGAnIBOobrCwm+z9iKNgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-11-05  8:37                               ` Liu, Monk
  0 siblings, 0 replies; 19+ messages in thread
From: Liu, Monk @ 2018-11-05  8:37 UTC (permalink / raw)
  To: Zhou, David(ChunMing),
	Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

The bug I hit is because in my local branch the vmid counter logic is changed (I optimized away unnecessary vm-flushes after GPU recover); Christian is right on that.
So with the drm-next branch my bug won't hit; I'm fine with keeping sched_dep there as long as there is no such issue. 

Thanks 
/Monk
-----Original Message-----
From: Zhou, David(ChunMing) 
Sent: Monday, November 5, 2018 3:51 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync



> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 05, 2018 3:48 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou,
> David(ChunMing) <David1.Zhou@amd.com>
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
> 
> Am 05.11.18 um 08:24 schrieb Liu, Monk:
> >> David Zhou had an use case which saw a >10% performance drop the 
> >> last
> time he tried it.
> > I really don't believe that, because if you insert a WAIT_MEM on an 
> > already
> signaled fence, it only cost GPU couple clocks to move  on, right ? no 
> reason to slow down up to 10% ... with 3dmark vulkan version test, the 
> performance is barely different ... with my patch applied ...
> 
> Why do you think that we insert a WAIT_MEM on an already signaled fence?
> The pipeline sync always wait for the last fence value (because we 
> can't handle wraparounds in PM4).
> 
> So you have a pipeline sync when you don't need one and that is really 
> really bad for things shared between processes, e.g. X/Wayland and it's clients.
> 
> I also expects that this doesn't effect 3dmark at all, but everything 
> which runs in a window which is composed by X could be slowed down massively.
> 
> David do you remember which use case was affected when you tried to 
> drop this optimization?
That was a long time ago, I remember Andrey also tried to remove sched_sync before, but he eventually kept it, right?
From Monk's patch, seems he doesn't change main logic, he just  moved sched_sync logic to job->need_pipe_sync.
But at least, I can see a bit effect, e.g. job process evaluates fence to sched_sync, but the fence could be signaled when amdgpu_ib_schedule, then don't need insert pipeline sync.

Anyway, this is a sensitive path, we should change it carefully, we should give a wide test.

Regards,
David Zhou
> 
> >> When a reset happens we flush the VMIDs when re-submitting the jobs
> to the rings and while doing so we also always do a pipeline sync.
> > I will check that point in my branch, I didn't use drm-next, maybe 
> > there is gap in this part
> 
> We had that logic for a very long time now, but we recently simplified it.
> Could be that there was a bug introduced doing so.
> 
> Maybe we should add a specific flag to run_job() to note that we are 
> re- running a job and then always add VM flushes/pipeline syncs?
> 
> But my main question is why do you see any issues with quark? That is 
> a workaround for an issue for Vulkan sync handling and should only 
> surface when a specific test is run many many times.
> 
> Regards,
> Christian.
> 
> >
> > /Monk
> > -----Original Message-----
> > From: Koenig, Christian
> > Sent: Monday, November 5, 2018 3:02 AM
> > To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> > Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
> >
> >> Can you tell me which game/benchmark will have performance drop 
> >> with
> this fix by your understanding ?
> > When you sync between submission things like composing X windows are
> slowed down massively.
> >
> > David Zhou had an use case which saw a >10% performance drop the 
> > last
> time he tried it.
> >
> >> The problem I hit is during the massive stress test against 
> >> multi-process + quark , if the quark process hang the engine while 
> >> there is
> another two job following the bad job, After the TDR these two job 
> will lose the explicit and the pipeline-sync was also lost.
> > Well that is really strange. This workaround is only for a very 
> > specific Vulkan
> CTS test which we are still not 100% sure is actually valid.
> >
> > When a reset happens we flush the VMIDs when re-submitting the jobs 
> > to
> the rings and while doing so we also always do a pipeline sync.
> >
> > So you should never ever run into any issues in quark with that, 
> > even when
> we completely disable this workaround.
> >
> > Regards,
> > Christian.
> >
> > Am 04.11.18 um 01:48 schrieb Liu, Monk:
> >>> NAK, that would result in a severe performance drop.
> >>> We need the fence here to determine if we actually need to do the
> pipeline sync or not.
> >>> E.g. the explicit requested fence could already be signaled.
> >> For the performance issue, only insert a WAIT_REG_MEM on
> GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) 
> ...  At least I didn't observe any performance drop with 3dmark 
> benchmark (also tested vulkan CTS), Can you tell me which 
> game/benchmark will have performance drop with this fix by your understanding ? let me check it .
> >>
> >> The problem I hit is during the massive stress test against 
> >> multi-process + quark , if the quark process hang the engine while 
> >> there is
> another two job following the bad job, After the TDR these two job 
> will lose the explicit and the pipeline-sync was also lost.
> >>
> >>
> >> BTW: for original logic, the pipeline sync have another corner case:
> >> Assume JobC depend on JobA with explicit flag, and there is jobB 
> >> inserted
> in ring:
> >>
> >> jobA -> jobB -> (pipe sync)JobC
> >>
> >> if JobA really cost a lot of time to finish, in the
> >> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC
> against its explicit dependency which is JobA, but there is a JobB 
> between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
> >>
> >> while it is not a big issue but obviously not necessary: C have no 
> >> relation with B
> >>
> >> /Monk
> >>
> >>
> >>
> >> -----Original Message-----
> >> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> >> Sent: Sunday, November 4, 2018 3:50 AM
> >> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> >> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
> >>
> >> Am 03.11.18 um 06:33 schrieb Monk Liu:
> >>> Reasons to drop it:
> >>>
> >>> 1) simplify the code: just introduce field member "need_pipe_sync"
> >>> for job is good enough to tell if the explicit dependency fence 
> >>> need followed by a pipeline sync.
> >>>
> >>> 2) after GPU_recover the explicit fence from sched_syn will not 
> >>> come back so the required pipeline_sync following it is missed, 
> >>> consider scenario below:
> >>>> now on ring buffer:
> >>> Job-A -> pipe_sync -> Job-B
> >>>> TDR occured on Job-A, and after GPU recover:
> >>>> now on ring buffer:
> >>> Job-A -> Job-B
> >>>
> >>> because the fence from sched_sync is used and freed after 
> >>> ib_schedule in first time, it will never come back, with this 
> >>> patch this issue could be avoided.
> >> NAK, that would result in a severe performance drop.
> >>
> >> We need the fence here to determine if we actually need to do the
> pipeline sync or not.
> >>
> >> E.g. the explicit requested fence could already be signaled.
> >>
> >> Christian.
> >>
> >>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> >>> ---
> >>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
> >>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
> >>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
> >>>     3 files changed, 10 insertions(+), 23 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >>> index c48207b3..ac7d2da 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring
> *ring, unsigned num_ibs,
> >>>     {
> >>>     	struct amdgpu_device *adev = ring->adev;
> >>>     	struct amdgpu_ib *ib = &ibs[0];
> >>> -	struct dma_fence *tmp = NULL;
> >>>     	bool skip_preamble, need_ctx_switch;
> >>>     	unsigned patch_offset = ~0;
> >>>     	struct amdgpu_vm *vm;
> >>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring
> *ring, unsigned num_ibs,
> >>>     	}
> >>>
> >>>     	need_ctx_switch = ring->current_ctx != fence_ctx;
> >>> -	if (ring->funcs->emit_pipeline_sync && job &&
> >>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
> >>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
> >>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
> >>> -		need_pipe_sync = true;
> >>>
> >>> -		if (tmp)
> >>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
> >>> -
> >>> -		dma_fence_put(tmp);
> >>> +	if (ring->funcs->emit_pipeline_sync && job) {
> >>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
> >>> +			amdgpu_vm_need_pipeline_sync(ring, job))
> >>> +			need_pipe_sync = true;
> >>> +		else if (job->need_pipe_sync)
> >>> +			need_pipe_sync = true;
> >>>     	}
> >>>
> >>>     	if (ring->funcs->insert_start) diff --git 
> >>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> index 1d71f8c..dae997d 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev,
> unsigned num_ibs,
> >>>     	(*job)->num_ibs = num_ibs;
> >>>
> >>>     	amdgpu_sync_create(&(*job)->sync);
> >>> -	amdgpu_sync_create(&(*job)->sched_sync);
> >>>     	(*job)->vram_lost_counter = atomic_read(&adev-
> >vram_lost_counter);
> >>>     	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
> >>>
> >>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct
> drm_sched_job *s_job)
> >>>     	amdgpu_ring_priority_put(ring, s_job->s_priority);
> >>>     	dma_fence_put(job->fence);
> >>>     	amdgpu_sync_free(&job->sync);
> >>> -	amdgpu_sync_free(&job->sched_sync);
> >>>     	kfree(job);
> >>>     }
> >>>
> >>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
> >>>
> >>>     	dma_fence_put(job->fence);
> >>>     	amdgpu_sync_free(&job->sync);
> >>> -	amdgpu_sync_free(&job->sched_sync);
> >>>     	kfree(job);
> >>>     }
> >>>
> >>> @@ -182,14 +179,9 @@ static struct dma_fence
> *amdgpu_job_dependency(struct drm_sched_job *sched_job,
> >>>     	bool need_pipe_sync = false;
> >>>     	int r;
> >>>
> >>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
> >>> -	if (fence && need_pipe_sync) {
> >>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
> >>> -			r = amdgpu_sync_fence(ring->adev, &job-
> >sched_sync,
> >>> -					      fence, false);
> >>> -			if (r)
> >>> -				DRM_ERROR("Error adding fence (%d)\n", r);
> >>> -		}
> >>> +	if (fence && need_pipe_sync &&
> drm_sched_dependency_optimized(fence, s_entity)) {
> >>> +		trace_amdgpu_ib_pipe_sync(job, fence);
> >>> +		job->need_pipe_sync = true;
> >>>     	}
> >>>
> >>>     	while (fence == NULL && vm && !job->vmid) { diff --git 
> >>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> index e1b46a6..c1d00f0 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> @@ -41,7 +41,6 @@ struct amdgpu_job {
> >>>     	struct drm_sched_job    base;
> >>>     	struct amdgpu_vm	*vm;
> >>>     	struct amdgpu_sync	sync;
> >>> -	struct amdgpu_sync	sched_sync;
> >>>     	struct amdgpu_ib	*ibs;
> >>>     	struct dma_fence	*fence; /* the hw fence */
> >>>     	uint32_t		preamble_status;
> >>> @@ -59,7 +58,7 @@ struct amdgpu_job {
> >>>     	/* user fence handling */
> >>>     	uint64_t		uf_addr;
> >>>     	uint64_t		uf_sequence;
> >>> -
> >>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
> >>>     };
> >>>
> >>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned 
> >>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                         ` <9e5d6f91-9e20-8dcc-953f-e1e980b9e0d3-5C7GfCeVMHo@public.gmane.org>
  2018-11-05  7:50                           ` Zhou, David(ChunMing)
@ 2018-11-05  8:58                           ` Liu, Monk
       [not found]                             ` <CY4PR1201MB0245543F654663532113C54A84CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  2018-11-05 13:41                           ` Liu, Monk
  2 siblings, 1 reply; 19+ messages in thread
From: Liu, Monk @ 2018-11-05  8:58 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Zhou, David(ChunMing)

> So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.

Oh, that may explain the thing here: my environment is a no-X-window system (a customer's cloud-gaming use case), so I don't launch X at all, and there is only one process running there, which is 3dmark_vulkan. 

> But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.

Quark is only used to hang the gfx ring, and the missing explicit sync is from other processes (Vulkan CTS, vk_example, 3dmark). And I did some changes that drop the unnecessary vm-flush/pipeline sync after GPU recovery; that part is different from drm-next ... thanks for the reminder.

BTW: could we let the job remember the hw fence seq value that it needs to sync up to? E.g. in "drm_sched_entity_clear_dep" we not only wake up the scheduler but also set the hw fence seq number on the job (keeping the largest one), so in the end, in amdgpu_ib_schedule(), we know exactly the last seq value we need to pipeline-sync to, and we can WAIT_REG_MEM on just that value (so we need to change the pipeline_sync routine for gfx to receive the seq value as a parameter, and we use the ">=" operation instead of "==").

/Monk
-----Original Message-----
From: Koenig, Christian 
Sent: Monday, November 5, 2018 3:48 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, David(ChunMing) <David1.Zhou@amd.com>
Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync

Am 05.11.18 um 08:24 schrieb Liu, Monk:
>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
> I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only cost GPU couple clocks to move  on, right ? no reason to slow down up to 10% ... with 3dmark vulkan version test, the performance is barely different ... with my patch applied ...

Why do you think that we insert a WAIT_MEM on an already signaled fence? 
The pipeline sync always waits for the last fence value (because we can't handle wraparounds in PM4).

So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.

I also expect that this doesn't affect 3dmark at all, but everything which runs in a window composed by X could be slowed down massively.

David, do you remember which use case was affected when you tried to drop this optimization?

>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
> I will check that point in my branch, I didn't use drm-next, maybe 
> there is gap in this part

We had that logic for a very long time now, but we recently simplified it. Could be that there was a bug introduced doing so.

Maybe we should add a specific flag to run_job() to note that we are re-running a job and then always add VM flushes/pipeline syncs?

But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.

Regards,
Christian.

>
> /Monk
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 5, 2018 3:02 AM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
>> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
> When you sync between submission things like composing X windows are slowed down massively.
>
> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>
>> The problem I hit is during the massive stress test against 
>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
> Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.
>
> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>
> So you should never ever run into any issues in quark with that, even when we completely disable this workaround.
>
> Regards,
> Christian.
>
> Am 04.11.18 um 01:48 schrieb Liu, Monk:
>>> NAK, that would result in a severe performance drop.
>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>> E.g. the explicit requested fence could already be signaled.
>> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>>
>> The problem I hit is during the massive stress test against 
>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>>
>>
>> BTW: for original logic, the pipeline sync have another corner case:
>> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>>
>> jobA -> jobB -> (pipe sync)JobC
>>
>> if JobA really cost a lot of time to finish, in the
>> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>>
>> while it is not a big issue but obviously not necessary: C have no 
>> relation with B
>>
>> /Monk
>>
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Sunday, November 4, 2018 3:50 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>
>> Am 03.11.18 um 06:33 schrieb Monk Liu:
>>> Reasons to drop it:
>>>
>>> 1) simplify the code: just introduce field member "need_pipe_sync"
>>> for job is good enough to tell if the explicit dependency fence need 
>>> followed by a pipeline sync.
>>>
>>> 2) after GPU_recover the explicit fence from sched_syn will not come 
>>> back so the required pipeline_sync following it is missed, consider 
>>> scenario below:
>>>> now on ring buffer:
>>> Job-A -> pipe_sync -> Job-B
>>>> TDR occured on Job-A, and after GPU recover:
>>>> now on ring buffer:
>>> Job-A -> Job-B
>>>
>>> because the fence from sched_sync is used and freed after 
>>> ib_schedule in first time, it will never come back, with this patch 
>>> this issue could be avoided.
>> NAK, that would result in a severe performance drop.
>>
>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>
>> E.g. the explicit requested fence could already be signaled.
>>
>> Christian.
>>
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>>     3 files changed, 10 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index c48207b3..ac7d2da 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>>     	struct amdgpu_ib *ib = &ibs[0];
>>> -	struct dma_fence *tmp = NULL;
>>>     	bool skip_preamble, need_ctx_switch;
>>>     	unsigned patch_offset = ~0;
>>>     	struct amdgpu_vm *vm;
>>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>     	}
>>>     
>>>     	need_ctx_switch = ring->current_ctx != fence_ctx;
>>> -	if (ring->funcs->emit_pipeline_sync && job &&
>>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>>> -		need_pipe_sync = true;
>>>     
>>> -		if (tmp)
>>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>>> -
>>> -		dma_fence_put(tmp);
>>> +	if (ring->funcs->emit_pipeline_sync && job) {
>>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>>> +			need_pipe_sync = true;
>>> +		else if (job->need_pipe_sync)
>>> +			need_pipe_sync = true;
>>>     	}
>>>     
>>>     	if (ring->funcs->insert_start)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 1d71f8c..dae997d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>     	(*job)->num_ibs = num_ibs;
>>>     
>>>     	amdgpu_sync_create(&(*job)->sync);
>>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>>     	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>     	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>>     
>>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>     	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>     	dma_fence_put(job->fence);
>>>     	amdgpu_sync_free(&job->sync);
>>> -	amdgpu_sync_free(&job->sched_sync);
>>>     	kfree(job);
>>>     }
>>>     
>>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>>     
>>>     	dma_fence_put(job->fence);
>>>     	amdgpu_sync_free(&job->sync);
>>> -	amdgpu_sync_free(&job->sched_sync);
>>>     	kfree(job);
>>>     }
>>>     
>>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>>     	bool need_pipe_sync = false;
>>>     	int r;
>>>     
>>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>>> -	if (fence && need_pipe_sync) {
>>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>>> -					      fence, false);
>>> -			if (r)
>>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>>> -		}
>>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>>> +		job->need_pipe_sync = true;
>>>     	}
>>>     
>>>     	while (fence == NULL && vm && !job->vmid) { diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index e1b46a6..c1d00f0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>>     	struct drm_sched_job    base;
>>>     	struct amdgpu_vm	*vm;
>>>     	struct amdgpu_sync	sync;
>>> -	struct amdgpu_sync	sched_sync;
>>>     	struct amdgpu_ib	*ibs;
>>>     	struct dma_fence	*fence; /* the hw fence */
>>>     	uint32_t		preamble_status;
>>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>>     	/* user fence handling */
>>>     	uint64_t		uf_addr;
>>>     	uint64_t		uf_sequence;
>>> -
>>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>>     };
>>>     
>>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned 
>>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                             ` <CY4PR1201MB0245543F654663532113C54A84CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2018-11-05 11:11                               ` Koenig, Christian
       [not found]                                 ` <94885af2-43eb-45a3-513a-89d838dbaa95-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Koenig, Christian @ 2018-11-05 11:11 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhou,
	David(ChunMing)

> BTW: could we let the Job remember the hw fence seq that it need to sync up to ? e.g. in "drm_sched_entity_clear_dep" we not only wake up scheduler but also set the hw fence seq number to the job (and keep the big one), so in the end in amdgpu_ib_schedule(), we knows exactly the last seq value we need to pipeline sync to, and we can only WAIT_REG_MEM on it ( so we need change pipeline_sync routine for gfx, let it receive the seq value as parameter, and we use ">=" operation instead of "==")

That won't work correctly, because the sequence wraps around from time to 
time and you can't correctly handle that in PM4, AFAIK. Well, maybe with 
two waits, but I'm not 100% sure.

Anyway it's not worth it because the software scheduler makes sure that 
we always have only two jobs on the hardware queue normally.

Christian.

Am 05.11.18 um 09:58 schrieb Liu, Monk:
>> So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.
> Oh, that may explain the thing here: My environment is a no-X-window system (customer's cloud gaming user case), so I don't launch X at all, and there is only one process running there which is 3dmark_vulkan.
>
>> But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.
> Quark is only used to hang the gfx ring, and the missing explicit sync is from other processes (vulkan cts, vk_example, 3dmark). And I did some changes that drops unnecessary vm-flush/pipeline sync after GPU recover, that part is different with drm-next ... thanks for the reminding.
>
> BTW: could we let the Job remember the hw fence seq that it need to sync up to ? e.g. in "drm_sched_entity_clear_dep" we not only wake up scheduler but also set the hw fence seq number to the job (and keep the big one), so in the end in amdgpu_ib_schedule(), we knows exactly the last seq value we need to pipeline sync to, and we can only WAIT_REG_MEM on it ( so we need change pipeline_sync routine for gfx, let it receive the seq value as parameter, and we use ">=" operation instead of "==")
>
> /Monk
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 5, 2018 3:48 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, David(ChunMing) <David1.Zhou@amd.com>
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
> Am 05.11.18 um 08:24 schrieb Liu, Monk:
>>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>> I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only cost GPU couple clocks to move  on, right ? no reason to slow down up to 10% ... with 3dmark vulkan version test, the performance is barely different ... with my patch applied ...
> Why do you think that we insert a WAIT_MEM on an already signaled fence?
> The pipeline sync always wait for the last fence value (because we can't handle wraparounds in PM4).
>
> So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.
>
> I also expects that this doesn't effect 3dmark at all, but everything which runs in a window which is composed by X could be slowed down massively.
>
> David do you remember which use case was affected when you tried to drop this optimization?
>
>>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>> I will check that point in my branch, I didn't use drm-next, maybe
>> there is gap in this part
> We had that logic for a very long time now, but we recently simplified it. Could be that there was a bug introduced doing so.
>
> Maybe we should add a specific flag to run_job() to note that we are re-running a job and then always add VM flushes/pipeline syncs?
>
> But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.
>
> Regards,
> Christian.
>
>> /Monk
>> -----Original Message-----
>> From: Koenig, Christian
>> Sent: Monday, November 5, 2018 3:02 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>
>>> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
>> When you sync between submission things like composing X windows are slowed down massively.
>>
>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>>
>>> The problem I hit is during the massive stress test against
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>> Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.
>>
>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>>
>> So you should never ever run into any issues in quark with that, even when we completely disable this workaround.
>>
>> Regards,
>> Christian.
>>
>> Am 04.11.18 um 01:48 schrieb Liu, Monk:
>>>> NAK, that would result in a severe performance drop.
>>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>> E.g. the explicit requested fence could already be signaled.
>>> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>>>
>>> The problem I hit is during the massive stress test against
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>>>
>>>
>>> BTW: for original logic, the pipeline sync have another corner case:
>>> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>>>
>>> jobA -> jobB -> (pipe sync)JobC
>>>
>>> if JobA really cost a lot of time to finish, in the
>>> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>>>
>>> while it is not a big issue but obviously not necessary: C have no
>>> relation with B
>>>
>>> /Monk
>>>
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Sunday, November 4, 2018 3:50 AM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>>
>>> Am 03.11.18 um 06:33 schrieb Monk Liu:
>>>> Reasons to drop it:
>>>>
>>>> 1) simplify the code: just introduce field member "need_pipe_sync"
>>>> for job is good enough to tell if the explicit dependency fence need
>>>> followed by a pipeline sync.
>>>>
>>>> 2) after GPU_recover the explicit fence from sched_syn will not come
>>>> back so the required pipeline_sync following it is missed, consider
>>>> scenario below:
>>>>> now on ring buffer:
>>>> Job-A -> pipe_sync -> Job-B
>>>>> TDR occured on Job-A, and after GPU recover:
>>>>> now on ring buffer:
>>>> Job-A -> Job-B
>>>>
>>>> because the fence from sched_sync is used and freed after
>>>> ib_schedule in first time, it will never come back, with this patch
>>>> this issue could be avoided.
>>> NAK, that would result in a severe performance drop.
>>>
>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>
>>> E.g. the explicit requested fence could already be signaled.
>>>
>>> Christian.
>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>>>      3 files changed, 10 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index c48207b3..ac7d2da 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>>      	struct amdgpu_ib *ib = &ibs[0];
>>>> -	struct dma_fence *tmp = NULL;
>>>>      	bool skip_preamble, need_ctx_switch;
>>>>      	unsigned patch_offset = ~0;
>>>>      	struct amdgpu_vm *vm;
>>>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      	}
>>>>      
>>>>      	need_ctx_switch = ring->current_ctx != fence_ctx;
>>>> -	if (ring->funcs->emit_pipeline_sync && job &&
>>>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>>>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>>>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>>>> -		need_pipe_sync = true;
>>>>      
>>>> -		if (tmp)
>>>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>>>> -
>>>> -		dma_fence_put(tmp);
>>>> +	if (ring->funcs->emit_pipeline_sync && job) {
>>>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>>>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>>>> +			need_pipe_sync = true;
>>>> +		else if (job->need_pipe_sync)
>>>> +			need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	if (ring->funcs->insert_start)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index 1d71f8c..dae997d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>>      	(*job)->num_ibs = num_ibs;
>>>>      
>>>>      	amdgpu_sync_create(&(*job)->sync);
>>>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>>>      	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>>      	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>>>      
>>>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>>      	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>>>      
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>>>      	bool need_pipe_sync = false;
>>>>      	int r;
>>>>      
>>>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>>>> -	if (fence && need_pipe_sync) {
>>>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>>>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>>>> -					      fence, false);
>>>> -			if (r)
>>>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>>>> -		}
>>>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>>>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>>>> +		job->need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	while (fence == NULL && vm && !job->vmid) { diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index e1b46a6..c1d00f0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>>>      	struct drm_sched_job    base;
>>>>      	struct amdgpu_vm	*vm;
>>>>      	struct amdgpu_sync	sync;
>>>> -	struct amdgpu_sync	sched_sync;
>>>>      	struct amdgpu_ib	*ibs;
>>>>      	struct dma_fence	*fence; /* the hw fence */
>>>>      	uint32_t		preamble_status;
>>>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>>>      	/* user fence handling */
>>>>      	uint64_t		uf_addr;
>>>>      	uint64_t		uf_sequence;
>>>> -
>>>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>>>      };
>>>>      
>>>>      int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned
>>>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                         ` <9e5d6f91-9e20-8dcc-953f-e1e980b9e0d3-5C7GfCeVMHo@public.gmane.org>
  2018-11-05  7:50                           ` Zhou, David(ChunMing)
  2018-11-05  8:58                           ` Liu, Monk
@ 2018-11-05 13:41                           ` Liu, Monk
       [not found]                             ` <CY4PR1201MB024516A3396DBF651EAADA5684CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  2 siblings, 1 reply; 19+ messages in thread
From: Liu, Monk @ 2018-11-05 13:41 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Zhou, David(ChunMing)

Hi Christian 

For scenario: Bad Job (hang, vmid1) -->Job A (context 10, explicit dep for Job B, vmid2) --> Job B(context 10, vmid2) --> Job C (context 11, vmid3)

Assume "job_hang_limit" is 0, and assume "sched_hw_submission" is 4, I give a second thought on the logic after GPU reset:

1) the bad Job would be set guilty and skipped by scheduler,
2) the first re-submitted job (Job A) would be forced with a pipeline-sync, 
3) the first re-submitted job (Job A) would be forced with a vm-flush, and later its VMID's "current_gpu_reset_count" is updated to "adev->gpu_reset_count"
4) the second re-submitted job (Job B, assuming it is from the same context as Job A, shares the same page table/process, and needs no vm_update) would not be forced with a pipeline-sync, nor a vm-flush ...
 
Thus for Job B if it has an explicit dep on Job A, this explicit dep would get lost and there will be no pipeline sync inserted prior to Job B ... 

Do you think that's a possible corner case ?

/Monk

-----Original Message-----
From: Koenig, Christian 
Sent: Monday, November 5, 2018 3:48 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, David(ChunMing) <David1.Zhou@amd.com>
Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync

Am 05.11.18 um 08:24 schrieb Liu, Monk:
>> David Zhou had a use case which saw a >10% performance drop the last time he tried it.
> I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only costs the GPU a couple of clocks to move on, right ? no reason to slow down by up to 10% ... with the 3dmark vulkan version test, the performance is barely different ... with my patch applied ...

Why do you think that we insert a WAIT_MEM on an already signaled fence? 
The pipeline sync always wait for the last fence value (because we can't handle wraparounds in PM4).

So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.

I also expects that this doesn't effect 3dmark at all, but everything which runs in a window which is composed by X could be slowed down massively.

David do you remember which use case was affected when you tried to drop this optimization?

>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
> I will check that point in my branch, I didn't use drm-next, maybe 
> there is gap in this part

We had that logic for a very long time now, but we recently simplified it. Could be that there was a bug introduced doing so.

Maybe we should add a specific flag to run_job() to note that we are re-running a job and then always add VM flushes/pipeline syncs?

But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.

Regards,
Christian.

>
> /Monk
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 5, 2018 3:02 AM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
>> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
> When you sync between submission things like composing X windows are slowed down massively.
>
> David Zhou had a use case which saw a >10% performance drop the last time he tried it.
>
>> The problem I hit is during the massive stress test against 
>> multi-process + quark: if the quark process hangs the engine while there are another two jobs following the bad job, after the TDR these two jobs will lose the explicit sync and the pipeline-sync will also be lost.
> Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.
>
> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>
> So you should never ever run into any issues in quark with that, even when we completely disable this workaround.
>
> Regards,
> Christian.
>
> Am 04.11.18 um 01:48 schrieb Liu, Monk:
>>> NAK, that would result in a severe performance drop.
>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>> E.g. the explicit requested fence could already be signaled.
>> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>>
>> The problem I hit is during the massive stress test against 
>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>>
>>
>> BTW: for original logic, the pipeline sync have another corner case:
>> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>>
>> jobA -> jobB -> (pipe sync)JobC
>>
>> if JobA really cost a lot of time to finish, in the
>> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>>
>> while it is not a big issue but obviously not necessary: C have no 
>> relation with B
>>
>> /Monk
>>
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Sunday, November 4, 2018 3:50 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>
>> Am 03.11.18 um 06:33 schrieb Monk Liu:
>>> Reasons to drop it:
>>>
>>> 1) simplify the code: just introduce field member "need_pipe_sync"
>>> for job is good enough to tell if the explicit dependency fence need 
>>> followed by a pipeline sync.
>>>
>>> 2) after GPU_recover the explicit fence from sched_syn will not come 
>>> back so the required pipeline_sync following it is missed, consider 
>>> scenario below:
>>>> now on ring buffer:
>>> Job-A -> pipe_sync -> Job-B
>>>> TDR occurred on Job-A, and after GPU recover:
>>>> now on ring buffer:
>>> Job-A -> Job-B
>>>
>>> because the fence from sched_sync is used and freed after 
>>> ib_schedule in first time, it will never come back, with this patch 
>>> this issue could be avoided.
>> NAK, that would result in a severe performance drop.
>>
>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>
>> E.g. the explicit requested fence could already be signaled.
>>
>> Christian.
>>
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>>     3 files changed, 10 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index c48207b3..ac7d2da 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>>     	struct amdgpu_ib *ib = &ibs[0];
>>> -	struct dma_fence *tmp = NULL;
>>>     	bool skip_preamble, need_ctx_switch;
>>>     	unsigned patch_offset = ~0;
>>>     	struct amdgpu_vm *vm;
>>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>     	}
>>>     
>>>     	need_ctx_switch = ring->current_ctx != fence_ctx;
>>> -	if (ring->funcs->emit_pipeline_sync && job &&
>>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>>> -		need_pipe_sync = true;
>>>     
>>> -		if (tmp)
>>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>>> -
>>> -		dma_fence_put(tmp);
>>> +	if (ring->funcs->emit_pipeline_sync && job) {
>>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>>> +			need_pipe_sync = true;
>>> +		else if (job->need_pipe_sync)
>>> +			need_pipe_sync = true;
>>>     	}
>>>     
>>>     	if (ring->funcs->insert_start)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 1d71f8c..dae997d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>     	(*job)->num_ibs = num_ibs;
>>>     
>>>     	amdgpu_sync_create(&(*job)->sync);
>>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>>     	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>     	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>>     
>>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>     	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>     	dma_fence_put(job->fence);
>>>     	amdgpu_sync_free(&job->sync);
>>> -	amdgpu_sync_free(&job->sched_sync);
>>>     	kfree(job);
>>>     }
>>>     
>>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>>     
>>>     	dma_fence_put(job->fence);
>>>     	amdgpu_sync_free(&job->sync);
>>> -	amdgpu_sync_free(&job->sched_sync);
>>>     	kfree(job);
>>>     }
>>>     
>>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>>     	bool need_pipe_sync = false;
>>>     	int r;
>>>     
>>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>>> -	if (fence && need_pipe_sync) {
>>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>>> -					      fence, false);
>>> -			if (r)
>>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>>> -		}
>>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>>> +		job->need_pipe_sync = true;
>>>     	}
>>>     
>>>     	while (fence == NULL && vm && !job->vmid) { diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index e1b46a6..c1d00f0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>>     	struct drm_sched_job    base;
>>>     	struct amdgpu_vm	*vm;
>>>     	struct amdgpu_sync	sync;
>>> -	struct amdgpu_sync	sched_sync;
>>>     	struct amdgpu_ib	*ibs;
>>>     	struct dma_fence	*fence; /* the hw fence */
>>>     	uint32_t		preamble_status;
>>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>>     	/* user fence handling */
>>>     	uint64_t		uf_addr;
>>>     	uint64_t		uf_sequence;
>>> -
>>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>>     };
>>>     
>>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned 
>>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                                 ` <94885af2-43eb-45a3-513a-89d838dbaa95-5C7GfCeVMHo@public.gmane.org>
@ 2018-11-05 13:47                                   ` Liu, Monk
  0 siblings, 0 replies; 19+ messages in thread
From: Liu, Monk @ 2018-11-05 13:47 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Zhou, David(ChunMing)

> That won't work correctly because the sequence wraps around from time to time and you can't correctly handle that in PM4 AFAIK. Well maybe with two waits, but not 100% sure.

Hmmm, I still want to give a try, and I still feel current logic is wrong after TDR , see my reply in another email

> Anyway it's not worth it because the software scheduler makes sure that we always have only two jobs on the hardware queue normally.

Well, our customer required that we set sched_hw_submission to 8, because in a virtual machine the CPU latency overhead is much higher than on bare-metal 
And with intensive vulkan submit utility test we found set it to 8 (also set sched_jobs to 256) can greatly reduce the IOCTL latency (cs_submit IOCTL)

/Monk
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Koenig, Christian
Sent: Monday, November 5, 2018 7:12 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, David(ChunMing) <David1.Zhou@amd.com>
Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync

> BTW: could we let the Job remember the hw fence seq that it need to 
> sync up to ? e.g. in "drm_sched_entity_clear_dep" we not only wake up 
> scheduler but also set the hw fence seq number to the job (and keep 
> the big one), so in the end in amdgpu_ib_schedule(), we knows exactly 
> the last seq value we need to pipeline sync to, and we can only 
> WAIT_REG_MEM on it ( so we need change pipeline_sync routine for gfx, 
> let it receive the seq value as parameter, and we use ">=" operation 
> instead of "==")

That won't work correctly because the sequence wraps around from time to time and you can't correctly handle that in PM4 AFAIK. Well maybe with two waits, but not 100% sure.

Anyway it's not worth it because the software scheduler makes sure that we always have only two jobs on the hardware queue normally.

Christian.

Am 05.11.18 um 09:58 schrieb Liu, Monk:
>> So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.
> Oh, that may explain the thing here: My environment is a no-X-window system (customer's cloud gaming user case), so I don't launch X at all, and there is only one process running there which is 3dmark_vulkan.
>
>> But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.
> Quark is only used to hang the gfx ring, and the missing explicit sync is from other processes (vulkan cts, vk_example, 3dmark). And I did some changes that drop the unnecessary vm-flush/pipeline sync after GPU recovery; that part is different from drm-next ... thanks for the reminder.
>
> BTW: could we let the Job remember the hw fence seq that it need to 
> sync up to ? e.g. in "drm_sched_entity_clear_dep" we not only wake up 
> scheduler but also set the hw fence seq number to the job (and keep 
> the big one), so in the end in amdgpu_ib_schedule(), we knows exactly 
> the last seq value we need to pipeline sync to, and we can only 
> WAIT_REG_MEM on it ( so we need change pipeline_sync routine for gfx, 
> let it receive the seq value as parameter, and we use ">=" operation 
> instead of "==")
>
> /Monk
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 5, 2018 3:48 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, 
> David(ChunMing) <David1.Zhou@amd.com>
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
> Am 05.11.18 um 08:24 schrieb Liu, Monk:
>>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>> I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only cost GPU couple clocks to move  on, right ? no reason to slow down up to 10% ... with 3dmark vulkan version test, the performance is barely different ... with my patch applied ...
> Why do you think that we insert a WAIT_MEM on an already signaled fence?
> The pipeline sync always wait for the last fence value (because we can't handle wraparounds in PM4).
>
> So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.
>
> I also expects that this doesn't effect 3dmark at all, but everything which runs in a window which is composed by X could be slowed down massively.
>
> David do you remember which use case was affected when you tried to drop this optimization?
>
>>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>> I will check that point in my branch, I didn't use drm-next, maybe 
>> there is gap in this part
> We had that logic for a very long time now, but we recently simplified it. Could be that there was a bug introduced doing so.
>
> Maybe we should add a specific flag to run_job() to note that we are re-running a job and then always add VM flushes/pipeline syncs?
>
> But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.
>
> Regards,
> Christian.
>
>> /Monk
>> -----Original Message-----
>> From: Koenig, Christian
>> Sent: Monday, November 5, 2018 3:02 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>
>>> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
>> When you sync between submission things like composing X windows are slowed down massively.
>>
>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>>
>>> The problem I hit is during the massive stress test against 
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>> Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.
>>
>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>>
>> So you should never ever run into any issues in quark with that, even when we completely disable this workaround.
>>
>> Regards,
>> Christian.
>>
>> Am 04.11.18 um 01:48 schrieb Liu, Monk:
>>>> NAK, that would result in a severe performance drop.
>>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>> E.g. the explicit requested fence could already be signaled.
>>> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>>>
>>> The problem I hit is during the massive stress test against 
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>>>
>>>
>>> BTW: for original logic, the pipeline sync have another corner case:
>>> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>>>
>>> jobA -> jobB -> (pipe sync)JobC
>>>
>>> if JobA really cost a lot of time to finish, in the
>>> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>>>
>>> while it is not a big issue but obviously not necessary: C have no 
>>> relation with B
>>>
>>> /Monk
>>>
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Sunday, November 4, 2018 3:50 AM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>>
>>> Am 03.11.18 um 06:33 schrieb Monk Liu:
>>>> Reasons to drop it:
>>>>
>>>> 1) simplify the code: just introduce field member "need_pipe_sync"
>>>> for job is good enough to tell if the explicit dependency fence 
>>>> need followed by a pipeline sync.
>>>>
>>>> 2) after GPU_recover the explicit fence from sched_syn will not 
>>>> come back so the required pipeline_sync following it is missed, 
>>>> consider scenario below:
>>>>> now on ring buffer:
>>>> Job-A -> pipe_sync -> Job-B
>>>>> TDR occurred on Job-A, and after GPU recover:
>>>>> now on ring buffer:
>>>> Job-A -> Job-B
>>>>
>>>> because the fence from sched_sync is used and freed after 
>>>> ib_schedule in first time, it will never come back, with this patch 
>>>> this issue could be avoided.
>>> NAK, that would result in a severe performance drop.
>>>
>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>
>>> E.g. the explicit requested fence could already be signaled.
>>>
>>> Christian.
>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>>>      3 files changed, 10 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index c48207b3..ac7d2da 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>>      	struct amdgpu_ib *ib = &ibs[0];
>>>> -	struct dma_fence *tmp = NULL;
>>>>      	bool skip_preamble, need_ctx_switch;
>>>>      	unsigned patch_offset = ~0;
>>>>      	struct amdgpu_vm *vm;
>>>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      	}
>>>>      
>>>>      	need_ctx_switch = ring->current_ctx != fence_ctx;
>>>> -	if (ring->funcs->emit_pipeline_sync && job &&
>>>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>>>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>>>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>>>> -		need_pipe_sync = true;
>>>>      
>>>> -		if (tmp)
>>>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>>>> -
>>>> -		dma_fence_put(tmp);
>>>> +	if (ring->funcs->emit_pipeline_sync && job) {
>>>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>>>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>>>> +			need_pipe_sync = true;
>>>> +		else if (job->need_pipe_sync)
>>>> +			need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	if (ring->funcs->insert_start) diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index 1d71f8c..dae997d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>>      	(*job)->num_ibs = num_ibs;
>>>>      
>>>>      	amdgpu_sync_create(&(*job)->sync);
>>>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>>>      	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>>      	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>>>      
>>>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>>      	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>>>      
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>>>      	bool need_pipe_sync = false;
>>>>      	int r;
>>>>      
>>>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>>>> -	if (fence && need_pipe_sync) {
>>>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>>>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>>>> -					      fence, false);
>>>> -			if (r)
>>>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>>>> -		}
>>>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>>>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>>>> +		job->need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	while (fence == NULL && vm && !job->vmid) { diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index e1b46a6..c1d00f0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>>>      	struct drm_sched_job    base;
>>>>      	struct amdgpu_vm	*vm;
>>>>      	struct amdgpu_sync	sync;
>>>> -	struct amdgpu_sync	sched_sync;
>>>>      	struct amdgpu_ib	*ibs;
>>>>      	struct dma_fence	*fence; /* the hw fence */
>>>>      	uint32_t		preamble_status;
>>>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>>>      	/* user fence handling */
>>>>      	uint64_t		uf_addr;
>>>>      	uint64_t		uf_sequence;
>>>> -
>>>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>>>      };
>>>>      
>>>>      int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned 
>>>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                             ` <CY4PR1201MB024516A3396DBF651EAADA5684CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2018-11-05 13:59                               ` Koenig, Christian
       [not found]                                 ` <40e3e22f-f42e-85e0-f66e-c2d16d392289-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Koenig, Christian @ 2018-11-05 13:59 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhou,
	David(ChunMing)

> and later its VMID's "current_gpu_reset_count" is updated to "adev->gpu_reset_count"
The question is how much later that is done. My recollection is that we 
don't reset that for resubmission, but that could be wrong.

Anyway I think the cleanest approach to always handle that correctly 
would be to always insert a vm flush before all jobs on resubmission. 
That is most likely better for VM flush handling as well.

Christian.

Am 05.11.18 um 14:41 schrieb Liu, Monk:
> Hi Christian
>
> For scenario: Bad Job (hang, vmid1) -->Job A (context 10, explicit dep for Job B, vmid2) --> Job B(context 10, vmid2) --> Job C (context 11, vmid3)
>
> Assume "job_hang_limit" is 0, and assume "sched_hw_submission" is 4, I give a second thought on the logic after GPU reset:
>
> 1) the bad Job would be set guilty and skipped by scheduler,
> 2) the first re-submitted job (Job A) would be forced with a pipeline-sync,
> 3) the first re-submitted job (Job A) would be forced with a vm-flush, and later its VMID's "current_gpu_reset_count" is updated to "adev->gpu_reset_count"
> 4) the second re-submitted job (Job B, assume it was from the same context of Job A, share the same page table/process, and no vm_update needed ) would not be forced with a pipeline-sync, and neither a vm-flush ...
>   
> Thus for Job B if it has an explicit dep on Job A, this explicit dep would get lost and there will be no pipeline sync inserted prior to Job B ...
>
> Do you think that's a possible corner case ?
>
> /Monk
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 5, 2018 3:48 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, David(ChunMing) <David1.Zhou@amd.com>
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
> Am 05.11.18 um 08:24 schrieb Liu, Monk:
>>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>> I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only cost GPU couple clocks to move  on, right ? no reason to slow down up to 10% ... with 3dmark vulkan version test, the performance is barely different ... with my patch applied ...
> Why do you think that we insert a WAIT_MEM on an already signaled fence?
> The pipeline sync always wait for the last fence value (because we can't handle wraparounds in PM4).
>
> So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.
>
> I also expect that this doesn't affect 3dmark at all, but everything which runs in a window which is composed by X could be slowed down massively.
>
> David do you remember which use case was affected when you tried to drop this optimization?
>
>>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>> I will check that point in my branch, I didn't use drm-next, maybe
>> there is gap in this part
> We had that logic for a very long time now, but we recently simplified it. Could be that there was a bug introduced doing so.
>
> Maybe we should add a specific flag to run_job() to note that we are re-running a job and then always add VM flushes/pipeline syncs?
>
> But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.
>
> Regards,
> Christian.
>
>> /Monk
>> -----Original Message-----
>> From: Koenig, Christian
>> Sent: Monday, November 5, 2018 3:02 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>
>>> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
>> When you sync between submission things like composing X windows are slowed down massively.
>>
>> David Zhou had a use case which saw a >10% performance drop the last time he tried it.
>>
>>> The problem I hit is during the massive stress test against
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>> Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.
>>
>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>>
>> So you should never ever run into any issues in quark with that, even when we completely disable this workaround.
>>
>> Regards,
>> Christian.
>>
>> Am 04.11.18 um 01:48 schrieb Liu, Monk:
>>>> NAK, that would result in a severe performance drop.
>>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>> E.g. the explicit requested fence could already be signaled.
>>> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>>>
>>> The problem I hit is during the massive stress test against
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>>>
>>>
>>> BTW: for original logic, the pipeline sync have another corner case:
>>> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>>>
>>> jobA -> jobB -> (pipe sync)JobC
>>>
>>> if JobA really cost a lot of time to finish, in the
>>> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>>>
>>> while it is not a big issue, it is obviously not necessary: C has no
>>> relation with B
>>>
>>> /Monk
>>>
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Sunday, November 4, 2018 3:50 AM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>>
>>> Am 03.11.18 um 06:33 schrieb Monk Liu:
>>>> Reasons to drop it:
>>>>
>>>> 1) simplify the code: just introduce field member "need_pipe_sync"
>>>> for job is good enough to tell if the explicit dependency fence need
>>>> followed by a pipeline sync.
>>>>
>>>> 2) after GPU_recover the explicit fence from sched_syn will not come
>>>> back so the required pipeline_sync following it is missed, consider
>>>> scenario below:
>>>>> now on ring buffer:
>>>> Job-A -> pipe_sync -> Job-B
>>>>> TDR occurred on Job-A, and after GPU recover:
>>>>> now on ring buffer:
>>>> Job-A -> Job-B
>>>>
>>>> because the fence from sched_sync is used and freed after
>>>> ib_schedule in first time, it will never come back, with this patch
>>>> this issue could be avoided.
>>> NAK, that would result in a severe performance drop.
>>>
>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>
>>> E.g. the explicit requested fence could already be signaled.
>>>
>>> Christian.
>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>>>      3 files changed, 10 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index c48207b3..ac7d2da 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>>      	struct amdgpu_ib *ib = &ibs[0];
>>>> -	struct dma_fence *tmp = NULL;
>>>>      	bool skip_preamble, need_ctx_switch;
>>>>      	unsigned patch_offset = ~0;
>>>>      	struct amdgpu_vm *vm;
>>>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      	}
>>>>      
>>>>      	need_ctx_switch = ring->current_ctx != fence_ctx;
>>>> -	if (ring->funcs->emit_pipeline_sync && job &&
>>>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>>>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>>>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>>>> -		need_pipe_sync = true;
>>>>      
>>>> -		if (tmp)
>>>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>>>> -
>>>> -		dma_fence_put(tmp);
>>>> +	if (ring->funcs->emit_pipeline_sync && job) {
>>>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>>>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>>>> +			need_pipe_sync = true;
>>>> +		else if (job->need_pipe_sync)
>>>> +			need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	if (ring->funcs->insert_start)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index 1d71f8c..dae997d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>>      	(*job)->num_ibs = num_ibs;
>>>>      
>>>>      	amdgpu_sync_create(&(*job)->sync);
>>>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>>>      	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>>      	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>>>      
>>>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>>      	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>>>      
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>>>      	bool need_pipe_sync = false;
>>>>      	int r;
>>>>      
>>>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>>>> -	if (fence && need_pipe_sync) {
>>>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>>>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>>>> -					      fence, false);
>>>> -			if (r)
>>>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>>>> -		}
>>>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>>>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>>>> +		job->need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	while (fence == NULL && vm && !job->vmid) { diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index e1b46a6..c1d00f0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>>>      	struct drm_sched_job    base;
>>>>      	struct amdgpu_vm	*vm;
>>>>      	struct amdgpu_sync	sync;
>>>> -	struct amdgpu_sync	sched_sync;
>>>>      	struct amdgpu_ib	*ibs;
>>>>      	struct dma_fence	*fence; /* the hw fence */
>>>>      	uint32_t		preamble_status;
>>>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>>>      	/* user fence handling */
>>>>      	uint64_t		uf_addr;
>>>>      	uint64_t		uf_sequence;
>>>> -
>>>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>>>      };
>>>>      
>>>>      int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned
>>>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                                 ` <40e3e22f-f42e-85e0-f66e-c2d16d392289-5C7GfCeVMHo@public.gmane.org>
@ 2018-11-05 14:21                                   ` Liu, Monk
       [not found]                                     ` <CY4PR1201MB0245739F3F4FBC89DA2F18E284CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Liu, Monk @ 2018-11-05 14:21 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Zhou, David(ChunMing)

> Anyway I think the cleanest approach to always handle that correctly would be to always insert a vm flush before all jobs on resubmission. 
That is most likely better for VM flush handling as well.

Yeah, that’s true and simpler 

/Monk
-----Original Message-----
From: Koenig, Christian 
Sent: Monday, November 5, 2018 9:59 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, David(ChunMing) <David1.Zhou@amd.com>
Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync

> and later its VMID's "current_gpu_reset_count" is updated to "adev->gpu_reset_count"
The question is how much later that is done. My recollection is that we don't reset that for resubmission, but that could be wrong.

Anyway I think the cleanest approach to always handle that correctly would be to always insert a vm flush before all jobs on resubmission. 
That is most likely better for VM flush handling as well.

Christian.

Am 05.11.18 um 14:41 schrieb Liu, Monk:
> Hi Christian
>
> For scenario: Bad Job (hang, vmid1) -->Job A (context 10, explicit dep 
> for Job B, vmid2) --> Job B(context 10, vmid2) --> Job C (context 11, 
> vmid3)
>
> Assume "job_hang_limit" is 0, and assume "sched_hw_submission" is 4, I give a second thought on the logic after GPU reset:
>
> 1) the bad Job would be set guilty and skipped by scheduler,
> 2) the first re-submitted job (Job A) would be forced with a 
> pipeline-sync,
> 3) the first re-submitted job (Job A) would be forced with a vm-flush, and later its VMID's "current_gpu_reset_count" is updated to "adev->gpu_reset_count"
> 4) the second re-submitted job (Job B, assume it was from the same context of Job A, share the same page table/process, and no vm_update needed ) would not be forced with a pipeline-sync, and neither a vm-flush ...
>   
> Thus for Job B if it has an explicit dep on Job A, this explicit dep would get lost and there will be no pipeline sync inserted prior to Job B ...
>
> Do you think that's a possible corner case ?
>
> /Monk
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 5, 2018 3:48 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, 
> David(ChunMing) <David1.Zhou@amd.com>
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
> Am 05.11.18 um 08:24 schrieb Liu, Monk:
>>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>> I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only cost GPU couple clocks to move  on, right ? no reason to slow down up to 10% ... with 3dmark vulkan version test, the performance is barely different ... with my patch applied ...
> Why do you think that we insert a WAIT_MEM on an already signaled fence?
> The pipeline sync always wait for the last fence value (because we can't handle wraparounds in PM4).
>
> So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.
>
> I also expect that this doesn't affect 3dmark at all, but everything which runs in a window which is composed by X could be slowed down massively.
>
> David do you remember which use case was affected when you tried to drop this optimization?
>
>>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>> I will check that point in my branch, I didn't use drm-next, maybe 
>> there is gap in this part
> We had that logic for a very long time now, but we recently simplified it. Could be that there was a bug introduced doing so.
>
> Maybe we should add a specific flag to run_job() to note that we are re-running a job and then always add VM flushes/pipeline syncs?
>
> But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.
>
> Regards,
> Christian.
>
>> /Monk
>> -----Original Message-----
>> From: Koenig, Christian
>> Sent: Monday, November 5, 2018 3:02 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>
>>> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
>> When you sync between submission things like composing X windows are slowed down massively.
>>
>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>>
>>> The problem I hit is during the massive stress test against 
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>> Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.
>>
>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>>
>> So you should never ever run into any issues in quark with that, even when we completely disable this workaround.
>>
>> Regards,
>> Christian.
>>
>> Am 04.11.18 um 01:48 schrieb Liu, Monk:
>>>> NAK, that would result in a severe performance drop.
>>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>> E.g. the explicit requested fence could already be signaled.
>>> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>>>
>>> The problem I hit is during the massive stress test against 
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>>>
>>>
>>> BTW: for original logic, the pipeline sync have another corner case:
>>> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>>>
>>> jobA -> jobB -> (pipe sync)JobC
>>>
>>> if JobA really cost a lot of time to finish, in the
>>> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>>>
>>> while it is not a big issue but obviously not necessary: C have no 
>>> relation with B
>>>
>>> /Monk
>>>
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Sunday, November 4, 2018 3:50 AM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>>
>>> Am 03.11.18 um 06:33 schrieb Monk Liu:
>>>> Reasons to drop it:
>>>>
>>>> 1) simplify the code: just introduce field member "need_pipe_sync"
>>>> for job is good enough to tell if the explicit dependency fence 
>>>> need followed by a pipeline sync.
>>>>
>>>> 2) after GPU_recover the explicit fence from sched_syn will not 
>>>> come back so the required pipeline_sync following it is missed, 
>>>> consider scenario below:
>>>>> now on ring buffer:
>>>> Job-A -> pipe_sync -> Job-B
>>>>> TDR occurred on Job-A, and after GPU recover:
>>>>> now on ring buffer:
>>>> Job-A -> Job-B
>>>>
>>>> because the fence from sched_sync is used and freed after 
>>>> ib_schedule in first time, it will never come back, with this patch 
>>>> this issue could be avoided.
>>> NAK, that would result in a severe performance drop.
>>>
>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>
>>> E.g. the explicit requested fence could already be signaled.
>>>
>>> Christian.
>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>>>      3 files changed, 10 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index c48207b3..ac7d2da 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>>      	struct amdgpu_ib *ib = &ibs[0];
>>>> -	struct dma_fence *tmp = NULL;
>>>>      	bool skip_preamble, need_ctx_switch;
>>>>      	unsigned patch_offset = ~0;
>>>>      	struct amdgpu_vm *vm;
>>>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      	}
>>>>      
>>>>      	need_ctx_switch = ring->current_ctx != fence_ctx;
>>>> -	if (ring->funcs->emit_pipeline_sync && job &&
>>>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>>>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>>>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>>>> -		need_pipe_sync = true;
>>>>      
>>>> -		if (tmp)
>>>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>>>> -
>>>> -		dma_fence_put(tmp);
>>>> +	if (ring->funcs->emit_pipeline_sync && job) {
>>>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>>>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>>>> +			need_pipe_sync = true;
>>>> +		else if (job->need_pipe_sync)
>>>> +			need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	if (ring->funcs->insert_start) diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index 1d71f8c..dae997d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>>      	(*job)->num_ibs = num_ibs;
>>>>      
>>>>      	amdgpu_sync_create(&(*job)->sync);
>>>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>>>      	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>>      	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>>>      
>>>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>>      	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>>>      
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>>>      	bool need_pipe_sync = false;
>>>>      	int r;
>>>>      
>>>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>>>> -	if (fence && need_pipe_sync) {
>>>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>>>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>>>> -					      fence, false);
>>>> -			if (r)
>>>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>>>> -		}
>>>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>>>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>>>> +		job->need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	while (fence == NULL && vm && !job->vmid) { diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index e1b46a6..c1d00f0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>>>      	struct drm_sched_job    base;
>>>>      	struct amdgpu_vm	*vm;
>>>>      	struct amdgpu_sync	sync;
>>>> -	struct amdgpu_sync	sched_sync;
>>>>      	struct amdgpu_ib	*ibs;
>>>>      	struct dma_fence	*fence; /* the hw fence */
>>>>      	uint32_t		preamble_status;
>>>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>>>      	/* user fence handling */
>>>>      	uint64_t		uf_addr;
>>>>      	uint64_t		uf_sequence;
>>>> -
>>>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>>>      };
>>>>      
>>>>      int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned 
>>>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync
       [not found]                                     ` <CY4PR1201MB0245739F3F4FBC89DA2F18E284CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2018-11-05 14:31                                       ` Liu, Monk
  0 siblings, 0 replies; 19+ messages in thread
From: Liu, Monk @ 2018-11-05 14:31 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhou, David(ChunMing)

> The question is how much later that is done. My recollection is that we don't reset that for resubmission, but that could be wrong.

According to my code (drm-next) VMID's "current_gpu_reset_counter" is updated in "vm_flush" routine, so it is always there for either resubmission or not ...

See the code in vm_flush():

	if (vm_flush_needed) {
		mutex_lock(&id_mgr->lock);
		dma_fence_put(id->last_flush);
		id->last_flush = dma_fence_get(fence);
		id->current_gpu_reset_count =
			atomic_read(&adev->gpu_reset_counter);
		mutex_unlock(&id_mgr->lock);
	}

/Monk

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Liu, Monk
Sent: Monday, November 5, 2018 10:22 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, David(ChunMing) <David1.Zhou@amd.com>
Subject: RE: [PATCH 2/3] drm/amdgpu: drop the sched_sync

> Anyway I think the cleanest approach to always handle that correctly would be to always insert a vm flush before all jobs on resubmission. 
That is most likely better for VM flush handling as well.

Yeah, that’s true and more simple 

/Monk
-----Original Message-----
From: Koenig, Christian
Sent: Monday, November 5, 2018 9:59 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou, David(ChunMing) <David1.Zhou@amd.com>
Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync

> and later its VMID's "current_gpu_reset_count" is updated to "adev->gpu_reset_count"
The question is how much later that is done. My recollection is that we don't reset that for resubmission, but that could be wrong.

Anyway I think the cleanest approach to always handle that correctly would be to always insert a vm flush before all jobs on resubmission. 
That is most likely better for VM flush handling as well.

Christian.

Am 05.11.18 um 14:41 schrieb Liu, Monk:
> Hi Christian
>
> For scenario: Bad Job (hang, vmid1) -->Job A (context 10, explicit dep 
> for Job B, vmid2) --> Job B(context 10, vmid2) --> Job C (context 11,
> vmid3)
>
> Assume "job_hang_limit" is 0, and assume "sched_hw_submission" is 4, I give a second thought on the logic after GPU reset:
>
> 1) the bad Job would be set guilty and skipped by scheduler,
> 2) the first re-submitted job (Job A) would be forced with a 
> pipeline-sync,
> 3) the first re-submitted job (Job A) would be forced with a vm-flush, and later its VMID's "current_gpu_reset_count" is updated to "adev->gpu_reset_count"
> 4) the second re-submitted job (Job B, assume it was from the same context of Job A, share the same page table/process, and no vm_update needed ) would not be forced with a pipeline-sync, and neither a vm-flush ...
>   
> Thus for Job B if it has an explicit dep on Job A, this explicit dep would get lost and there will be no pipeline sync inserted prior to Job B ...
>
> Do you think that's a possible corner case ?
>
> /Monk
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 5, 2018 3:48 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Zhou,
> David(ChunMing) <David1.Zhou@amd.com>
> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>
> Am 05.11.18 um 08:24 schrieb Liu, Monk:
>>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>> I really don't believe that, because if you insert a WAIT_MEM on an already signaled fence, it only cost GPU couple clocks to move  on, right ? no reason to slow down up to 10% ... with 3dmark vulkan version test, the performance is barely different ... with my patch applied ...
> Why do you think that we insert a WAIT_MEM on an already signaled fence?
> The pipeline sync always wait for the last fence value (because we can't handle wraparounds in PM4).
>
> So you have a pipeline sync when you don't need one and that is really really bad for things shared between processes, e.g. X/Wayland and it's clients.
>
> I also expects that this doesn't effect 3dmark at all, but everything which runs in a window which is composed by X could be slowed down massively.
>
> David do you remember which use case was affected when you tried to drop this optimization?
>
>>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>> I will check that point in my branch, I didn't use drm-next, maybe 
>> there is gap in this part
> We had that logic for a very long time now, but we recently simplified it. Could be that there was a bug introduced doing so.
>
> Maybe we should add a specific flag to run_job() to note that we are re-running a job and then always add VM flushes/pipeline syncs?
>
> But my main question is why do you see any issues with quark? That is a workaround for an issue for Vulkan sync handling and should only surface when a specific test is run many many times.
>
> Regards,
> Christian.
>
>> /Monk
>> -----Original Message-----
>> From: Koenig, Christian
>> Sent: Monday, November 5, 2018 3:02 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>
>>> Can you tell me which game/benchmark will have performance drop with this fix by your understanding ?
>> When you sync between submission things like composing X windows are slowed down massively.
>>
>> David Zhou had an use case which saw a >10% performance drop the last time he tried it.
>>
>>> The problem I hit is during the massive stress test against 
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>> Well that is really strange. This workaround is only for a very specific Vulkan CTS test which we are still not 100% sure is actually valid.
>>
>> When a reset happens we flush the VMIDs when re-submitting the jobs to the rings and while doing so we also always do a pipeline sync.
>>
>> So you should never ever run into any issues in quark with that, even when we completely disable this workaround.
>>
>> Regards,
>> Christian.
>>
>> Am 04.11.18 um 01:48 schrieb Liu, Monk:
>>>> NAK, that would result in a severe performance drop.
>>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>> E.g. the explicit requested fence could already be signaled.
>>> For the performance issue, only insert a WAIT_REG_MEM on GFX/compute ring *doesn't* give the "severe" drop (it's mimic in fact) ...  At least I didn't observe any performance drop with 3dmark benchmark (also tested vulkan CTS), Can you tell me which game/benchmark will have performance drop with this fix by your understanding ? let me check it .
>>>
>>> The problem I hit is during the massive stress test against 
>>> multi-process + quark , if the quark process hang the engine while there is another two job following the bad job, After the TDR these two job will lose the explicit and the pipeline-sync was also lost.
>>>
>>>
>>> BTW: for original logic, the pipeline sync have another corner case:
>>> Assume JobC depend on JobA with explicit flag, and there is jobB inserted in ring:
>>>
>>> jobA -> jobB -> (pipe sync)JobC
>>>
>>> if JobA really cost a lot of time to finish, in the
>>> amdgpu_ib_schedule() stage you will insert a pipeline sync for JobC against its explicit dependency which is JobA, but there is a JobB between A and C and the pipeline sync of before JobC will wrongly wait on the JobB ...
>>>
>>> while it is not a big issue but obviously not necessary: C have no 
>>> relation with B
>>>
>>> /Monk
>>>
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Sunday, November 4, 2018 3:50 AM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 2/3] drm/amdgpu: drop the sched_sync
>>>
>>> Am 03.11.18 um 06:33 schrieb Monk Liu:
>>>> Reasons to drop it:
>>>>
>>>> 1) simplify the code: just introduce field member "need_pipe_sync"
>>>> for job is good enough to tell if the explicit dependency fence 
>>>> need followed by a pipeline sync.
>>>>
>>>> 2) after GPU_recover the explicit fence from sched_sync will not 
>>>> come back so the required pipeline_sync following it is missed, 
>>>> consider scenario below:
>>>>> now on ring buffer:
>>>> Job-A -> pipe_sync -> Job-B
>>>>> TDR occurred on Job-A, and after GPU recover:
>>>>> now on ring buffer:
>>>> Job-A -> Job-B
>>>>
>>>> because the fence from sched_sync is used and freed after 
>>>> ib_schedule in first time, it will never come back, with this patch 
>>>> this issue could be avoided.
>>> NAK, that would result in a severe performance drop.
>>>
>>> We need the fence here to determine if we actually need to do the pipeline sync or not.
>>>
>>> E.g. the explicit requested fence could already be signaled.
>>>
>>> Christian.
>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 ++++++----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +++-----------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  3 +--
>>>>      3 files changed, 10 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index c48207b3..ac7d2da 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -122,7 +122,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>>      	struct amdgpu_ib *ib = &ibs[0];
>>>> -	struct dma_fence *tmp = NULL;
>>>>      	bool skip_preamble, need_ctx_switch;
>>>>      	unsigned patch_offset = ~0;
>>>>      	struct amdgpu_vm *vm;
>>>> @@ -166,16 +165,13 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>      	}
>>>>      
>>>>      	need_ctx_switch = ring->current_ctx != fence_ctx;
>>>> -	if (ring->funcs->emit_pipeline_sync && job &&
>>>> -	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
>>>> -	     (amdgpu_sriov_vf(adev) && need_ctx_switch) ||
>>>> -	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>>>> -		need_pipe_sync = true;
>>>>      
>>>> -		if (tmp)
>>>> -			trace_amdgpu_ib_pipe_sync(job, tmp);
>>>> -
>>>> -		dma_fence_put(tmp);
>>>> +	if (ring->funcs->emit_pipeline_sync && job) {
>>>> +		if ((need_ctx_switch && amdgpu_sriov_vf(adev)) ||
>>>> +			amdgpu_vm_need_pipeline_sync(ring, job))
>>>> +			need_pipe_sync = true;
>>>> +		else if (job->need_pipe_sync)
>>>> +			need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	if (ring->funcs->insert_start) diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index 1d71f8c..dae997d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -71,7 +71,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>>      	(*job)->num_ibs = num_ibs;
>>>>      
>>>>      	amdgpu_sync_create(&(*job)->sync);
>>>> -	amdgpu_sync_create(&(*job)->sched_sync);
>>>>      	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>>      	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>>>>      
>>>> @@ -117,7 +116,6 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>>      	amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -127,7 +125,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>>>      
>>>>      	dma_fence_put(job->fence);
>>>>      	amdgpu_sync_free(&job->sync);
>>>> -	amdgpu_sync_free(&job->sched_sync);
>>>>      	kfree(job);
>>>>      }
>>>>      
>>>> @@ -182,14 +179,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>>>      	bool need_pipe_sync = false;
>>>>      	int r;
>>>>      
>>>> -	fence = amdgpu_sync_get_fence(&job->sync, &need_pipe_sync);
>>>> -	if (fence && need_pipe_sync) {
>>>> -		if (drm_sched_dependency_optimized(fence, s_entity)) {
>>>> -			r = amdgpu_sync_fence(ring->adev, &job->sched_sync,
>>>> -					      fence, false);
>>>> -			if (r)
>>>> -				DRM_ERROR("Error adding fence (%d)\n", r);
>>>> -		}
>>>> +	if (fence && need_pipe_sync && drm_sched_dependency_optimized(fence, s_entity)) {
>>>> +		trace_amdgpu_ib_pipe_sync(job, fence);
>>>> +		job->need_pipe_sync = true;
>>>>      	}
>>>>      
>>>>      	while (fence == NULL && vm && !job->vmid) { diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index e1b46a6..c1d00f0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -41,7 +41,6 @@ struct amdgpu_job {
>>>>      	struct drm_sched_job    base;
>>>>      	struct amdgpu_vm	*vm;
>>>>      	struct amdgpu_sync	sync;
>>>> -	struct amdgpu_sync	sched_sync;
>>>>      	struct amdgpu_ib	*ibs;
>>>>      	struct dma_fence	*fence; /* the hw fence */
>>>>      	uint32_t		preamble_status;
>>>> @@ -59,7 +58,7 @@ struct amdgpu_job {
>>>>      	/* user fence handling */
>>>>      	uint64_t		uf_addr;
>>>>      	uint64_t		uf_sequence;
>>>> -
>>>> +	bool            need_pipe_sync; /* require a pipeline sync for this job */
>>>>      };
>>>>      
>>>>      int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned 
>>>> num_ibs,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2018-11-05 14:31 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-03  5:33 [PATCH 1/3] drm/amdgpu: rename explicit to need_pipe_sync for better understanding Monk Liu
     [not found] ` <1541223223-27020-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2018-11-03  5:33   ` [PATCH 2/3] drm/amdgpu: drop the sched_sync Monk Liu
     [not found]     ` <1541223223-27020-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2018-11-03 19:49       ` Christian König
     [not found]         ` <251ea9fd-910f-24aa-343c-a87b5039b4dd-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-11-04  0:21           ` Liu, Monk
2018-11-04  0:48           ` Liu, Monk
     [not found]             ` <BN6PR1201MB024149B0B50D889C4CEF655984C90-6iU6OBHu2P+jgkzTOKc/hWrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2018-11-04 19:02               ` Koenig, Christian
     [not found]                 ` <0abab90d-dd54-290d-a1cf-38047ec66b9f-5C7GfCeVMHo@public.gmane.org>
2018-11-05  7:24                   ` Liu, Monk
     [not found]                     ` <CY4PR1201MB02455363976262BBDC58415484CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2018-11-05  7:47                       ` Koenig, Christian
     [not found]                         ` <9e5d6f91-9e20-8dcc-953f-e1e980b9e0d3-5C7GfCeVMHo@public.gmane.org>
2018-11-05  7:50                           ` Zhou, David(ChunMing)
     [not found]                             ` <BY1PR12MB0502C523B2FA785240D12E36B4CA0-PicGAnIBOobrCwm+z9iKNgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-11-05  8:37                               ` Liu, Monk
2018-11-05  8:58                           ` Liu, Monk
     [not found]                             ` <CY4PR1201MB0245543F654663532113C54A84CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2018-11-05 11:11                               ` Koenig, Christian
     [not found]                                 ` <94885af2-43eb-45a3-513a-89d838dbaa95-5C7GfCeVMHo@public.gmane.org>
2018-11-05 13:47                                   ` Liu, Monk
2018-11-05 13:41                           ` Liu, Monk
     [not found]                             ` <CY4PR1201MB024516A3396DBF651EAADA5684CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2018-11-05 13:59                               ` Koenig, Christian
     [not found]                                 ` <40e3e22f-f42e-85e0-f66e-c2d16d392289-5C7GfCeVMHo@public.gmane.org>
2018-11-05 14:21                                   ` Liu, Monk
     [not found]                                     ` <CY4PR1201MB0245739F3F4FBC89DA2F18E284CA0-1iTaO6aE1DBfNQakwlCMTGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2018-11-05 14:31                                       ` Liu, Monk
2018-11-03  5:33   ` [PATCH 3/3] drm/amdgpu: drop need_vm_flush/need_pipe_sync Monk Liu
     [not found]     ` <1541223223-27020-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2018-11-03 19:48       ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.