All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job
@ 2021-08-05  8:31 Jingwen Chen
  2021-08-05  8:31 ` [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence Jingwen Chen
  2021-08-05 21:13 ` [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job Andrey Grodzovsky
  0 siblings, 2 replies; 10+ messages in thread
From: Jingwen Chen @ 2021-08-05  8:31 UTC (permalink / raw)
  To: amd-gfx
  Cc: monk.liu, christian.koenig, Andrey.Grodzovsky, Jack Zhang,
	Jingwen Chen, Jack Zhang

From: Jack Zhang <Jack.Zhang1@amd.com>

Why: Previously the hw fence was allocated separately from the job.
This caused historical lifetime issues and corner cases.
The ideal situation is to use the fence to manage both the job's
and the fence's lifetime, and to simplify the design of the gpu-scheduler.

How:
We propose to embed hw_fence into amdgpu_job.
1. We cover the normal job submission by this method.
2. For ib_test, and for submissions without a parent job, keep the
legacy way of creating a hw fence separately.
v2:
use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
embedded in a job.

Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 ++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c      |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 ++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
 8 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7b46ba551cb2..3003ee1c9487 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
 	ret = dma_fence_wait(f, false);
 
 err_ib_sched:
-	dma_fence_put(f);
 	amdgpu_job_free(job);
 err:
 	return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 536005bff24a..277128846dd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
 			continue;
 		}
 		job = to_amdgpu_job(s_job);
-		if (preempted && job->fence == fence)
+		if (preempted && (&job->hw_fence) == fence)
 			/* mark the job as preempted */
 			job->preemption_status |= AMDGPU_IB_PREEMPTED;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7495911516c2..5e29d797a265 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
  *
  * @ring: ring the fence is associated with
  * @f: resulting fence object
+ * @job: job the fence is embeded in
  * @flags: flags to pass into the subordinate .emit_fence() call
  *
  * Emits a fence command on the requested ring (all asics).
  * Returns 0 on success, -ENOMEM on failure.
  */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amdgpu_job *job,
 		      unsigned flags)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_fence *fence;
+	struct dma_fence *fence;
+	struct amdgpu_fence *am_fence;
 	struct dma_fence __rcu **ptr;
 	uint32_t seq;
 	int r;
 
-	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
-	if (fence == NULL)
-		return -ENOMEM;
+	if (job == NULL) {
+		/* create a sperate hw fence */
+		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
+		if (am_fence == NULL)
+			return -ENOMEM;
+		fence = &am_fence->base;
+		am_fence->ring = ring;
+	} else {
+		/* take use of job-embedded fence */
+		fence = &job->hw_fence;
+		job->ring = ring;
+	}
 
 	seq = ++ring->fence_drv.sync_seq;
-	fence->ring = ring;
-	dma_fence_init(&fence->base, &amdgpu_fence_ops,
+	dma_fence_init(fence, &amdgpu_fence_ops,
 		       &ring->fence_drv.lock,
 		       adev->fence_context + ring->idx,
 		       seq);
+
+	if (job != NULL) {
+		/* mark this fence has a parent job */
+		set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
+	}
+
 	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
 			       seq, flags | AMDGPU_FENCE_FLAG_INT);
 	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
@@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
 	/* This function can't be called concurrently anyway, otherwise
 	 * emitting the fence would mess up the hardware ring buffer.
 	 */
-	rcu_assign_pointer(*ptr, dma_fence_get(&fence->base));
+	rcu_assign_pointer(*ptr, dma_fence_get(fence));
 
-	*f = &fence->base;
+	*f = fence;
 
 	return 0;
 }
@@ -621,8 +637,16 @@ static const char *amdgpu_fence_get_driver_name(struct dma_fence *fence)
 
 static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
 {
-	struct amdgpu_fence *fence = to_amdgpu_fence(f);
-	return (const char *)fence->ring->name;
+	struct amdgpu_ring *ring;
+
+	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
+		struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
+
+		ring = job->ring;
+	} else {
+		ring = to_amdgpu_fence(f)->ring;
+	}
+	return (const char *)ring->name;
 }
 
 /**
@@ -656,8 +680,20 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
 static void amdgpu_fence_free(struct rcu_head *rcu)
 {
 	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
-	struct amdgpu_fence *fence = to_amdgpu_fence(f);
-	kmem_cache_free(amdgpu_fence_slab, fence);
+
+	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
+	/* free job if fence has a parent job */
+		struct amdgpu_job *job;
+
+		job = container_of(f, struct amdgpu_job, hw_fence);
+		kfree(job);
+	} else {
+	/* free fence_slab if it's separated fence*/
+		struct amdgpu_fence *fence;
+
+		fence = to_amdgpu_fence(f);
+		kmem_cache_free(amdgpu_fence_slab, fence);
+	}
 }
 
 /**
@@ -680,6 +716,7 @@ static const struct dma_fence_ops amdgpu_fence_ops = {
 	.release = amdgpu_fence_release,
 };
 
+
 /*
  * Fence debugfs
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index ec65ab0ddf89..c076a6b9a5a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -262,7 +262,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
 	}
 
-	r = amdgpu_fence_emit(ring, f, fence_flags);
+	r = amdgpu_fence_emit(ring, f, job, fence_flags);
 	if (r) {
 		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
 		if (job && job->vmid)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index d33e6d97cc89..65a395060de2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -127,11 +127,16 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
 {
 	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
 	struct dma_fence *f;
+	struct dma_fence *hw_fence;
 	unsigned i;
 
-	/* use sched fence if available */
-	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;
+	if (job->hw_fence.ops == NULL)
+		hw_fence = job->external_hw_fence;
+	else
+		hw_fence = &job->hw_fence;
 
+	/* use sched fence if available */
+	f = job->base.s_fence ? &job->base.s_fence->finished : hw_fence;
 	for (i = 0; i < job->num_ibs; ++i)
 		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
 }
@@ -142,20 +147,27 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
 
 	drm_sched_job_cleanup(s_job);
 
-	dma_fence_put(job->fence);
 	amdgpu_sync_free(&job->sync);
 	amdgpu_sync_free(&job->sched_sync);
-	kfree(job);
+
+    /* only put the hw fence if has embedded fence */
+	if (job->hw_fence.ops != NULL)
+		dma_fence_put(&job->hw_fence);
+	else
+		kfree(job);
 }
 
 void amdgpu_job_free(struct amdgpu_job *job)
 {
 	amdgpu_job_free_resources(job);
-
-	dma_fence_put(job->fence);
 	amdgpu_sync_free(&job->sync);
 	amdgpu_sync_free(&job->sched_sync);
-	kfree(job);
+
+	/* only put the hw fence if has embedded fence */
+	if (job->hw_fence.ops != NULL)
+		dma_fence_put(&job->hw_fence);
+	else
+		kfree(job);
 }
 
 int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
@@ -184,11 +196,14 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
 
 	job->base.sched = &ring->sched;
 	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
-	job->fence = dma_fence_get(*fence);
+	/* record external_hw_fence for direct submit */
+	job->external_hw_fence = dma_fence_get(*fence);
 	if (r)
 		return r;
 
 	amdgpu_job_free(job);
+	dma_fence_put(*fence);
+
 	return 0;
 }
 
@@ -246,10 +261,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 		if (r)
 			DRM_ERROR("Error scheduling IBs (%d)\n", r);
 	}
-	/* if gpu reset, hw fence will be replaced here */
-	dma_fence_put(job->fence);
-	job->fence = dma_fence_get(fence);
 
+	dma_fence_get(fence);
 	amdgpu_job_free_resources(job);
 
 	fence = r ? ERR_PTR(r) : fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 81caac9b958a..92324c978534 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -46,7 +46,9 @@ struct amdgpu_job {
 	struct amdgpu_sync	sync;
 	struct amdgpu_sync	sched_sync;
 	struct amdgpu_ib	*ibs;
-	struct dma_fence	*fence; /* the hw fence */
+	struct dma_fence	hw_fence;
+	struct amdgpu_ring *ring;
+	struct dma_fence	*external_hw_fence;
 	uint32_t		preamble_status;
 	uint32_t                preemption_status;
 	uint32_t		num_ibs;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 9c11ced4312c..03d4b29a76d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -48,6 +48,9 @@
 #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
 #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
 
+/* fence flag bit to indicate the face is embeded in job*/
+#define AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT		(DMA_FENCE_FLAG_USER_BITS + 1)
+
 #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
 
 #define AMDGPU_IB_POOL_SIZE	(1024 * 1024)
@@ -118,7 +121,7 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
 void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
 int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
 void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, struct amdgpu_job *job,
 		      unsigned flags);
 int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
 			      uint32_t timeout);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 2a88ed5d983b..2af8860d74cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1218,7 +1218,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
 
 	if (vm_flush_needed || pasid_mapping_needed) {
-		r = amdgpu_fence_emit(ring, &fence, 0);
+		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
 		if (r)
 			return r;
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence
  2021-08-05  8:31 [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job Jingwen Chen
@ 2021-08-05  8:31 ` Jingwen Chen
  2021-08-09 16:24   ` Andrey Grodzovsky
  2021-08-05 21:13 ` [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job Andrey Grodzovsky
  1 sibling, 1 reply; 10+ messages in thread
From: Jingwen Chen @ 2021-08-05  8:31 UTC (permalink / raw)
  To: amd-gfx
  Cc: monk.liu, christian.koenig, Andrey.Grodzovsky, Jingwen Chen, Jack Zhang

[Why]
After embedding the hw_fence into amdgpu_job, we need to add tdr
support for this feature.

[How]
1. Add a resubmit_flag for resubmitted jobs.
2. Clear job fences from the RCU pointers and force-complete vm flush
   fences in pre_asic_reset.
3. Skip dma_fence_get for resubmitted jobs and add a dma_fence_put
   for guilty jobs.
v2:
use a job_run_counter in amdgpu_job to replace the resubmit_flag in
drm_sched_job. When the job_run_counter >= 1, it means this job is a
resubmit job.

Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  5 ++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 +++
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9e53ff851496..ade2fa07a50a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 				 struct amdgpu_reset_context *reset_context)
 {
-	int i, r = 0;
+	int i, j, r = 0;
 	struct amdgpu_job *job = NULL;
 	bool need_full_reset =
 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		if (!ring || !ring->sched.thread)
 			continue;
 
+		/*clear job fence from fence drv to avoid force_completion
+		 *leave NULL and vm flush fence in fence drv */
+		for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) {
+			struct dma_fence *old,**ptr;
+			ptr = &ring->fence_drv.fences[j];
+			old = rcu_dereference_protected(*ptr, 1);
+			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
+				RCU_INIT_POINTER(*ptr, NULL);
+			}
+		}
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5e29d797a265..c9752cf794fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
 	}
 
 	seq = ++ring->fence_drv.sync_seq;
-	dma_fence_init(fence, &amdgpu_fence_ops,
-		       &ring->fence_drv.lock,
-		       adev->fence_context + ring->idx,
-		       seq);
+	if (job != NULL && job->job_run_counter) {
+		/* reinit seq for resubmitted jobs */
+		fence->seqno = seq;
+	} else {
+		dma_fence_init(fence, &amdgpu_fence_ops,
+				&ring->fence_drv.lock,
+				adev->fence_context + ring->idx,
+				seq);
+	}
 
 	if (job != NULL) {
 		/* mark this fence has a parent job */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 65a395060de2..19b13a65c73b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
 
 	if (finished->error < 0) {
+		dma_fence_put(&job->hw_fence);
 		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
 		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
@@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 			DRM_ERROR("Error scheduling IBs (%d)\n", r);
 	}
 
-	dma_fence_get(fence);
+	if (!job->job_run_counter)
+		dma_fence_get(fence);
+	job->job_run_counter ++;
 	amdgpu_job_free_resources(job);
 
 	fence = r ? ERR_PTR(r) : fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 92324c978534..1fa667f245e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -64,6 +64,9 @@ struct amdgpu_job {
 	/* user fence handling */
 	uint64_t		uf_addr;
 	uint64_t		uf_sequence;
+
+	/* job_run_counter >= 1 means a resubmit job */
+	uint32_t		job_run_counter;
 };
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job
  2021-08-05  8:31 [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job Jingwen Chen
  2021-08-05  8:31 ` [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence Jingwen Chen
@ 2021-08-05 21:13 ` Andrey Grodzovsky
  2021-08-06  5:52   ` Jingwen Chen
  1 sibling, 1 reply; 10+ messages in thread
From: Andrey Grodzovsky @ 2021-08-05 21:13 UTC (permalink / raw)
  To: Jingwen Chen, amd-gfx; +Cc: monk.liu, christian.koenig, Jack Zhang, Jack Zhang


On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> From: Jack Zhang <Jack.Zhang1@amd.com>
>
> Why: Previously hw fence is alloced separately with job.
> It caused historical lifetime issues and corner cases.
> The ideal situation is to take fence to manage both job
> and fence's lifetime, and simplify the design of gpu-scheduler.
>
> How:
> We propose to embed hw_fence into amdgpu_job.
> 1. We cover the normal job submission by this method.
> 2. For ib_test, and submit without a parent job keep the
> legacy way to create a hw fence separately.
> v2:
> use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
> embeded in a job.
>
> Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
> Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 ++++++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c      |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 ++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  4 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  5 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
>   8 files changed, 84 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 7b46ba551cb2..3003ee1c9487 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>   	ret = dma_fence_wait(f, false);
>   
>   err_ib_sched:
> -	dma_fence_put(f);
>   	amdgpu_job_free(job);
>   err:
>   	return ret;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 536005bff24a..277128846dd1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
>   			continue;
>   		}
>   		job = to_amdgpu_job(s_job);
> -		if (preempted && job->fence == fence)
> +		if (preempted && (&job->hw_fence) == fence)
>   			/* mark the job as preempted */
>   			job->preemption_status |= AMDGPU_IB_PREEMPTED;
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 7495911516c2..5e29d797a265 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
>    *
>    * @ring: ring the fence is associated with
>    * @f: resulting fence object
> + * @job: job the fence is embeded in
>    * @flags: flags to pass into the subordinate .emit_fence() call
>    *
>    * Emits a fence command on the requested ring (all asics).
>    * Returns 0 on success, -ENOMEM on failure.
>    */
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amdgpu_job *job,
>   		      unsigned flags)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_fence *fence;
> +	struct dma_fence *fence;
> +	struct amdgpu_fence *am_fence;
>   	struct dma_fence __rcu **ptr;
>   	uint32_t seq;
>   	int r;
>   
> -	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> -	if (fence == NULL)
> -		return -ENOMEM;
> +	if (job == NULL) {
> +		/* create a sperate hw fence */
> +		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
> +		if (am_fence == NULL)
> +			return -ENOMEM;
> +		fence = &am_fence->base;
> +		am_fence->ring = ring;
> +	} else {
> +		/* take use of job-embedded fence */
> +		fence = &job->hw_fence;
> +		job->ring = ring;


If you would make hw_fence of type amdgpu_fence
you could probably avoid the special job->ring = ring
See more in related comment at the bottom


> +	}
>   
>   	seq = ++ring->fence_drv.sync_seq;
> -	fence->ring = ring;
> -	dma_fence_init(&fence->base, &amdgpu_fence_ops,
> +	dma_fence_init(fence, &amdgpu_fence_ops,
>   		       &ring->fence_drv.lock,
>   		       adev->fence_context + ring->idx,
>   		       seq);
> +
> +	if (job != NULL) {
> +		/* mark this fence has a parent job */
> +		set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
> +	}
> +
>   	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>   			       seq, flags | AMDGPU_FENCE_FLAG_INT);
>   	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
> @@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
>   	/* This function can't be called concurrently anyway, otherwise
>   	 * emitting the fence would mess up the hardware ring buffer.
>   	 */
> -	rcu_assign_pointer(*ptr, dma_fence_get(&fence->base));
> +	rcu_assign_pointer(*ptr, dma_fence_get(fence));
>   
> -	*f = &fence->base;
> +	*f = fence;
>   
>   	return 0;
>   }
> @@ -621,8 +637,16 @@ static const char *amdgpu_fence_get_driver_name(struct dma_fence *fence)
>   
>   static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
>   {
> -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> -	return (const char *)fence->ring->name;
> +	struct amdgpu_ring *ring;
> +
> +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> +		struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
> +
> +		ring = job->ring;
> +	} else {
> +		ring = to_amdgpu_fence(f)->ring;
> +	}


Same as above


> +	return (const char *)ring->name;
>   }
>   
>   /**
> @@ -656,8 +680,20 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
>   static void amdgpu_fence_free(struct rcu_head *rcu)
>   {
>   	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
> -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> -	kmem_cache_free(amdgpu_fence_slab, fence);
> +
> +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> +	/* free job if fence has a parent job */
> +		struct amdgpu_job *job;
> +
> +		job = container_of(f, struct amdgpu_job, hw_fence);
> +		kfree(job);
> +	} else {
> +	/* free fence_slab if it's separated fence*/
> +		struct amdgpu_fence *fence;
> +
> +		fence = to_amdgpu_fence(f);
> +		kmem_cache_free(amdgpu_fence_slab, fence);
> +	}
>   }
>   
>   /**
> @@ -680,6 +716,7 @@ static const struct dma_fence_ops amdgpu_fence_ops = {
>   	.release = amdgpu_fence_release,
>   };
>   
> +
>   /*
>    * Fence debugfs
>    */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index ec65ab0ddf89..c076a6b9a5a2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -262,7 +262,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
>   	}
>   
> -	r = amdgpu_fence_emit(ring, f, fence_flags);
> +	r = amdgpu_fence_emit(ring, f, job, fence_flags);
>   	if (r) {
>   		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
>   		if (job && job->vmid)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index d33e6d97cc89..65a395060de2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -127,11 +127,16 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
>   {
>   	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
>   	struct dma_fence *f;
> +	struct dma_fence *hw_fence;
>   	unsigned i;
>   
> -	/* use sched fence if available */
> -	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;
> +	if (job->hw_fence.ops == NULL)
> +		hw_fence = job->external_hw_fence;
> +	else
> +		hw_fence = &job->hw_fence;
>   
> +	/* use sched fence if available */
> +	f = job->base.s_fence ? &job->base.s_fence->finished : hw_fence;
>   	for (i = 0; i < job->num_ibs; ++i)
>   		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
>   }
> @@ -142,20 +147,27 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>   
>   	drm_sched_job_cleanup(s_job);
>   
> -	dma_fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
>   	amdgpu_sync_free(&job->sched_sync);
> -	kfree(job);
> +
> +    /* only put the hw fence if has embedded fence */
> +	if (job->hw_fence.ops != NULL)
> +		dma_fence_put(&job->hw_fence);
> +	else
> +		kfree(job);
>   }
>   
>   void amdgpu_job_free(struct amdgpu_job *job)
>   {
>   	amdgpu_job_free_resources(job);
> -
> -	dma_fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
>   	amdgpu_sync_free(&job->sched_sync);
> -	kfree(job);
> +
> +	/* only put the hw fence if has embedded fence */
> +	if (job->hw_fence.ops != NULL)
> +		dma_fence_put(&job->hw_fence);
> +	else
> +		kfree(job);
>   }
>   
>   int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
> @@ -184,11 +196,14 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>   
>   	job->base.sched = &ring->sched;
>   	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
> -	job->fence = dma_fence_get(*fence);
> +	/* record external_hw_fence for direct submit */
> +	job->external_hw_fence = dma_fence_get(*fence);
>   	if (r)
>   		return r;
>   
>   	amdgpu_job_free(job);
> +	dma_fence_put(*fence);
> +
>   	return 0;
>   }
>   
> @@ -246,10 +261,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   		if (r)
>   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>   	}
> -	/* if gpu reset, hw fence will be replaced here */
> -	dma_fence_put(job->fence);
> -	job->fence = dma_fence_get(fence);
>   
> +	dma_fence_get(fence);
>   	amdgpu_job_free_resources(job);
>   
>   	fence = r ? ERR_PTR(r) : fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 81caac9b958a..92324c978534 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -46,7 +46,9 @@ struct amdgpu_job {
>   	struct amdgpu_sync	sync;
>   	struct amdgpu_sync	sched_sync;
>   	struct amdgpu_ib	*ibs;
> -	struct dma_fence	*fence; /* the hw fence */
> +	struct dma_fence	hw_fence;
> +	struct amdgpu_ring *ring;

Why not instead of 2 fields above just embed   struct amdgpu_fence as
hw_fence  and by this save the extra 'ring' field handling ?

Andrey


> +	struct dma_fence	*external_hw_fence;
>   	uint32_t		preamble_status;
>   	uint32_t                preemption_status;
>   	uint32_t		num_ibs;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 9c11ced4312c..03d4b29a76d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -48,6 +48,9 @@
>   #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
>   #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
>   
> +/* fence flag bit to indicate the face is embeded in job*/
> +#define AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT		(DMA_FENCE_FLAG_USER_BITS + 1)
> +
>   #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
>   
>   #define AMDGPU_IB_POOL_SIZE	(1024 * 1024)
> @@ -118,7 +121,7 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
>   int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, struct amdgpu_job *job,
>   		      unsigned flags);
>   int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
>   			      uint32_t timeout);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 2a88ed5d983b..2af8860d74cc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1218,7 +1218,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
>   
>   	if (vm_flush_needed || pasid_mapping_needed) {
> -		r = amdgpu_fence_emit(ring, &fence, 0);
> +		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
>   		if (r)
>   			return r;
>   	}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job
  2021-08-05 21:13 ` [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job Andrey Grodzovsky
@ 2021-08-06  5:52   ` Jingwen Chen
  2021-08-06  9:48     ` Christian König
  0 siblings, 1 reply; 10+ messages in thread
From: Jingwen Chen @ 2021-08-06  5:52 UTC (permalink / raw)
  To: Andrey Grodzovsky, monk.liu, christian.koenig, Jack Zhang, amd-gfx

On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:
> 
> On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > From: Jack Zhang <Jack.Zhang1@amd.com>
> > 
> > Why: Previously hw fence is alloced separately with job.
> > It caused historical lifetime issues and corner cases.
> > The ideal situation is to take fence to manage both job
> > and fence's lifetime, and simplify the design of gpu-scheduler.
> > 
> > How:
> > We propose to embed hw_fence into amdgpu_job.
> > 1. We cover the normal job submission by this method.
> > 2. For ib_test, and submit without a parent job keep the
> > legacy way to create a hw fence separately.
> > v2:
> > use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
> > embeded in a job.
> > 
> > Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
> > Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 ++++++++++++++++-----
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c      |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 ++++++++----
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  4 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  5 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
> >   8 files changed, 84 insertions(+), 30 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > index 7b46ba551cb2..3003ee1c9487 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
> >   	ret = dma_fence_wait(f, false);
> >   err_ib_sched:
> > -	dma_fence_put(f);
> >   	amdgpu_job_free(job);
> >   err:
> >   	return ret;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > index 536005bff24a..277128846dd1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > @@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
> >   			continue;
> >   		}
> >   		job = to_amdgpu_job(s_job);
> > -		if (preempted && job->fence == fence)
> > +		if (preempted && (&job->hw_fence) == fence)
> >   			/* mark the job as preempted */
> >   			job->preemption_status |= AMDGPU_IB_PREEMPTED;
> >   	}
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > index 7495911516c2..5e29d797a265 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > @@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
> >    *
> >    * @ring: ring the fence is associated with
> >    * @f: resulting fence object
> > + * @job: job the fence is embeded in
> >    * @flags: flags to pass into the subordinate .emit_fence() call
> >    *
> >    * Emits a fence command on the requested ring (all asics).
> >    * Returns 0 on success, -ENOMEM on failure.
> >    */
> > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amdgpu_job *job,
> >   		      unsigned flags)
> >   {
> >   	struct amdgpu_device *adev = ring->adev;
> > -	struct amdgpu_fence *fence;
> > +	struct dma_fence *fence;
> > +	struct amdgpu_fence *am_fence;
> >   	struct dma_fence __rcu **ptr;
> >   	uint32_t seq;
> >   	int r;
> > -	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> > -	if (fence == NULL)
> > -		return -ENOMEM;
> > +	if (job == NULL) {
> > +		/* create a sperate hw fence */
> > +		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
> > +		if (am_fence == NULL)
> > +			return -ENOMEM;
> > +		fence = &am_fence->base;
> > +		am_fence->ring = ring;
> > +	} else {
> > +		/* take use of job-embedded fence */
> > +		fence = &job->hw_fence;
> > +		job->ring = ring;
> 
> 
> If you would make hw_fence of type amdgpu_fence
> you could probably avoid the special job->ring = ring
> See more in related comment at the bottom
> 
Hi Andrey,

I'm only making the amdgpu_fence for the case where no job parameter is
provided to amdgpu_fence_emit. The embedded fence, which is the hw_fence
in amdgpu_job, will be allocated along with amdgpu_job as a dma_fence.

Regards,
Jingwen Chen
> 
> > +	}
> >   	seq = ++ring->fence_drv.sync_seq;
> > -	fence->ring = ring;
> > -	dma_fence_init(&fence->base, &amdgpu_fence_ops,
> > +	dma_fence_init(fence, &amdgpu_fence_ops,
> >   		       &ring->fence_drv.lock,
> >   		       adev->fence_context + ring->idx,
> >   		       seq);
> > +
> > +	if (job != NULL) {
> > +		/* mark this fence has a parent job */
> > +		set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
> > +	}
> > +
> >   	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> >   			       seq, flags | AMDGPU_FENCE_FLAG_INT);
> >   	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
> > @@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> >   	/* This function can't be called concurrently anyway, otherwise
> >   	 * emitting the fence would mess up the hardware ring buffer.
> >   	 */
> > -	rcu_assign_pointer(*ptr, dma_fence_get(&fence->base));
> > +	rcu_assign_pointer(*ptr, dma_fence_get(fence));
> > -	*f = &fence->base;
> > +	*f = fence;
> >   	return 0;
> >   }
> > @@ -621,8 +637,16 @@ static const char *amdgpu_fence_get_driver_name(struct dma_fence *fence)
> >   static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
> >   {
> > -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> > -	return (const char *)fence->ring->name;
> > +	struct amdgpu_ring *ring;
> > +
> > +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> > +		struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
> > +
> > +		ring = job->ring;
> > +	} else {
> > +		ring = to_amdgpu_fence(f)->ring;
> > +	}
> 
> 
> Same as above
> 
> 
> > +	return (const char *)ring->name;
> >   }
> >   /**
> > @@ -656,8 +680,20 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
> >   static void amdgpu_fence_free(struct rcu_head *rcu)
> >   {
> >   	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
> > -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> > -	kmem_cache_free(amdgpu_fence_slab, fence);
> > +
> > +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> > +	/* free job if fence has a parent job */
> > +		struct amdgpu_job *job;
> > +
> > +		job = container_of(f, struct amdgpu_job, hw_fence);
> > +		kfree(job);
> > +	} else {
> > +	/* free fence_slab if it's separated fence*/
> > +		struct amdgpu_fence *fence;
> > +
> > +		fence = to_amdgpu_fence(f);
> > +		kmem_cache_free(amdgpu_fence_slab, fence);
> > +	}
> >   }
> >   /**
> > @@ -680,6 +716,7 @@ static const struct dma_fence_ops amdgpu_fence_ops = {
> >   	.release = amdgpu_fence_release,
> >   };
> > +
> >   /*
> >    * Fence debugfs
> >    */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > index ec65ab0ddf89..c076a6b9a5a2 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > @@ -262,7 +262,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> >   				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
> >   	}
> > -	r = amdgpu_fence_emit(ring, f, fence_flags);
> > +	r = amdgpu_fence_emit(ring, f, job, fence_flags);
> >   	if (r) {
> >   		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
> >   		if (job && job->vmid)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > index d33e6d97cc89..65a395060de2 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > @@ -127,11 +127,16 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
> >   {
> >   	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
> >   	struct dma_fence *f;
> > +	struct dma_fence *hw_fence;
> >   	unsigned i;
> > -	/* use sched fence if available */
> > -	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;
> > +	if (job->hw_fence.ops == NULL)
> > +		hw_fence = job->external_hw_fence;
> > +	else
> > +		hw_fence = &job->hw_fence;
> > +	/* use sched fence if available */
> > +	f = job->base.s_fence ? &job->base.s_fence->finished : hw_fence;
> >   	for (i = 0; i < job->num_ibs; ++i)
> >   		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
> >   }
> > @@ -142,20 +147,27 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
> >   	drm_sched_job_cleanup(s_job);
> > -	dma_fence_put(job->fence);
> >   	amdgpu_sync_free(&job->sync);
> >   	amdgpu_sync_free(&job->sched_sync);
> > -	kfree(job);
> > +
> > +    /* only put the hw fence if has embedded fence */
> > +	if (job->hw_fence.ops != NULL)
> > +		dma_fence_put(&job->hw_fence);
> > +	else
> > +		kfree(job);
> >   }
> >   void amdgpu_job_free(struct amdgpu_job *job)
> >   {
> >   	amdgpu_job_free_resources(job);
> > -
> > -	dma_fence_put(job->fence);
> >   	amdgpu_sync_free(&job->sync);
> >   	amdgpu_sync_free(&job->sched_sync);
> > -	kfree(job);
> > +
> > +	/* only put the hw fence if has embedded fence */
> > +	if (job->hw_fence.ops != NULL)
> > +		dma_fence_put(&job->hw_fence);
> > +	else
> > +		kfree(job);
> >   }
> >   int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
> > @@ -184,11 +196,14 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
> >   	job->base.sched = &ring->sched;
> >   	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
> > -	job->fence = dma_fence_get(*fence);
> > +	/* record external_hw_fence for direct submit */
> > +	job->external_hw_fence = dma_fence_get(*fence);
> >   	if (r)
> >   		return r;
> >   	amdgpu_job_free(job);
> > +	dma_fence_put(*fence);
> > +
> >   	return 0;
> >   }
> > @@ -246,10 +261,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> >   		if (r)
> >   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
> >   	}
> > -	/* if gpu reset, hw fence will be replaced here */
> > -	dma_fence_put(job->fence);
> > -	job->fence = dma_fence_get(fence);
> > +	dma_fence_get(fence);
> >   	amdgpu_job_free_resources(job);
> >   	fence = r ? ERR_PTR(r) : fence;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > index 81caac9b958a..92324c978534 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > @@ -46,7 +46,9 @@ struct amdgpu_job {
> >   	struct amdgpu_sync	sync;
> >   	struct amdgpu_sync	sched_sync;
> >   	struct amdgpu_ib	*ibs;
> > -	struct dma_fence	*fence; /* the hw fence */
> > +	struct dma_fence	hw_fence;
> > +	struct amdgpu_ring *ring;
> 
> Why not instead of 2 fields above just embed   struct amdgpu_fence as
> hw_fence  and by this save the extra 'ring' field handling ?
> 
> Andrey
> 
> 
> > +	struct dma_fence	*external_hw_fence;
> >   	uint32_t		preamble_status;
> >   	uint32_t                preemption_status;
> >   	uint32_t		num_ibs;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > index 9c11ced4312c..03d4b29a76d6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > @@ -48,6 +48,9 @@
> >   #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
> >   #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
> > +/* fence flag bit to indicate the face is embeded in job*/
> > +#define AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT		(DMA_FENCE_FLAG_USER_BITS + 1)
> > +
> >   #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
> >   #define AMDGPU_IB_POOL_SIZE	(1024 * 1024)
> > @@ -118,7 +121,7 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
> >   void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
> >   int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
> >   void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
> > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
> > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, struct amdgpu_job *job,
> >   		      unsigned flags);
> >   int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
> >   			      uint32_t timeout);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 2a88ed5d983b..2af8860d74cc 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -1218,7 +1218,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> >   		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
> >   	if (vm_flush_needed || pasid_mapping_needed) {
> > -		r = amdgpu_fence_emit(ring, &fence, 0);
> > +		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
> >   		if (r)
> >   			return r;
> >   	}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job
  2021-08-06  5:52   ` Jingwen Chen
@ 2021-08-06  9:48     ` Christian König
  2021-08-09  2:18       ` Jingwen Chen
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2021-08-06  9:48 UTC (permalink / raw)
  To: Jingwen Chen, Andrey Grodzovsky, monk.liu, christian.koenig,
	Jack Zhang, amd-gfx



Am 06.08.21 um 07:52 schrieb Jingwen Chen:
> On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:
>> On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
>>> From: Jack Zhang <Jack.Zhang1@amd.com>
>>>
>>> Why: Previously hw fence is alloced separately with job.
>>> It caused historical lifetime issues and corner cases.
>>> The ideal situation is to take fence to manage both job
>>> and fence's lifetime, and simplify the design of gpu-scheduler.
>>>
>>> How:
>>> We propose to embed hw_fence into amdgpu_job.
>>> 1. We cover the normal job submission by this method.
>>> 2. For ib_test, and submit without a parent job keep the
>>> legacy way to create a hw fence separately.
>>> v2:
>>> use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
>>> embeded in a job.
>>>
>>> Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
>>> Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 ++++++++++++++++-----
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c      |  2 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 ++++++++----
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  4 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  5 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
>>>    8 files changed, 84 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> index 7b46ba551cb2..3003ee1c9487 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>>>    	ret = dma_fence_wait(f, false);
>>>    err_ib_sched:
>>> -	dma_fence_put(f);
>>>    	amdgpu_job_free(job);
>>>    err:
>>>    	return ret;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 536005bff24a..277128846dd1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
>>>    			continue;
>>>    		}
>>>    		job = to_amdgpu_job(s_job);
>>> -		if (preempted && job->fence == fence)
>>> +		if (preempted && (&job->hw_fence) == fence)
>>>    			/* mark the job as preempted */
>>>    			job->preemption_status |= AMDGPU_IB_PREEMPTED;
>>>    	}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 7495911516c2..5e29d797a265 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
>>>     *
>>>     * @ring: ring the fence is associated with
>>>     * @f: resulting fence object
>>> + * @job: job the fence is embeded in
>>>     * @flags: flags to pass into the subordinate .emit_fence() call
>>>     *
>>>     * Emits a fence command on the requested ring (all asics).
>>>     * Returns 0 on success, -ENOMEM on failure.
>>>     */
>>> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
>>> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amdgpu_job *job,
>>>    		      unsigned flags)
>>>    {
>>>    	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_fence *fence;
>>> +	struct dma_fence *fence;
>>> +	struct amdgpu_fence *am_fence;
>>>    	struct dma_fence __rcu **ptr;
>>>    	uint32_t seq;
>>>    	int r;
>>> -	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
>>> -	if (fence == NULL)
>>> -		return -ENOMEM;
>>> +	if (job == NULL) {
>>> +		/* create a sperate hw fence */
>>> +		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
>>> +		if (am_fence == NULL)
>>> +			return -ENOMEM;
>>> +		fence = &am_fence->base;
>>> +		am_fence->ring = ring;
>>> +	} else {
>>> +		/* take use of job-embedded fence */
>>> +		fence = &job->hw_fence;
>>> +		job->ring = ring;
>>
>> If you would make hw_fence of type amdgpu_fence
>> you could probably avoid the special job->ring = ring
>> See more in related comment at the bottom
>>
> Hi Andry,
>
> I'm only make the amdgpu_fence for the fence without job parameter
> provided to amdgpu_fence_emit. For embeded fence which is the hw_fence
> in amdgpu_job, it will be allocated along with amdgpu_job as dma_fence.

When you have the job and need the ring you can just do 
container_of(job->sched, struct amdgpu_ring, sched).

No need for an extra ring pointer here.

Regards,
Christian.

>
> Regards,
> Jingwen Chen
>>> +	}
>>>    	seq = ++ring->fence_drv.sync_seq;
>>> -	fence->ring = ring;
>>> -	dma_fence_init(&fence->base, &amdgpu_fence_ops,
>>> +	dma_fence_init(fence, &amdgpu_fence_ops,
>>>    		       &ring->fence_drv.lock,
>>>    		       adev->fence_context + ring->idx,
>>>    		       seq);
>>> +
>>> +	if (job != NULL) {
>>> +		/* mark this fence has a parent job */
>>> +		set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
>>> +	}
>>> +
>>>    	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>>>    			       seq, flags | AMDGPU_FENCE_FLAG_INT);
>>>    	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
>>> @@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
>>>    	/* This function can't be called concurrently anyway, otherwise
>>>    	 * emitting the fence would mess up the hardware ring buffer.
>>>    	 */
>>> -	rcu_assign_pointer(*ptr, dma_fence_get(&fence->base));
>>> +	rcu_assign_pointer(*ptr, dma_fence_get(fence));
>>> -	*f = &fence->base;
>>> +	*f = fence;
>>>    	return 0;
>>>    }
>>> @@ -621,8 +637,16 @@ static const char *amdgpu_fence_get_driver_name(struct dma_fence *fence)
>>>    static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
>>>    {
>>> -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
>>> -	return (const char *)fence->ring->name;
>>> +	struct amdgpu_ring *ring;
>>> +
>>> +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
>>> +		struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
>>> +
>>> +		ring = job->ring;
>>> +	} else {
>>> +		ring = to_amdgpu_fence(f)->ring;
>>> +	}
>>
>> Same as above
>>
>>
>>> +	return (const char *)ring->name;
>>>    }
>>>    /**
>>> @@ -656,8 +680,20 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
>>>    static void amdgpu_fence_free(struct rcu_head *rcu)
>>>    {
>>>    	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
>>> -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
>>> -	kmem_cache_free(amdgpu_fence_slab, fence);
>>> +
>>> +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
>>> +	/* free job if fence has a parent job */
>>> +		struct amdgpu_job *job;
>>> +
>>> +		job = container_of(f, struct amdgpu_job, hw_fence);
>>> +		kfree(job);
>>> +	} else {
>>> +	/* free fence_slab if it's separated fence*/
>>> +		struct amdgpu_fence *fence;
>>> +
>>> +		fence = to_amdgpu_fence(f);
>>> +		kmem_cache_free(amdgpu_fence_slab, fence);
>>> +	}
>>>    }
>>>    /**
>>> @@ -680,6 +716,7 @@ static const struct dma_fence_ops amdgpu_fence_ops = {
>>>    	.release = amdgpu_fence_release,
>>>    };
>>> +
>>>    /*
>>>     * Fence debugfs
>>>     */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index ec65ab0ddf89..c076a6b9a5a2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -262,7 +262,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>    				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
>>>    	}
>>> -	r = amdgpu_fence_emit(ring, f, fence_flags);
>>> +	r = amdgpu_fence_emit(ring, f, job, fence_flags);
>>>    	if (r) {
>>>    		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
>>>    		if (job && job->vmid)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index d33e6d97cc89..65a395060de2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -127,11 +127,16 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
>>>    {
>>>    	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
>>>    	struct dma_fence *f;
>>> +	struct dma_fence *hw_fence;
>>>    	unsigned i;
>>> -	/* use sched fence if available */
>>> -	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;
>>> +	if (job->hw_fence.ops == NULL)
>>> +		hw_fence = job->external_hw_fence;
>>> +	else
>>> +		hw_fence = &job->hw_fence;
>>> +	/* use sched fence if available */
>>> +	f = job->base.s_fence ? &job->base.s_fence->finished : hw_fence;
>>>    	for (i = 0; i < job->num_ibs; ++i)
>>>    		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
>>>    }
>>> @@ -142,20 +147,27 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>>    	drm_sched_job_cleanup(s_job);
>>> -	dma_fence_put(job->fence);
>>>    	amdgpu_sync_free(&job->sync);
>>>    	amdgpu_sync_free(&job->sched_sync);
>>> -	kfree(job);
>>> +
>>> +    /* only put the hw fence if has embedded fence */
>>> +	if (job->hw_fence.ops != NULL)
>>> +		dma_fence_put(&job->hw_fence);
>>> +	else
>>> +		kfree(job);
>>>    }
>>>    void amdgpu_job_free(struct amdgpu_job *job)
>>>    {
>>>    	amdgpu_job_free_resources(job);
>>> -
>>> -	dma_fence_put(job->fence);
>>>    	amdgpu_sync_free(&job->sync);
>>>    	amdgpu_sync_free(&job->sched_sync);
>>> -	kfree(job);
>>> +
>>> +	/* only put the hw fence if has embedded fence */
>>> +	if (job->hw_fence.ops != NULL)
>>> +		dma_fence_put(&job->hw_fence);
>>> +	else
>>> +		kfree(job);
>>>    }
>>>    int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
>>> @@ -184,11 +196,14 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>>    	job->base.sched = &ring->sched;
>>>    	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
>>> -	job->fence = dma_fence_get(*fence);
>>> +	/* record external_hw_fence for direct submit */
>>> +	job->external_hw_fence = dma_fence_get(*fence);
>>>    	if (r)
>>>    		return r;
>>>    	amdgpu_job_free(job);
>>> +	dma_fence_put(*fence);
>>> +
>>>    	return 0;
>>>    }
>>> @@ -246,10 +261,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>>>    		if (r)
>>>    			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>>>    	}
>>> -	/* if gpu reset, hw fence will be replaced here */
>>> -	dma_fence_put(job->fence);
>>> -	job->fence = dma_fence_get(fence);
>>> +	dma_fence_get(fence);
>>>    	amdgpu_job_free_resources(job);
>>>    	fence = r ? ERR_PTR(r) : fence;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 81caac9b958a..92324c978534 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -46,7 +46,9 @@ struct amdgpu_job {
>>>    	struct amdgpu_sync	sync;
>>>    	struct amdgpu_sync	sched_sync;
>>>    	struct amdgpu_ib	*ibs;
>>> -	struct dma_fence	*fence; /* the hw fence */
>>> +	struct dma_fence	hw_fence;
>>> +	struct amdgpu_ring *ring;
>> Why not instead of 2 fields above just embed   struct amdgpu_fence as
>> hw_fence  and by this save the extra 'ring' field handling ?
>>
>> Andrey
>>
>>
>>> +	struct dma_fence	*external_hw_fence;
>>>    	uint32_t		preamble_status;
>>>    	uint32_t                preemption_status;
>>>    	uint32_t		num_ibs;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 9c11ced4312c..03d4b29a76d6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -48,6 +48,9 @@
>>>    #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
>>>    #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
>>> +/* fence flag bit to indicate the face is embeded in job*/
>>> +#define AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT		(DMA_FENCE_FLAG_USER_BITS + 1)
>>> +
>>>    #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
>>>    #define AMDGPU_IB_POOL_SIZE	(1024 * 1024)
>>> @@ -118,7 +121,7 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>>>    void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
>>>    int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
>>>    void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
>>> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
>>> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, struct amdgpu_job *job,
>>>    		      unsigned flags);
>>>    int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
>>>    			      uint32_t timeout);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 2a88ed5d983b..2af8860d74cc 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1218,7 +1218,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>    		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
>>>    	if (vm_flush_needed || pasid_mapping_needed) {
>>> -		r = amdgpu_fence_emit(ring, &fence, 0);
>>> +		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
>>>    		if (r)
>>>    			return r;
>>>    	}


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job
  2021-08-06  9:48     ` Christian König
@ 2021-08-09  2:18       ` Jingwen Chen
  2021-08-09  3:05         ` Jingwen Chen
  0 siblings, 1 reply; 10+ messages in thread
From: Jingwen Chen @ 2021-08-09  2:18 UTC (permalink / raw)
  To: Christian König, Andrey Grodzovsky, monk.liu,
	christian.koenig, Jack Zhang, amd-gfx

On Fri Aug 06, 2021 at 11:48:04AM +0200, Christian König wrote:
> 
> 
> Am 06.08.21 um 07:52 schrieb Jingwen Chen:
> > On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:
> > > On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > > > From: Jack Zhang <Jack.Zhang1@amd.com>
> > > > 
> > > > Why: Previously hw fence is alloced separately with job.
> > > > It caused historical lifetime issues and corner cases.
> > > > The ideal situation is to take fence to manage both job
> > > > and fence's lifetime, and simplify the design of gpu-scheduler.
> > > > 
> > > > How:
> > > > We propose to embed hw_fence into amdgpu_job.
> > > > 1. We cover the normal job submission by this method.
> > > > 2. For ib_test, and submit without a parent job keep the
> > > > legacy way to create a hw fence separately.
> > > > v2:
> > > > use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
> > > > embeded in a job.
> > > > 
> > > > Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
> > > > Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
> > > > ---
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 ++++++++++++++++-----
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c      |  2 +-
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 ++++++++----
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  4 +-
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  5 +-
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
> > > >    8 files changed, 84 insertions(+), 30 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > index 7b46ba551cb2..3003ee1c9487 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
> > > >    	ret = dma_fence_wait(f, false);
> > > >    err_ib_sched:
> > > > -	dma_fence_put(f);
> > > >    	amdgpu_job_free(job);
> > > >    err:
> > > >    	return ret;
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > index 536005bff24a..277128846dd1 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > @@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
> > > >    			continue;
> > > >    		}
> > > >    		job = to_amdgpu_job(s_job);
> > > > -		if (preempted && job->fence == fence)
> > > > +		if (preempted && (&job->hw_fence) == fence)
> > > >    			/* mark the job as preempted */
> > > >    			job->preemption_status |= AMDGPU_IB_PREEMPTED;
> > > >    	}
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > index 7495911516c2..5e29d797a265 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > @@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
> > > >     *
> > > >     * @ring: ring the fence is associated with
> > > >     * @f: resulting fence object
> > > > + * @job: job the fence is embeded in
> > > >     * @flags: flags to pass into the subordinate .emit_fence() call
> > > >     *
> > > >     * Emits a fence command on the requested ring (all asics).
> > > >     * Returns 0 on success, -ENOMEM on failure.
> > > >     */
> > > > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> > > > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amdgpu_job *job,
> > > >    		      unsigned flags)
> > > >    {
> > > >    	struct amdgpu_device *adev = ring->adev;
> > > > -	struct amdgpu_fence *fence;
> > > > +	struct dma_fence *fence;
> > > > +	struct amdgpu_fence *am_fence;
> > > >    	struct dma_fence __rcu **ptr;
> > > >    	uint32_t seq;
> > > >    	int r;
> > > > -	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> > > > -	if (fence == NULL)
> > > > -		return -ENOMEM;
> > > > +	if (job == NULL) {
> > > > +		/* create a sperate hw fence */
> > > > +		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
> > > > +		if (am_fence == NULL)
> > > > +			return -ENOMEM;
> > > > +		fence = &am_fence->base;
> > > > +		am_fence->ring = ring;
> > > > +	} else {
> > > > +		/* take use of job-embedded fence */
> > > > +		fence = &job->hw_fence;
> > > > +		job->ring = ring;
> > > 
> > > If you would make hw_fence of type amdgpu_fence
> > > you could probably avoid the special job->ring = ring
> > > See more in related comment at the bottom
> > > 
> > Hi Andrey,
> > 
> > I only make the amdgpu_fence for the fence when no job parameter is
> > provided to amdgpu_fence_emit. For the embedded fence, which is the hw_fence
> > in amdgpu_job, it will be allocated along with the amdgpu_job as a dma_fence.
> 
> When you have the job and need the ring you can just do
> container_of(job->sched, struct amdgpu_ring, sched).
> 
> No need for an extra ring pointer here.
> 
> Regards,
> Christian.
I see, I will change this.

Best Regards,
JingWen Chen
> 
> > 
> > Regards,
> > Jingwen Chen
> > > > +	}
> > > >    	seq = ++ring->fence_drv.sync_seq;
> > > > -	fence->ring = ring;
> > > > -	dma_fence_init(&fence->base, &amdgpu_fence_ops,
> > > > +	dma_fence_init(fence, &amdgpu_fence_ops,
> > > >    		       &ring->fence_drv.lock,
> > > >    		       adev->fence_context + ring->idx,
> > > >    		       seq);
> > > > +
> > > > +	if (job != NULL) {
> > > > +		/* mark this fence has a parent job */
> > > > +		set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
> > > > +	}
> > > > +
> > > >    	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> > > >    			       seq, flags | AMDGPU_FENCE_FLAG_INT);
> > > >    	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
> > > > @@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> > > >    	/* This function can't be called concurrently anyway, otherwise
> > > >    	 * emitting the fence would mess up the hardware ring buffer.
> > > >    	 */
> > > > -	rcu_assign_pointer(*ptr, dma_fence_get(&fence->base));
> > > > +	rcu_assign_pointer(*ptr, dma_fence_get(fence));
> > > > -	*f = &fence->base;
> > > > +	*f = fence;
> > > >    	return 0;
> > > >    }
> > > > @@ -621,8 +637,16 @@ static const char *amdgpu_fence_get_driver_name(struct dma_fence *fence)
> > > >    static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
> > > >    {
> > > > -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> > > > -	return (const char *)fence->ring->name;
> > > > +	struct amdgpu_ring *ring;
> > > > +
> > > > +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> > > > +		struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
> > > > +
> > > > +		ring = job->ring;
> > > > +	} else {
> > > > +		ring = to_amdgpu_fence(f)->ring;
> > > > +	}
> > > 
> > > Same as above
> > > 
> > > 
> > > > +	return (const char *)ring->name;
> > > >    }
> > > >    /**
> > > > @@ -656,8 +680,20 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
> > > >    static void amdgpu_fence_free(struct rcu_head *rcu)
> > > >    {
> > > >    	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
> > > > -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> > > > -	kmem_cache_free(amdgpu_fence_slab, fence);
> > > > +
> > > > +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> > > > +	/* free job if fence has a parent job */
> > > > +		struct amdgpu_job *job;
> > > > +
> > > > +		job = container_of(f, struct amdgpu_job, hw_fence);
> > > > +		kfree(job);
> > > > +	} else {
> > > > +	/* free fence_slab if it's separated fence*/
> > > > +		struct amdgpu_fence *fence;
> > > > +
> > > > +		fence = to_amdgpu_fence(f);
> > > > +		kmem_cache_free(amdgpu_fence_slab, fence);
> > > > +	}
> > > >    }
> > > >    /**
> > > > @@ -680,6 +716,7 @@ static const struct dma_fence_ops amdgpu_fence_ops = {
> > > >    	.release = amdgpu_fence_release,
> > > >    };
> > > > +
> > > >    /*
> > > >     * Fence debugfs
> > > >     */
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > > > index ec65ab0ddf89..c076a6b9a5a2 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > > > @@ -262,7 +262,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> > > >    				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
> > > >    	}
> > > > -	r = amdgpu_fence_emit(ring, f, fence_flags);
> > > > +	r = amdgpu_fence_emit(ring, f, job, fence_flags);
> > > >    	if (r) {
> > > >    		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
> > > >    		if (job && job->vmid)
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > > index d33e6d97cc89..65a395060de2 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > > @@ -127,11 +127,16 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
> > > >    {
> > > >    	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
> > > >    	struct dma_fence *f;
> > > > +	struct dma_fence *hw_fence;
> > > >    	unsigned i;
> > > > -	/* use sched fence if available */
> > > > -	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;
> > > > +	if (job->hw_fence.ops == NULL)
> > > > +		hw_fence = job->external_hw_fence;
> > > > +	else
> > > > +		hw_fence = &job->hw_fence;
> > > > +	/* use sched fence if available */
> > > > +	f = job->base.s_fence ? &job->base.s_fence->finished : hw_fence;
> > > >    	for (i = 0; i < job->num_ibs; ++i)
> > > >    		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
> > > >    }
> > > > @@ -142,20 +147,27 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
> > > >    	drm_sched_job_cleanup(s_job);
> > > > -	dma_fence_put(job->fence);
> > > >    	amdgpu_sync_free(&job->sync);
> > > >    	amdgpu_sync_free(&job->sched_sync);
> > > > -	kfree(job);
> > > > +
> > > > +    /* only put the hw fence if has embedded fence */
> > > > +	if (job->hw_fence.ops != NULL)
> > > > +		dma_fence_put(&job->hw_fence);
> > > > +	else
> > > > +		kfree(job);
> > > >    }
> > > >    void amdgpu_job_free(struct amdgpu_job *job)
> > > >    {
> > > >    	amdgpu_job_free_resources(job);
> > > > -
> > > > -	dma_fence_put(job->fence);
> > > >    	amdgpu_sync_free(&job->sync);
> > > >    	amdgpu_sync_free(&job->sched_sync);
> > > > -	kfree(job);
> > > > +
> > > > +	/* only put the hw fence if has embedded fence */
> > > > +	if (job->hw_fence.ops != NULL)
> > > > +		dma_fence_put(&job->hw_fence);
> > > > +	else
> > > > +		kfree(job);
> > > >    }
> > > >    int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
> > > > @@ -184,11 +196,14 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
> > > >    	job->base.sched = &ring->sched;
> > > >    	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
> > > > -	job->fence = dma_fence_get(*fence);
> > > > +	/* record external_hw_fence for direct submit */
> > > > +	job->external_hw_fence = dma_fence_get(*fence);
> > > >    	if (r)
> > > >    		return r;
> > > >    	amdgpu_job_free(job);
> > > > +	dma_fence_put(*fence);
> > > > +
> > > >    	return 0;
> > > >    }
> > > > @@ -246,10 +261,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> > > >    		if (r)
> > > >    			DRM_ERROR("Error scheduling IBs (%d)\n", r);
> > > >    	}
> > > > -	/* if gpu reset, hw fence will be replaced here */
> > > > -	dma_fence_put(job->fence);
> > > > -	job->fence = dma_fence_get(fence);
> > > > +	dma_fence_get(fence);
> > > >    	amdgpu_job_free_resources(job);
> > > >    	fence = r ? ERR_PTR(r) : fence;
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > > index 81caac9b958a..92324c978534 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > > @@ -46,7 +46,9 @@ struct amdgpu_job {
> > > >    	struct amdgpu_sync	sync;
> > > >    	struct amdgpu_sync	sched_sync;
> > > >    	struct amdgpu_ib	*ibs;
> > > > -	struct dma_fence	*fence; /* the hw fence */
> > > > +	struct dma_fence	hw_fence;
> > > > +	struct amdgpu_ring *ring;
> > > Why not instead of 2 fields above just embed   struct amdgpu_fence as
> > > hw_fence  and by this save the extra 'ring' field handling ?
> > > 
> > > Andrey
> > > 
> > > 
> > > > +	struct dma_fence	*external_hw_fence;
> > > >    	uint32_t		preamble_status;
> > > >    	uint32_t                preemption_status;
> > > >    	uint32_t		num_ibs;
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > > > index 9c11ced4312c..03d4b29a76d6 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > > > @@ -48,6 +48,9 @@
> > > >    #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
> > > >    #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
> > > > +/* fence flag bit to indicate the face is embeded in job*/
> > > > +#define AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT		(DMA_FENCE_FLAG_USER_BITS + 1)
> > > > +
> > > >    #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
> > > >    #define AMDGPU_IB_POOL_SIZE	(1024 * 1024)
> > > > @@ -118,7 +121,7 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
> > > >    void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
> > > >    int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
> > > >    void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
> > > > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
> > > > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, struct amdgpu_job *job,
> > > >    		      unsigned flags);
> > > >    int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
> > > >    			      uint32_t timeout);
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > > index 2a88ed5d983b..2af8860d74cc 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > > @@ -1218,7 +1218,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> > > >    		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
> > > >    	if (vm_flush_needed || pasid_mapping_needed) {
> > > > -		r = amdgpu_fence_emit(ring, &fence, 0);
> > > > +		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
> > > >    		if (r)
> > > >    			return r;
> > > >    	}
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job
  2021-08-09  2:18       ` Jingwen Chen
@ 2021-08-09  3:05         ` Jingwen Chen
  0 siblings, 0 replies; 10+ messages in thread
From: Jingwen Chen @ 2021-08-09  3:05 UTC (permalink / raw)
  To: Christian König, Andrey Grodzovsky, monk.liu,
	christian.koenig, Jack Zhang, amd-gfx

On Mon Aug 09, 2021 at 10:18:37AM +0800, Jingwen Chen wrote:
> On Fri Aug 06, 2021 at 11:48:04AM +0200, Christian König wrote:
> > 
> > 
> > Am 06.08.21 um 07:52 schrieb Jingwen Chen:
> > > On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:
> > > > On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > > > > From: Jack Zhang <Jack.Zhang1@amd.com>
> > > > > 
> > > > > Why: Previously hw fence is alloced separately with job.
> > > > > It caused historical lifetime issues and corner cases.
> > > > > The ideal situation is to take fence to manage both job
> > > > > and fence's lifetime, and simplify the design of gpu-scheduler.
> > > > > 
> > > > > How:
> > > > > We propose to embed hw_fence into amdgpu_job.
> > > > > 1. We cover the normal job submission by this method.
> > > > > 2. For ib_test, and submit without a parent job keep the
> > > > > legacy way to create a hw fence separately.
> > > > > v2:
> > > > > use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
> > > > > embeded in a job.
> > > > > 
> > > > > Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
> > > > > Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
> > > > > ---
> > > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
> > > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
> > > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 ++++++++++++++++-----
> > > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c      |  2 +-
> > > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 ++++++++----
> > > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  4 +-
> > > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  5 +-
> > > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
> > > > >    8 files changed, 84 insertions(+), 30 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > > index 7b46ba551cb2..3003ee1c9487 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > > @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
> > > > >    	ret = dma_fence_wait(f, false);
> > > > >    err_ib_sched:
> > > > > -	dma_fence_put(f);
> > > > >    	amdgpu_job_free(job);
> > > > >    err:
> > > > >    	return ret;
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > > index 536005bff24a..277128846dd1 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > > @@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
> > > > >    			continue;
> > > > >    		}
> > > > >    		job = to_amdgpu_job(s_job);
> > > > > -		if (preempted && job->fence == fence)
> > > > > +		if (preempted && (&job->hw_fence) == fence)
> > > > >    			/* mark the job as preempted */
> > > > >    			job->preemption_status |= AMDGPU_IB_PREEMPTED;
> > > > >    	}
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > > index 7495911516c2..5e29d797a265 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > > @@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
> > > > >     *
> > > > >     * @ring: ring the fence is associated with
> > > > >     * @f: resulting fence object
> > > > > + * @job: job the fence is embeded in
> > > > >     * @flags: flags to pass into the subordinate .emit_fence() call
> > > > >     *
> > > > >     * Emits a fence command on the requested ring (all asics).
> > > > >     * Returns 0 on success, -ENOMEM on failure.
> > > > >     */
> > > > > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> > > > > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amdgpu_job *job,
> > > > >    		      unsigned flags)
> > > > >    {
> > > > >    	struct amdgpu_device *adev = ring->adev;
> > > > > -	struct amdgpu_fence *fence;
> > > > > +	struct dma_fence *fence;
> > > > > +	struct amdgpu_fence *am_fence;
> > > > >    	struct dma_fence __rcu **ptr;
> > > > >    	uint32_t seq;
> > > > >    	int r;
> > > > > -	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> > > > > -	if (fence == NULL)
> > > > > -		return -ENOMEM;
> > > > > +	if (job == NULL) {
> > > > > +		/* create a sperate hw fence */
> > > > > +		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
> > > > > +		if (am_fence == NULL)
> > > > > +			return -ENOMEM;
> > > > > +		fence = &am_fence->base;
> > > > > +		am_fence->ring = ring;
> > > > > +	} else {
> > > > > +		/* take use of job-embedded fence */
> > > > > +		fence = &job->hw_fence;
> > > > > +		job->ring = ring;
> > > > 
> > > > If you would make hw_fence of type amdgpu_fence
> > > > you could probably avoid the special job->ring = ring
> > > > See more in related comment at the bottom
> > > > 
> > > Hi Andrey,
> > > 
> > > I only make the amdgpu_fence for the fence when no job parameter is
> > > provided to amdgpu_fence_emit. For the embedded fence, which is the hw_fence
> > > in amdgpu_job, it will be allocated along with the amdgpu_job as a dma_fence.
> > 
> > When you have the job and need the ring you can just do
> > container_of(job->sched, struct amdgpu_ring, sched).
> > 
> > No need for an extra ring pointer here.
> > 
> > Regards,
> > Christian.
> I see, I will change this.
> 
> Best Regards,
> JingWen Chen
Hi Christian & Andrey,

Version 3 of this patch has been sent. The other TDR-related patch does not
need any changes, so version 2 of the TDR patch is still OK to review.

Best Regards,
JingWen Chen

> > 
> > > 
> > > Regards,
> > > Jingwen Chen
> > > > > +	}
> > > > >    	seq = ++ring->fence_drv.sync_seq;
> > > > > -	fence->ring = ring;
> > > > > -	dma_fence_init(&fence->base, &amdgpu_fence_ops,
> > > > > +	dma_fence_init(fence, &amdgpu_fence_ops,
> > > > >    		       &ring->fence_drv.lock,
> > > > >    		       adev->fence_context + ring->idx,
> > > > >    		       seq);
> > > > > +
> > > > > +	if (job != NULL) {
> > > > > +		/* mark this fence has a parent job */
> > > > > +		set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
> > > > > +	}
> > > > > +
> > > > >    	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> > > > >    			       seq, flags | AMDGPU_FENCE_FLAG_INT);
> > > > >    	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
> > > > > @@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> > > > >    	/* This function can't be called concurrently anyway, otherwise
> > > > >    	 * emitting the fence would mess up the hardware ring buffer.
> > > > >    	 */
> > > > > -	rcu_assign_pointer(*ptr, dma_fence_get(&fence->base));
> > > > > +	rcu_assign_pointer(*ptr, dma_fence_get(fence));
> > > > > -	*f = &fence->base;
> > > > > +	*f = fence;
> > > > >    	return 0;
> > > > >    }
> > > > > @@ -621,8 +637,16 @@ static const char *amdgpu_fence_get_driver_name(struct dma_fence *fence)
> > > > >    static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
> > > > >    {
> > > > > -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> > > > > -	return (const char *)fence->ring->name;
> > > > > +	struct amdgpu_ring *ring;
> > > > > +
> > > > > +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> > > > > +		struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
> > > > > +
> > > > > +		ring = job->ring;
> > > > > +	} else {
> > > > > +		ring = to_amdgpu_fence(f)->ring;
> > > > > +	}
> > > > 
> > > > Same as above
> > > > 
> > > > 
> > > > > +	return (const char *)ring->name;
> > > > >    }
> > > > >    /**
> > > > > @@ -656,8 +680,20 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
> > > > >    static void amdgpu_fence_free(struct rcu_head *rcu)
> > > > >    {
> > > > >    	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
> > > > > -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> > > > > -	kmem_cache_free(amdgpu_fence_slab, fence);
> > > > > +
> > > > > +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> > > > > +	/* free job if fence has a parent job */
> > > > > +		struct amdgpu_job *job;
> > > > > +
> > > > > +		job = container_of(f, struct amdgpu_job, hw_fence);
> > > > > +		kfree(job);
> > > > > +	} else {
> > > > > +	/* free fence_slab if it's separated fence*/
> > > > > +		struct amdgpu_fence *fence;
> > > > > +
> > > > > +		fence = to_amdgpu_fence(f);
> > > > > +		kmem_cache_free(amdgpu_fence_slab, fence);
> > > > > +	}
> > > > >    }
> > > > >    /**
> > > > > @@ -680,6 +716,7 @@ static const struct dma_fence_ops amdgpu_fence_ops = {
> > > > >    	.release = amdgpu_fence_release,
> > > > >    };
> > > > > +
> > > > >    /*
> > > > >     * Fence debugfs
> > > > >     */
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > > > > index ec65ab0ddf89..c076a6b9a5a2 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> > > > > @@ -262,7 +262,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> > > > >    				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
> > > > >    	}
> > > > > -	r = amdgpu_fence_emit(ring, f, fence_flags);
> > > > > +	r = amdgpu_fence_emit(ring, f, job, fence_flags);
> > > > >    	if (r) {
> > > > >    		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
> > > > >    		if (job && job->vmid)
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > > > index d33e6d97cc89..65a395060de2 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > > > @@ -127,11 +127,16 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
> > > > >    {
> > > > >    	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
> > > > >    	struct dma_fence *f;
> > > > > +	struct dma_fence *hw_fence;
> > > > >    	unsigned i;
> > > > > -	/* use sched fence if available */
> > > > > -	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;
> > > > > +	if (job->hw_fence.ops == NULL)
> > > > > +		hw_fence = job->external_hw_fence;
> > > > > +	else
> > > > > +		hw_fence = &job->hw_fence;
> > > > > +	/* use sched fence if available */
> > > > > +	f = job->base.s_fence ? &job->base.s_fence->finished : hw_fence;
> > > > >    	for (i = 0; i < job->num_ibs; ++i)
> > > > >    		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
> > > > >    }
> > > > > @@ -142,20 +147,27 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
> > > > >    	drm_sched_job_cleanup(s_job);
> > > > > -	dma_fence_put(job->fence);
> > > > >    	amdgpu_sync_free(&job->sync);
> > > > >    	amdgpu_sync_free(&job->sched_sync);
> > > > > -	kfree(job);
> > > > > +
> > > > > +    /* only put the hw fence if has embedded fence */
> > > > > +	if (job->hw_fence.ops != NULL)
> > > > > +		dma_fence_put(&job->hw_fence);
> > > > > +	else
> > > > > +		kfree(job);
> > > > >    }
> > > > >    void amdgpu_job_free(struct amdgpu_job *job)
> > > > >    {
> > > > >    	amdgpu_job_free_resources(job);
> > > > > -
> > > > > -	dma_fence_put(job->fence);
> > > > >    	amdgpu_sync_free(&job->sync);
> > > > >    	amdgpu_sync_free(&job->sched_sync);
> > > > > -	kfree(job);
> > > > > +
> > > > > +	/* only put the hw fence if has embedded fence */
> > > > > +	if (job->hw_fence.ops != NULL)
> > > > > +		dma_fence_put(&job->hw_fence);
> > > > > +	else
> > > > > +		kfree(job);
> > > > >    }
> > > > >    int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
> > > > > @@ -184,11 +196,14 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
> > > > >    	job->base.sched = &ring->sched;
> > > > >    	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
> > > > > -	job->fence = dma_fence_get(*fence);
> > > > > +	/* record external_hw_fence for direct submit */
> > > > > +	job->external_hw_fence = dma_fence_get(*fence);
> > > > >    	if (r)
> > > > >    		return r;
> > > > >    	amdgpu_job_free(job);
> > > > > +	dma_fence_put(*fence);
> > > > > +
> > > > >    	return 0;
> > > > >    }
> > > > > @@ -246,10 +261,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> > > > >    		if (r)
> > > > >    			DRM_ERROR("Error scheduling IBs (%d)\n", r);
> > > > >    	}
> > > > > -	/* if gpu reset, hw fence will be replaced here */
> > > > > -	dma_fence_put(job->fence);
> > > > > -	job->fence = dma_fence_get(fence);
> > > > > +	dma_fence_get(fence);
> > > > >    	amdgpu_job_free_resources(job);
> > > > >    	fence = r ? ERR_PTR(r) : fence;
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > > > index 81caac9b958a..92324c978534 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > > > @@ -46,7 +46,9 @@ struct amdgpu_job {
> > > > >    	struct amdgpu_sync	sync;
> > > > >    	struct amdgpu_sync	sched_sync;
> > > > >    	struct amdgpu_ib	*ibs;
> > > > > -	struct dma_fence	*fence; /* the hw fence */
> > > > > +	struct dma_fence	hw_fence;
> > > > > +	struct amdgpu_ring *ring;
> > > > Why not instead of 2 fields above just embed   struct amdgpu_fence as
> > > > hw_fence  and by this save the extra 'ring' field handling ?
> > > > 
> > > > Andrey
> > > > 
> > > > 
> > > > > +	struct dma_fence	*external_hw_fence;
> > > > >    	uint32_t		preamble_status;
> > > > >    	uint32_t                preemption_status;
> > > > >    	uint32_t		num_ibs;
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > > > > index 9c11ced4312c..03d4b29a76d6 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > > > > @@ -48,6 +48,9 @@
> > > > >    #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
> > > > >    #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
> > > > > +/* fence flag bit to indicate the face is embeded in job*/
> > > > > +#define AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT		(DMA_FENCE_FLAG_USER_BITS + 1)
> > > > > +
> > > > >    #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
> > > > >    #define AMDGPU_IB_POOL_SIZE	(1024 * 1024)
> > > > > @@ -118,7 +121,7 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
> > > > >    void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
> > > > >    int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
> > > > >    void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
> > > > > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
> > > > > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, struct amdgpu_job *job,
> > > > >    		      unsigned flags);
> > > > >    int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
> > > > >    			      uint32_t timeout);
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > > > index 2a88ed5d983b..2af8860d74cc 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > > > @@ -1218,7 +1218,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> > > > >    		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
> > > > >    	if (vm_flush_needed || pasid_mapping_needed) {
> > > > > -		r = amdgpu_fence_emit(ring, &fence, 0);
> > > > > +		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
> > > > >    		if (r)
> > > > >    			return r;
> > > > >    	}
> > 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence
  2021-08-05  8:31 ` [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence Jingwen Chen
@ 2021-08-09 16:24   ` Andrey Grodzovsky
  2021-08-10  2:51     ` Jingwen Chen
  0 siblings, 1 reply; 10+ messages in thread
From: Andrey Grodzovsky @ 2021-08-09 16:24 UTC (permalink / raw)
  To: Jingwen Chen, amd-gfx; +Cc: monk.liu, christian.koenig, Jack Zhang


On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> [Why]
> After embeded hw_fence to amdgpu_job, we need to add tdr support
> for this feature.
>
> [How]
> 1. Add a resubmit_flag for resubmit jobs.
> 2. Clear job fence from RCU and force complete vm flush fences in
>     pre_asic_reset
> 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put
>     for guilty jobs.
> v2:
> use a job_run_counter in amdgpu_job to replace the resubmit_flag in
> drm_sched_job. When the job_run_counter >= 1, it means this job is a
> resubmit job.
>
> Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
> Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  5 ++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 +++
>   4 files changed, 27 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 9e53ff851496..ade2fa07a50a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
>   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   				 struct amdgpu_reset_context *reset_context)
>   {
> -	int i, r = 0;
> +	int i, j, r = 0;
>   	struct amdgpu_job *job = NULL;
>   	bool need_full_reset =
>   		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   		if (!ring || !ring->sched.thread)
>   			continue;
>   
> +		/*clear job fence from fence drv to avoid force_completion
> +		 *leave NULL and vm flush fence in fence drv */
> +		for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) {
> +			struct dma_fence *old,**ptr;
> +			ptr = &ring->fence_drv.fences[j];
> +			old = rcu_dereference_protected(*ptr, 1);
> +			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
> +				RCU_INIT_POINTER(*ptr, NULL);
> +			}
> +		}
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 5e29d797a265..c9752cf794fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
>   	}
>   
>   	seq = ++ring->fence_drv.sync_seq;
> -	dma_fence_init(fence, &amdgpu_fence_ops,
> -		       &ring->fence_drv.lock,
> -		       adev->fence_context + ring->idx,
> -		       seq);
> +	if (job != NULL && job->job_run_counter) {
> +		/* reinit seq for resubmitted jobs */
> +		fence->seqno = seq;
> +	} else {
> +		dma_fence_init(fence, &amdgpu_fence_ops,
> +				&ring->fence_drv.lock,
> +				adev->fence_context + ring->idx,
> +				seq);
> +	}


I think this should actually be in the first patch (and the counter too);
without it the first patch is buggy.


>   
>   	if (job != NULL) {
>   		/* mark this fence has a parent job */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 65a395060de2..19b13a65c73b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
>   
>   	if (finished->error < 0) {
> +		dma_fence_put(&job->hw_fence);


I would put this check below, together with the job_run_counter check.


>   		DRM_INFO("Skip scheduling IBs!\n");
>   	} else {
>   		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
> @@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>   	}
>   
> -	dma_fence_get(fence);
> +	if (!job->job_run_counter)
> +		dma_fence_get(fence);
> +	job->job_run_counter ++;
>   	amdgpu_job_free_resources(job);


Here you modify code you already changed in patch 1. It looks to me
like those two patches should be squashed into one patch, as the changes
are directly dependent and it's hard to follow them separately.

Andrey


>   
>   	fence = r ? ERR_PTR(r) : fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 92324c978534..1fa667f245e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -64,6 +64,9 @@ struct amdgpu_job {
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
> +
> +	/* job_run_counter >= 1 means a resubmit job */
> +	uint32_t		job_run_counter;
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence
  2021-08-09 16:24   ` Andrey Grodzovsky
@ 2021-08-10  2:51     ` Jingwen Chen
  2021-08-10  3:25       ` Jingwen Chen
  0 siblings, 1 reply; 10+ messages in thread
From: Jingwen Chen @ 2021-08-10  2:51 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx; +Cc: monk.liu, christian.koenig, Jack Zhang

On Mon Aug 09, 2021 at 12:24:37PM -0400, Andrey Grodzovsky wrote:
> 
> On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > [Why]
> > After embeded hw_fence to amdgpu_job, we need to add tdr support
> > for this feature.
> > 
> > [How]
> > 1. Add a resubmit_flag for resubmit jobs.
> > 2. Clear job fence from RCU and force complete vm flush fences in
> >     pre_asic_reset
> > 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put
> >     for guilty jobs.
> > v2:
> > use a job_run_counter in amdgpu_job to replace the resubmit_flag in
> > drm_sched_job. When the job_run_counter >= 1, it means this job is a
> > resubmit job.
> > 
> > Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
> > Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +++++++++----
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  5 ++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 +++
> >   4 files changed, 27 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 9e53ff851496..ade2fa07a50a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
> >   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> >   				 struct amdgpu_reset_context *reset_context)
> >   {
> > -	int i, r = 0;
> > +	int i, j, r = 0;
> >   	struct amdgpu_job *job = NULL;
> >   	bool need_full_reset =
> >   		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> > @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> >   		if (!ring || !ring->sched.thread)
> >   			continue;
> > +		/*clear job fence from fence drv to avoid force_completion
> > +		 *leave NULL and vm flush fence in fence drv */
> > +		for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) {
> > +			struct dma_fence *old,**ptr;
> > +			ptr = &ring->fence_drv.fences[j];
> > +			old = rcu_dereference_protected(*ptr, 1);
> > +			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
> > +				RCU_INIT_POINTER(*ptr, NULL);
> > +			}
> > +		}
> >   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
> >   		amdgpu_fence_driver_force_completion(ring);
> >   	}
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > index 5e29d797a265..c9752cf794fb 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
> >   	}
> >   	seq = ++ring->fence_drv.sync_seq;
> > -	dma_fence_init(fence, &amdgpu_fence_ops,
> > -		       &ring->fence_drv.lock,
> > -		       adev->fence_context + ring->idx,
> > -		       seq);
> > +	if (job != NULL && job->job_run_counter) {
> > +		/* reinit seq for resubmitted jobs */
> > +		fence->seqno = seq;
> > +	} else {
> > +		dma_fence_init(fence, &amdgpu_fence_ops,
> > +				&ring->fence_drv.lock,
> > +				adev->fence_context + ring->idx,
> > +				seq);
> > +	}
> 
> 
> I think this should be in the first patch actually (and the counter too),
> without it the first patch is buggy.
> 
I originally split these two patches by separating the job submission
sequence from the TDR sequence. But yes, I should merge these two
patches, otherwise the TDR sequence will fail without the second patch.
Will send a merged version today.

Best Regards,
Jingwen
> 
> >   	if (job != NULL) {
> >   		/* mark this fence has a parent job */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > index 65a395060de2..19b13a65c73b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> >   		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
> >   	if (finished->error < 0) {
> > +		dma_fence_put(&job->hw_fence);
> 
> 
> Would put this check bellow with the job_run_counter check
> 
> 
> >   		DRM_INFO("Skip scheduling IBs!\n");
> >   	} else {
> >   		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
> > @@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> >   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
> >   	}
> > -	dma_fence_get(fence);
> > +	if (!job->job_run_counter)
> > +		dma_fence_get(fence);
> > +	job->job_run_counter ++;
> >   	amdgpu_job_free_resources(job);
> 
> 
> Here you  modify code you already changed in patch 1, looks to me
> like those 2 patches should be squashed into one patch as the changes are
> directly dependent and it's hard to follow
> 
> Andrey
> 
> 
> >   	fence = r ? ERR_PTR(r) : fence;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > index 92324c978534..1fa667f245e1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > @@ -64,6 +64,9 @@ struct amdgpu_job {
> >   	/* user fence handling */
> >   	uint64_t		uf_addr;
> >   	uint64_t		uf_sequence;
> > +
> > +	/* job_run_counter >= 1 means a resubmit job */
> > +	uint32_t		job_run_counter;
> >   };
> >   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence
  2021-08-10  2:51     ` Jingwen Chen
@ 2021-08-10  3:25       ` Jingwen Chen
  0 siblings, 0 replies; 10+ messages in thread
From: Jingwen Chen @ 2021-08-10  3:25 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx; +Cc: monk.liu, christian.koenig, Jack Zhang

Hi Andrey,

The latest patch, "[PATCH v4] drm/amd/amdgpu embed hw_fence into
amdgpu_job", has been sent to amd-gfx. Can you help review this patch?

Best Regards,
Jingwen
On Tue Aug 10, 2021 at 10:51:17AM +0800, Jingwen Chen wrote:
> On Mon Aug 09, 2021 at 12:24:37PM -0400, Andrey Grodzovsky wrote:
> > 
> > On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > > [Why]
> > > After embeded hw_fence to amdgpu_job, we need to add tdr support
> > > for this feature.
> > > 
> > > [How]
> > > 1. Add a resubmit_flag for resubmit jobs.
> > > 2. Clear job fence from RCU and force complete vm flush fences in
> > >     pre_asic_reset
> > > 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put
> > >     for guilty jobs.
> > > v2:
> > > use a job_run_counter in amdgpu_job to replace the resubmit_flag in
> > > drm_sched_job. When the job_run_counter >= 1, it means this job is a
> > > resubmit job.
> > > 
> > > Signed-off-by: Jack Zhang <Jack.Zhang7@hotmail.com>
> > > Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +++++++++----
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  5 ++++-
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 +++
> > >   4 files changed, 27 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > index 9e53ff851496..ade2fa07a50a 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
> > >   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> > >   				 struct amdgpu_reset_context *reset_context)
> > >   {
> > > -	int i, r = 0;
> > > +	int i, j, r = 0;
> > >   	struct amdgpu_job *job = NULL;
> > >   	bool need_full_reset =
> > >   		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> > > @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> > >   		if (!ring || !ring->sched.thread)
> > >   			continue;
> > > +		/*clear job fence from fence drv to avoid force_completion
> > > +		 *leave NULL and vm flush fence in fence drv */
> > > +		for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) {
> > > +			struct dma_fence *old,**ptr;
> > > +			ptr = &ring->fence_drv.fences[j];
> > > +			old = rcu_dereference_protected(*ptr, 1);
> > > +			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
> > > +				RCU_INIT_POINTER(*ptr, NULL);
> > > +			}
> > > +		}
> > >   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
> > >   		amdgpu_fence_driver_force_completion(ring);
> > >   	}
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > index 5e29d797a265..c9752cf794fb 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
> > >   	}
> > >   	seq = ++ring->fence_drv.sync_seq;
> > > -	dma_fence_init(fence, &amdgpu_fence_ops,
> > > -		       &ring->fence_drv.lock,
> > > -		       adev->fence_context + ring->idx,
> > > -		       seq);
> > > +	if (job != NULL && job->job_run_counter) {
> > > +		/* reinit seq for resubmitted jobs */
> > > +		fence->seqno = seq;
> > > +	} else {
> > > +		dma_fence_init(fence, &amdgpu_fence_ops,
> > > +				&ring->fence_drv.lock,
> > > +				adev->fence_context + ring->idx,
> > > +				seq);
> > > +	}
> > 
> > 
> > I think this should be in the first patch actually (and the counter too),
> > without it the first patch is buggy.
> > 
> I was originally split these two patches by adding job submission
> seqence and adding tdr sequence, But yes, I should merge these two
> patches otherwise the tdr sequence will fail without second patch.
> Will send a merged version today.
> 
> Best Regards,
> Jingwen
> > 
> > >   	if (job != NULL) {
> > >   		/* mark this fence has a parent job */
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > index 65a395060de2..19b13a65c73b 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> > >   		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
> > >   	if (finished->error < 0) {
> > > +		dma_fence_put(&job->hw_fence);
> > 
> > 
> > Would put this check bellow with the job_run_counter check
> > 
> > 
> > >   		DRM_INFO("Skip scheduling IBs!\n");
> > >   	} else {
> > >   		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
> > > @@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> > >   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
> > >   	}
> > > -	dma_fence_get(fence);
> > > +	if (!job->job_run_counter)
> > > +		dma_fence_get(fence);
> > > +	job->job_run_counter ++;
> > >   	amdgpu_job_free_resources(job);
> > 
> > 
> > Here you  modify code you already changed in patch 1, looks to me
> > like those 2 patches should be squashed into one patch as the changes are
> > directly dependent and it's hard to follow
> > 
> > Andrey
> > 
> > 
> > >   	fence = r ? ERR_PTR(r) : fence;
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > index 92324c978534..1fa667f245e1 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > > @@ -64,6 +64,9 @@ struct amdgpu_job {
> > >   	/* user fence handling */
> > >   	uint64_t		uf_addr;
> > >   	uint64_t		uf_sequence;
> > > +
> > > +	/* job_run_counter >= 1 means a resubmit job */
> > > +	uint32_t		job_run_counter;
> > >   };
> > >   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-08-10  3:25 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-05  8:31 [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job Jingwen Chen
2021-08-05  8:31 ` [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence Jingwen Chen
2021-08-09 16:24   ` Andrey Grodzovsky
2021-08-10  2:51     ` Jingwen Chen
2021-08-10  3:25       ` Jingwen Chen
2021-08-05 21:13 ` [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job Andrey Grodzovsky
2021-08-06  5:52   ` Jingwen Chen
2021-08-06  9:48     ` Christian König
2021-08-09  2:18       ` Jingwen Chen
2021-08-09  3:05         ` Jingwen Chen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.