* Final gang submit patches
@ 2022-09-16  9:08 Christian König
  2022-09-16  9:08 ` [PATCH 1/8] drm/amdgpu: cleanup CS pass2 v5 Christian König
                   ` (8 more replies)
  0 siblings, 9 replies; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, timur.kristof

Hey guys,

so thanks to Ruijing, I was finally able to hammer out all the known VCN
regressions from this patch set.

Alex, can you review those? AFAIK you are pretty much the only other
person deep enough into the CS IOCTL code for that.

Going to tackle the userptr issue Bas stumbled over next.

Thanks,
Christian.




* [PATCH 1/8] drm/amdgpu: cleanup CS pass2 v5
  2022-09-16  9:08 Final gang submit patches Christian König
@ 2022-09-16  9:08 ` Christian König
  2022-09-16  9:08 ` [PATCH 2/8] drm/amdgpu: cleanup error handling in amdgpu_cs_parser_bos Christian König
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, Christian König, timur.kristof

Clean up the coding style and function names to represent the data
they process, now for pass2 as well.

Go over the chunks only twice now instead of multiple times.
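
As a simplified illustration (not part of the patch itself), the resulting
control flow in amdgpu_cs_ioctl() boils down to two passes, each walking the
chunk array exactly once:

	r = amdgpu_cs_pass1(&parser, data);	/* copy chunks, count IBs, allocate the job */
	if (r)
		goto error_fini;

	r = amdgpu_cs_pass2(&parser);		/* fill IBs, dependencies and syncobj chunks */
	if (r)
		goto error_fini;

	/* buffer list handling, IB patching, VM handling and submit follow */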

v2: fix job initialisation order and use correct scheduler instance
v3: try to move all functional changes into a separate patch.
v4: separate reordering, pass1 and pass2 change
v5: fix va_start calculation

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 352 ++++++++++++-------------
 1 file changed, 168 insertions(+), 184 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index e104d7ef3c3d..c04073f4ead5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -278,93 +278,84 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 	return ret;
 }
 
-static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
-			     struct amdgpu_cs_parser *parser)
+static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
+			   struct amdgpu_cs_chunk *chunk,
+			   unsigned int *num_ibs,
+			   unsigned int *ce_preempt,
+			   unsigned int *de_preempt)
 {
-	struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
+	struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
+	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	struct amdgpu_ib *ib = &p->job->ibs[*num_ibs];
 	struct amdgpu_vm *vm = &fpriv->vm;
-	int r, ce_preempt = 0, de_preempt = 0;
+	struct drm_sched_entity *entity;
 	struct amdgpu_ring *ring;
-	int i, j;
-
-	for (i = 0, j = 0; i < parser->nchunks && j < parser->job->num_ibs; i++) {
-		struct amdgpu_cs_chunk *chunk;
-		struct amdgpu_ib *ib;
-		struct drm_amdgpu_cs_chunk_ib *chunk_ib;
-		struct drm_sched_entity *entity;
-
-		chunk = &parser->chunks[i];
-		ib = &parser->job->ibs[j];
-		chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
+	int r;
 
-		if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
-			continue;
+	r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
+				  chunk_ib->ip_instance,
+				  chunk_ib->ring, &entity);
+	if (r)
+		return r;
 
-		if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
-		    chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
-			if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
-				ce_preempt++;
-			else
-				de_preempt++;
+	/*
+	 * Abort if there is no run queue associated with this entity.
+	 * Possibly because of disabled HW IP.
+	 */
+	if (entity->rq == NULL)
+		return -EINVAL;
 
-			/* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */
-			if (ce_preempt > 1 || de_preempt > 1)
-				return -EINVAL;
-		}
+	/* Currently we don't support submitting to multiple entities */
+	if (p->entity && p->entity != entity)
+		return -EINVAL;
 
-		r = amdgpu_ctx_get_entity(parser->ctx, chunk_ib->ip_type,
-					  chunk_ib->ip_instance, chunk_ib->ring,
-					  &entity);
-		if (r)
-			return r;
+	p->entity = entity;
 
-		if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
-			parser->job->preamble_status |=
-				AMDGPU_PREAMBLE_IB_PRESENT;
+	ring = to_amdgpu_ring(entity->rq->sched);
+	/* MM engine doesn't support user fences */
+	if (p->job->uf_addr && ring->funcs->no_user_fence)
+		return -EINVAL;
 
-		if (parser->entity && parser->entity != entity)
-			return -EINVAL;
+	if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
+	    chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
+		if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
+			(*ce_preempt)++;
+		else
+			(*de_preempt)++;
 
-		/* Return if there is no run queue associated with this entity.
-		 * Possibly because of disabled HW IP*/
-		if (entity->rq == NULL)
+		/* Each GFX command submit allows only 1 IB max
+		 * preemptible for CE & DE */
+		if (*ce_preempt > 1 || *de_preempt > 1)
 			return -EINVAL;
+	}
 
-		parser->entity = entity;
-
-		ring = to_amdgpu_ring(entity->rq->sched);
-		r =  amdgpu_ib_get(adev, vm, ring->funcs->parse_cs ?
-				   chunk_ib->ib_bytes : 0,
-				   AMDGPU_IB_POOL_DELAYED, ib);
-		if (r) {
-			DRM_ERROR("Failed to get ib !\n");
-			return r;
-		}
-
-		ib->gpu_addr = chunk_ib->va_start;
-		ib->length_dw = chunk_ib->ib_bytes / 4;
-		ib->flags = chunk_ib->flags;
+	if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
+		p->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
 
-		j++;
+	r =  amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
+			   chunk_ib->ib_bytes : 0,
+			   AMDGPU_IB_POOL_DELAYED, ib);
+	if (r) {
+		DRM_ERROR("Failed to get ib !\n");
+		return r;
 	}
 
-	/* MM engine doesn't support user fences */
-	ring = to_amdgpu_ring(parser->entity->rq->sched);
-	if (parser->job->uf_addr && ring->funcs->no_user_fence)
-		return -EINVAL;
+	ib->gpu_addr = chunk_ib->va_start;
+	ib->length_dw = chunk_ib->ib_bytes / 4;
+	ib->flags = chunk_ib->flags;
 
+	(*num_ibs)++;
 	return 0;
 }
 
-static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
-				       struct amdgpu_cs_chunk *chunk)
+static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
+				     struct amdgpu_cs_chunk *chunk)
 {
+	struct drm_amdgpu_cs_chunk_dep *deps = chunk->kdata;
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
 	unsigned num_deps;
 	int i, r;
-	struct drm_amdgpu_cs_chunk_dep *deps;
 
-	deps = (struct drm_amdgpu_cs_chunk_dep *)chunk->kdata;
 	num_deps = chunk->length_dw * 4 /
 		sizeof(struct drm_amdgpu_cs_chunk_dep);
 
@@ -410,9 +401,9 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
 	return 0;
 }
 
-static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p,
-						 uint32_t handle, u64 point,
-						 u64 flags)
+static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
+					 uint32_t handle, u64 point,
+					 u64 flags)
 {
 	struct dma_fence *fence;
 	int r;
@@ -430,19 +421,17 @@ static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p,
 	return r;
 }
 
-static int amdgpu_cs_process_syncobj_in_dep(struct amdgpu_cs_parser *p,
-					    struct amdgpu_cs_chunk *chunk)
+static int amdgpu_cs_p2_syncobj_in(struct amdgpu_cs_parser *p,
+				   struct amdgpu_cs_chunk *chunk)
 {
-	struct drm_amdgpu_cs_chunk_sem *deps;
+	struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata;
 	unsigned num_deps;
 	int i, r;
 
-	deps = (struct drm_amdgpu_cs_chunk_sem *)chunk->kdata;
 	num_deps = chunk->length_dw * 4 /
 		sizeof(struct drm_amdgpu_cs_chunk_sem);
 	for (i = 0; i < num_deps; ++i) {
-		r = amdgpu_syncobj_lookup_and_add_to_sync(p, deps[i].handle,
-							  0, 0);
+		r = amdgpu_syncobj_lookup_and_add(p, deps[i].handle, 0, 0);
 		if (r)
 			return r;
 	}
@@ -450,21 +439,19 @@ static int amdgpu_cs_process_syncobj_in_dep(struct amdgpu_cs_parser *p,
 	return 0;
 }
 
-static int amdgpu_cs_process_syncobj_timeline_in_dep(struct amdgpu_cs_parser *p,
-						     struct amdgpu_cs_chunk *chunk)
+static int amdgpu_cs_p2_syncobj_timeline_wait(struct amdgpu_cs_parser *p,
+					      struct amdgpu_cs_chunk *chunk)
 {
-	struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps;
+	struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata;
 	unsigned num_deps;
 	int i, r;
 
-	syncobj_deps = (struct drm_amdgpu_cs_chunk_syncobj *)chunk->kdata;
 	num_deps = chunk->length_dw * 4 /
 		sizeof(struct drm_amdgpu_cs_chunk_syncobj);
 	for (i = 0; i < num_deps; ++i) {
-		r = amdgpu_syncobj_lookup_and_add_to_sync(p,
-							  syncobj_deps[i].handle,
-							  syncobj_deps[i].point,
-							  syncobj_deps[i].flags);
+		r = amdgpu_syncobj_lookup_and_add(p, syncobj_deps[i].handle,
+						  syncobj_deps[i].point,
+						  syncobj_deps[i].flags);
 		if (r)
 			return r;
 	}
@@ -472,14 +459,13 @@ static int amdgpu_cs_process_syncobj_timeline_in_dep(struct amdgpu_cs_parser *p,
 	return 0;
 }
 
-static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p,
-					     struct amdgpu_cs_chunk *chunk)
+static int amdgpu_cs_p2_syncobj_out(struct amdgpu_cs_parser *p,
+				    struct amdgpu_cs_chunk *chunk)
 {
-	struct drm_amdgpu_cs_chunk_sem *deps;
+	struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata;
 	unsigned num_deps;
 	int i;
 
-	deps = (struct drm_amdgpu_cs_chunk_sem *)chunk->kdata;
 	num_deps = chunk->length_dw * 4 /
 		sizeof(struct drm_amdgpu_cs_chunk_sem);
 
@@ -507,15 +493,13 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p,
 	return 0;
 }
 
-
-static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p,
-						      struct amdgpu_cs_chunk *chunk)
+static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
+						struct amdgpu_cs_chunk *chunk)
 {
-	struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps;
+	struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata;
 	unsigned num_deps;
 	int i;
 
-	syncobj_deps = (struct drm_amdgpu_cs_chunk_syncobj *)chunk->kdata;
 	num_deps = chunk->length_dw * 4 /
 		sizeof(struct drm_amdgpu_cs_chunk_syncobj);
 
@@ -552,9 +536,9 @@ static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p
 	return 0;
 }
 
-static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
-				  struct amdgpu_cs_parser *p)
+static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
 {
+	unsigned int num_ibs = 0, ce_preempt = 0, de_preempt = 0;
 	int i, r;
 
 	for (i = 0; i < p->nchunks; ++i) {
@@ -563,29 +547,35 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
 		chunk = &p->chunks[i];
 
 		switch (chunk->chunk_id) {
+		case AMDGPU_CHUNK_ID_IB:
+			r = amdgpu_cs_p2_ib(p, chunk, &num_ibs,
+					    &ce_preempt, &de_preempt);
+			if (r)
+				return r;
+			break;
 		case AMDGPU_CHUNK_ID_DEPENDENCIES:
 		case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
-			r = amdgpu_cs_process_fence_dep(p, chunk);
+			r = amdgpu_cs_p2_dependencies(p, chunk);
 			if (r)
 				return r;
 			break;
 		case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
-			r = amdgpu_cs_process_syncobj_in_dep(p, chunk);
+			r = amdgpu_cs_p2_syncobj_in(p, chunk);
 			if (r)
 				return r;
 			break;
 		case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
-			r = amdgpu_cs_process_syncobj_out_dep(p, chunk);
+			r = amdgpu_cs_p2_syncobj_out(p, chunk);
 			if (r)
 				return r;
 			break;
 		case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
-			r = amdgpu_cs_process_syncobj_timeline_in_dep(p, chunk);
+			r = amdgpu_cs_p2_syncobj_timeline_wait(p, chunk);
 			if (r)
 				return r;
 			break;
 		case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
-			r = amdgpu_cs_process_syncobj_timeline_out_dep(p, chunk);
+			r = amdgpu_cs_p2_syncobj_timeline_signal(p, chunk);
 			if (r)
 				return r;
 			break;
@@ -987,78 +977,74 @@ static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
 		trace_amdgpu_cs(parser, i);
 }
 
-static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
+static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
 {
 	struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched);
-	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
-	struct amdgpu_device *adev = p->adev;
-	struct amdgpu_vm *vm = &fpriv->vm;
-	struct amdgpu_bo_list_entry *e;
-	struct amdgpu_bo_va *bo_va;
-	struct amdgpu_bo *bo;
+	struct amdgpu_job *job = p->job;
+	unsigned int i;
 	int r;
 
 	/* Only for UVD/VCE VM emulation */
-	if (ring->funcs->parse_cs || ring->funcs->patch_cs_in_place) {
-		unsigned i, j;
-
-		for (i = 0, j = 0; i < p->nchunks && j < p->job->num_ibs; i++) {
-			struct drm_amdgpu_cs_chunk_ib *chunk_ib;
-			struct amdgpu_bo_va_mapping *m;
-			struct amdgpu_bo *aobj = NULL;
-			struct amdgpu_cs_chunk *chunk;
-			uint64_t offset, va_start;
-			struct amdgpu_ib *ib;
-			uint8_t *kptr;
-
-			chunk = &p->chunks[i];
-			ib = &p->job->ibs[j];
-			chunk_ib = chunk->kdata;
-
-			if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
-				continue;
+	if (!ring->funcs->parse_cs && !ring->funcs->patch_cs_in_place)
+		return 0;
 
-			va_start = chunk_ib->va_start & AMDGPU_GMC_HOLE_MASK;
-			r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m);
-			if (r) {
-				DRM_ERROR("IB va_start is invalid\n");
-				return r;
-			}
+	for (i = 0; i < job->num_ibs; ++i) {
+		struct amdgpu_ib *ib = &job->ibs[i];
+		struct amdgpu_bo_va_mapping *m;
+		struct amdgpu_bo *aobj;
+		uint64_t va_start;
+		uint8_t *kptr;
 
-			if ((va_start + chunk_ib->ib_bytes) >
-			    (m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
-				DRM_ERROR("IB va_start+ib_bytes is invalid\n");
-				return -EINVAL;
-			}
+		va_start = ib->gpu_addr & AMDGPU_GMC_HOLE_MASK;
+		r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m);
+		if (r) {
+			DRM_ERROR("IB va_start is invalid\n");
+			return r;
+		}
 
-			/* the IB should be reserved at this point */
-			r = amdgpu_bo_kmap(aobj, (void **)&kptr);
-			if (r) {
-				return r;
-			}
+		if ((va_start + ib->length_dw * 4) >
+		    (m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
+			DRM_ERROR("IB va_start+ib_bytes is invalid\n");
+			return -EINVAL;
+		}
 
-			offset = m->start * AMDGPU_GPU_PAGE_SIZE;
-			kptr += va_start - offset;
-
-			if (ring->funcs->parse_cs) {
-				memcpy(ib->ptr, kptr, chunk_ib->ib_bytes);
-				amdgpu_bo_kunmap(aobj);
-
-				r = amdgpu_ring_parse_cs(ring, p, p->job, ib);
-				if (r)
-					return r;
-			} else {
-				ib->ptr = (uint32_t *)kptr;
-				r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib);
-				amdgpu_bo_kunmap(aobj);
-				if (r)
-					return r;
-			}
+		/* the IB should be reserved at this point */
+		r = amdgpu_bo_kmap(aobj, (void **)&kptr);
+		if (r) {
+			return r;
+		}
+
+		kptr += va_start - (m->start * AMDGPU_GPU_PAGE_SIZE);
 
-			j++;
+		if (ring->funcs->parse_cs) {
+			memcpy(ib->ptr, kptr, ib->length_dw * 4);
+			amdgpu_bo_kunmap(aobj);
+
+			r = amdgpu_ring_parse_cs(ring, p, p->job, ib);
+			if (r)
+				return r;
+		} else {
+			ib->ptr = (uint32_t *)kptr;
+			r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib);
+			amdgpu_bo_kunmap(aobj);
+			if (r)
+				return r;
 		}
 	}
 
+	return 0;
+}
+
+static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
+{
+	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	struct amdgpu_device *adev = p->adev;
+	struct amdgpu_vm *vm = &fpriv->vm;
+	struct amdgpu_bo_list_entry *e;
+	struct amdgpu_bo_va *bo_va;
+	struct amdgpu_bo *bo;
+	int r;
+
 	if (!p->job->vm)
 		return 0;
 
@@ -1186,7 +1172,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	job = p->job;
 	p->job = NULL;
 
-	r = drm_sched_job_init(&job->base, entity, &fpriv->vm);
+	r = drm_sched_job_init(&job->base, p->entity, &fpriv->vm);
 	if (r)
 		goto error_unlock;
 
@@ -1253,17 +1239,10 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 }
 
 /* Cleanup the parser structure */
-static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
-				  bool backoff)
+static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
 {
 	unsigned i;
 
-	if (error && backoff) {
-		ttm_eu_backoff_reservation(&parser->ticket,
-					   &parser->validated);
-		mutex_unlock(&parser->bo_list->bo_list_mutex);
-	}
-
 	for (i = 0; i < parser->num_post_deps; i++) {
 		drm_syncobj_put(parser->post_deps[i].syncobj);
 		kfree(parser->post_deps[i].chain);
@@ -1272,8 +1251,9 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
 
 	dma_fence_put(parser->fence);
 
-	if (parser->ctx)
+	if (parser->ctx) {
 		amdgpu_ctx_put(parser->ctx);
+	}
 	if (parser->bo_list)
 		amdgpu_bo_list_put(parser->bo_list);
 
@@ -1293,7 +1273,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 {
 	struct amdgpu_device *adev = drm_to_adev(dev);
 	struct amdgpu_cs_parser parser;
-	bool reserved_buffers = false;
 	int r;
 
 	if (amdgpu_ras_intr_triggered())
@@ -1306,22 +1285,16 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 	if (r) {
 		if (printk_ratelimit())
 			DRM_ERROR("Failed to initialize parser %d!\n", r);
-		goto out;
+		return r;
 	}
 
 	r = amdgpu_cs_pass1(&parser, data);
 	if (r)
-		goto out;
+		goto error_fini;
 
-	r = amdgpu_cs_ib_fill(adev, &parser);
+	r = amdgpu_cs_pass2(&parser);
 	if (r)
-		goto out;
-
-	r = amdgpu_cs_dependencies(adev, &parser);
-	if (r) {
-		DRM_ERROR("Failed in the dependencies handling %d!\n", r);
-		goto out;
-	}
+		goto error_fini;
 
 	r = amdgpu_cs_parser_bos(&parser, data);
 	if (r) {
@@ -1329,25 +1302,36 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 			DRM_ERROR("Not enough memory for command submission!\n");
 		else if (r != -ERESTARTSYS && r != -EAGAIN)
 			DRM_ERROR("Failed to process the buffer list %d!\n", r);
-		goto out;
+		goto error_fini;
 	}
 
-	reserved_buffers = true;
-
-	trace_amdgpu_cs_ibs(&parser);
+	r = amdgpu_cs_patch_ibs(&parser);
+	if (r)
+		goto error_backoff;
 
 	r = amdgpu_cs_vm_handling(&parser);
 	if (r)
-		goto out;
+		goto error_backoff;
 
 	r = amdgpu_cs_sync_rings(&parser);
 	if (r)
-		goto out;
+		goto error_backoff;
+
+	trace_amdgpu_cs_ibs(&parser);
 
 	r = amdgpu_cs_submit(&parser, data);
-out:
-	amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
+	if (r)
+		goto error_backoff;
+
+	amdgpu_cs_parser_fini(&parser);
+	return 0;
+
+error_backoff:
+	ttm_eu_backoff_reservation(&parser.ticket, &parser.validated);
+	mutex_unlock(&parser.bo_list->bo_list_mutex);
 
+error_fini:
+	amdgpu_cs_parser_fini(&parser);
 	return r;
 }
 
-- 
2.25.1



* [PATCH 2/8] drm/amdgpu: cleanup error handling in amdgpu_cs_parser_bos
  2022-09-16  9:08 Final gang submit patches Christian König
  2022-09-16  9:08 ` [PATCH 1/8] drm/amdgpu: cleanup CS pass2 v5 Christian König
@ 2022-09-16  9:08 ` Christian König
  2022-09-16  9:08 ` [PATCH 3/8] drm/amdgpu: fix user fence CS check Christian König
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, Christian König, timur.kristof

Return early on success and so remove all those "if (r)" checks in the
error path.
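
In other words, the function now follows the usual early-return pattern. A
generic sketch of that pattern (helper names made up purely for illustration):

	static int example_setup(void)
	{
		int r;

		r = do_first_step();
		if (r)
			return r;

		r = do_second_step();
		if (r)
			goto error_undo_first;

		return 0;		/* success returns early, no trailing "if (r)" */

	error_undo_first:
		undo_first_step();
		return r;
	}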

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 37 +++++++++++++-------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index c04073f4ead5..5c37dde97ff2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -933,35 +933,34 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	if (r)
 		goto error_validate;
 
-	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
-				     p->bytes_moved_vis);
-
-	amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
-				 p->bo_list->gws_obj, p->bo_list->oa_obj);
-
-	if (!r && p->uf_entry.tv.bo) {
+	if (p->uf_entry.tv.bo) {
 		struct amdgpu_bo *uf = ttm_to_amdgpu_bo(p->uf_entry.tv.bo);
 
 		r = amdgpu_ttm_alloc_gart(&uf->tbo);
+		if (r)
+			goto error_validate;
+
 		p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
 	}
 
+	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
+				     p->bytes_moved_vis);
+	amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
+				 p->bo_list->gws_obj, p->bo_list->oa_obj);
+	return 0;
+
 error_validate:
-	if (r)
-		ttm_eu_backoff_reservation(&p->ticket, &p->validated);
+	ttm_eu_backoff_reservation(&p->ticket, &p->validated);
 
 out_free_user_pages:
-	if (r) {
-		amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
-			struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
+	amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
+		struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
 
-			if (!e->user_pages)
-				continue;
-			amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
-			kvfree(e->user_pages);
-			e->user_pages = NULL;
-		}
-		mutex_unlock(&p->bo_list->bo_list_mutex);
+		if (!e->user_pages)
+			continue;
+		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+		kvfree(e->user_pages);
+		e->user_pages = NULL;
 	}
 	return r;
 }
-- 
2.25.1



* [PATCH 3/8] drm/amdgpu: fix user fence CS check
  2022-09-16  9:08 Final gang submit patches Christian König
  2022-09-16  9:08 ` [PATCH 1/8] drm/amdgpu: cleanup CS pass2 v5 Christian König
  2022-09-16  9:08 ` [PATCH 2/8] drm/amdgpu: cleanup error handling in amdgpu_cs_parser_bos Christian König
@ 2022-09-16  9:08 ` Christian König
  2022-09-16 18:01   ` Alex Deucher
  2022-09-16  9:08 ` [PATCH 4/8] drm/amdgpu: move entity selection and job init earlier during CS Christian König
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, Christian König, timur.kristof

An offset of zero is valid, so check whether the BO is present instead.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 5c37dde97ff2..265ed2118a80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -313,7 +313,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
 
 	ring = to_amdgpu_ring(entity->rq->sched);
 	/* MM engine doesn't support user fences */
-	if (p->job->uf_addr && ring->funcs->no_user_fence)
+	if (p->uf_entry.tv.bo && ring->funcs->no_user_fence)
 		return -EINVAL;
 
 	if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
-- 
2.25.1



* [PATCH 4/8] drm/amdgpu: move entity selection and job init earlier during CS
  2022-09-16  9:08 Final gang submit patches Christian König
                   ` (2 preceding siblings ...)
  2022-09-16  9:08 ` [PATCH 3/8] drm/amdgpu: fix user fence CS check Christian König
@ 2022-09-16  9:08 ` Christian König
  2022-09-16  9:08 ` [PATCH 5/8] drm/amdgpu: revert "fix limiting AV1 to the first instance on VCN3" v3 Christian König
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, Christian König, timur.kristof

Initialize the entity for the CS and scheduler job much earlier.

v2: fix job initialisation order and use correct scheduler instance

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 54 ++++++++++++-------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  5 +++
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 265ed2118a80..58088c663125 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -68,6 +68,25 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
 			   struct drm_amdgpu_cs_chunk_ib *chunk_ib,
 			   unsigned int *num_ibs)
 {
+	struct drm_sched_entity *entity;
+	int r;
+
+	r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
+				  chunk_ib->ip_instance,
+				  chunk_ib->ring, &entity);
+	if (r)
+		return r;
+
+	/* Abort if there is no run queue associated with this entity.
+	 * Possibly because of disabled HW IP*/
+	if (entity->rq == NULL)
+		return -EINVAL;
+
+	/* Currently we don't support submitting to multiple entities */
+	if (p->entity && p->entity != entity)
+		return -EINVAL;
+
+	p->entity = entity;
 	++(*num_ibs);
 	return 0;
 }
@@ -250,6 +269,10 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 	if (ret)
 		goto free_all_kdata;
 
+	ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm);
+	if (ret)
+		goto free_all_kdata;
+
 	if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
 		ret = -ECANCELED;
 		goto free_all_kdata;
@@ -286,32 +309,11 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
 {
 	struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	struct amdgpu_ring *ring = amdgpu_job_ring(p->job);
 	struct amdgpu_ib *ib = &p->job->ibs[*num_ibs];
 	struct amdgpu_vm *vm = &fpriv->vm;
-	struct drm_sched_entity *entity;
-	struct amdgpu_ring *ring;
 	int r;
 
-	r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
-				  chunk_ib->ip_instance,
-				  chunk_ib->ring, &entity);
-	if (r)
-		return r;
-
-	/*
-	 * Abort if there is no run queue associated with this entity.
-	 * Possibly because of disabled HW IP.
-	 */
-	if (entity->rq == NULL)
-		return -EINVAL;
-
-	/* Currently we don't support submitting to multiple entities */
-	if (p->entity && p->entity != entity)
-		return -EINVAL;
-
-	p->entity = entity;
-
-	ring = to_amdgpu_ring(entity->rq->sched);
 	/* MM engine doesn't support user fences */
 	if (p->uf_entry.tv.bo && ring->funcs->no_user_fence)
 		return -EINVAL;
@@ -978,8 +980,8 @@ static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
 
 static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
 {
-	struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched);
 	struct amdgpu_job *job = p->job;
+	struct amdgpu_ring *ring = amdgpu_job_ring(job);
 	unsigned int i;
 	int r;
 
@@ -1171,10 +1173,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	job = p->job;
 	p->job = NULL;
 
-	r = drm_sched_job_init(&job->base, p->entity, &fpriv->vm);
-	if (r)
-		goto error_unlock;
-
 	drm_sched_job_arm(&job->base);
 
 	/* No memory allocation is allowed while holding the notifier lock.
@@ -1231,8 +1229,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 error_abort:
 	drm_sched_job_cleanup(&job->base);
 	mutex_unlock(&p->adev->notifier_lock);
-
-error_unlock:
 	amdgpu_job_free(job);
 	return r;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 2a1961bf1194..866d35bbe073 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -72,6 +72,11 @@ struct amdgpu_job {
 	struct amdgpu_ib	ibs[];
 };
 
+static inline struct amdgpu_ring *amdgpu_job_ring(struct amdgpu_job *job)
+{
+	return to_amdgpu_ring(job->base.entity->rq->sched);
+}
+
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
 		     struct amdgpu_job **job, struct amdgpu_vm *vm);
 int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
-- 
2.25.1



* [PATCH 5/8] drm/amdgpu: revert "fix limiting AV1 to the first instance on VCN3" v3
  2022-09-16  9:08 Final gang submit patches Christian König
                   ` (3 preceding siblings ...)
  2022-09-16  9:08 ` [PATCH 4/8] drm/amdgpu: move entity selection and job init earlier during CS Christian König
@ 2022-09-16  9:08 ` Christian König
  2022-09-16  9:08 ` [PATCH 6/8] drm/amdgpu: cleanup instance limit on VCN4 v3 Christian König
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, Christian König, timur.kristof

This reverts commit 250195ff744f260c169f5427422b6f39c58cb883.

The job should now be initialized when we reach the parser functions.

v2: merge improved application check into this patch
v3: back to the original test, but use the right ring

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index 39405f0db824..9c8b5fd99037 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
@@ -1761,21 +1761,23 @@ static const struct amdgpu_ring_funcs vcn_v3_0_dec_sw_ring_vm_funcs = {
 	.emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
 };
 
-static int vcn_v3_0_limit_sched(struct amdgpu_cs_parser *p)
+static int vcn_v3_0_limit_sched(struct amdgpu_cs_parser *p,
+				struct amdgpu_job *job)
 {
 	struct drm_gpu_scheduler **scheds;
 
 	/* The create msg must be in the first IB submitted */
-	if (atomic_read(&p->entity->fence_seq))
+	if (atomic_read(&job->base.entity->fence_seq))
 		return -EINVAL;
 
 	scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_DEC]
 		[AMDGPU_RING_PRIO_DEFAULT].sched;
-	drm_sched_entity_modify_sched(p->entity, scheds, 1);
+	drm_sched_entity_modify_sched(job->base.entity, scheds, 1);
 	return 0;
 }
 
-static int vcn_v3_0_dec_msg(struct amdgpu_cs_parser *p, uint64_t addr)
+static int vcn_v3_0_dec_msg(struct amdgpu_cs_parser *p, struct amdgpu_job *job,
+			    uint64_t addr)
 {
 	struct ttm_operation_ctx ctx = { false, false };
 	struct amdgpu_bo_va_mapping *map;
@@ -1846,7 +1848,7 @@ static int vcn_v3_0_dec_msg(struct amdgpu_cs_parser *p, uint64_t addr)
 		if (create[0] == 0x7 || create[0] == 0x10 || create[0] == 0x11)
 			continue;
 
-		r = vcn_v3_0_limit_sched(p);
+		r = vcn_v3_0_limit_sched(p, job);
 		if (r)
 			goto out;
 	}
@@ -1860,7 +1862,7 @@ static int vcn_v3_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p,
 					   struct amdgpu_job *job,
 					   struct amdgpu_ib *ib)
 {
-	struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched);
+	struct amdgpu_ring *ring = amdgpu_job_ring(job);
 	uint32_t msg_lo = 0, msg_hi = 0;
 	unsigned i;
 	int r;
@@ -1879,7 +1881,8 @@ static int vcn_v3_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p,
 			msg_hi = val;
 		} else if (reg == PACKET0(p->adev->vcn.internal.cmd, 0) &&
 			   val == 0) {
-			r = vcn_v3_0_dec_msg(p, ((u64)msg_hi) << 32 | msg_lo);
+			r = vcn_v3_0_dec_msg(p, job,
+					     ((u64)msg_hi) << 32 | msg_lo);
 			if (r)
 				return r;
 		}
-- 
2.25.1



* [PATCH 6/8] drm/amdgpu: cleanup instance limit on VCN4 v3
  2022-09-16  9:08 Final gang submit patches Christian König
                   ` (4 preceding siblings ...)
  2022-09-16  9:08 ` [PATCH 5/8] drm/amdgpu: revert "fix limiting AV1 to the first instance on VCN3" v3 Christian König
@ 2022-09-16  9:08 ` Christian König
  2022-09-16  9:08 ` [PATCH 7/8] drm/amdgpu: add gang submit backend v2 Christian König
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, Christian König, timur.kristof

Similar to what we did for VCN3, use the job instead of the parser
entity. Clean up the coding style quite a bit as well.

v2: merge improved application check into this patch
v3: finally fix the check

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 46 +++++++++++++++------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 09c89faa8c27..bdabe5da8d75 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -1591,21 +1591,23 @@ static void vcn_v4_0_unified_ring_set_wptr(struct amdgpu_ring *ring)
 	}
 }
 
-static int vcn_v4_0_limit_sched(struct amdgpu_cs_parser *p)
+static int vcn_v4_0_limit_sched(struct amdgpu_cs_parser *p,
+				struct amdgpu_job *job)
 {
 	struct drm_gpu_scheduler **scheds;
 
 	/* The create msg must be in the first IB submitted */
-	if (atomic_read(&p->entity->fence_seq))
+	if (atomic_read(&job->base.entity->fence_seq))
 		return -EINVAL;
 
-	scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_ENC]
-		[AMDGPU_RING_PRIO_0].sched;
-	drm_sched_entity_modify_sched(p->entity, scheds, 1);
+	scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_DEC]
+		[AMDGPU_RING_PRIO_DEFAULT].sched;
+	drm_sched_entity_modify_sched(job->base.entity, scheds, 1);
 	return 0;
 }
 
-static int vcn_v4_0_dec_msg(struct amdgpu_cs_parser *p, uint64_t addr)
+static int vcn_v4_0_dec_msg(struct amdgpu_cs_parser *p, struct amdgpu_job *job,
+			    uint64_t addr)
 {
 	struct ttm_operation_ctx ctx = { false, false };
 	struct amdgpu_bo_va_mapping *map;
@@ -1676,7 +1678,7 @@ static int vcn_v4_0_dec_msg(struct amdgpu_cs_parser *p, uint64_t addr)
 		if (create[0] == 0x7 || create[0] == 0x10 || create[0] == 0x11)
 			continue;
 
-		r = vcn_v4_0_limit_sched(p);
+		r = vcn_v4_0_limit_sched(p, job);
 		if (r)
 			goto out;
 	}
@@ -1689,32 +1691,34 @@ static int vcn_v4_0_dec_msg(struct amdgpu_cs_parser *p, uint64_t addr)
 #define RADEON_VCN_ENGINE_TYPE_DECODE                                 (0x00000003)
 
 static int vcn_v4_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p,
-				struct amdgpu_job *job,
-				struct amdgpu_ib *ib)
+					   struct amdgpu_job *job,
+					   struct amdgpu_ib *ib)
 {
-	struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched);
-	struct amdgpu_vcn_decode_buffer *decode_buffer = NULL;
+	struct amdgpu_ring *ring = amdgpu_job_ring(job);
+	struct amdgpu_vcn_decode_buffer *decode_buffer;
+	uint64_t addr;
 	uint32_t val;
-	int r = 0;
 
 	/* The first instance can decode anything */
 	if (!ring->me)
-		return r;
+		return 0;
 
 	/* unified queue ib header has 8 double words. */
 	if (ib->length_dw < 8)
-		return r;
+		return 0;
 
 	val = amdgpu_ib_get_value(ib, 6); //RADEON_VCN_ENGINE_TYPE
+	if (val != RADEON_VCN_ENGINE_TYPE_DECODE)
+		return 0;
 
-	if (val == RADEON_VCN_ENGINE_TYPE_DECODE) {
-		decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[10];
+	decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[10];
 
-		if (decode_buffer->valid_buf_flag  & 0x1)
-			r = vcn_v4_0_dec_msg(p, ((u64)decode_buffer->msg_buffer_address_hi) << 32 |
-						decode_buffer->msg_buffer_address_lo);
-	}
-	return r;
+	if (!(decode_buffer->valid_buf_flag  & 0x1))
+		return 0;
+
+	addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 |
+		decode_buffer->msg_buffer_address_lo;
+	return vcn_v4_0_dec_msg(p, job, addr);
 }
 
 static const struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs = {
-- 
2.25.1



* [PATCH 7/8] drm/amdgpu: add gang submit backend v2
  2022-09-16  9:08 Final gang submit patches Christian König
                   ` (5 preceding siblings ...)
  2022-09-16  9:08 ` [PATCH 6/8] drm/amdgpu: cleanup instance limit on VCN4 v3 Christian König
@ 2022-09-16  9:08 ` Christian König
  2022-09-16  9:08 ` [PATCH 8/8] drm/amdgpu: add gang submit frontend v5 Christian König
  2022-09-16 19:22 ` Final gang submit patches Alex Deucher
  8 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, Christian König, timur.kristof

Allow submitting jobs as a gang which needs to run on multiple
engines at the same time.

The basic idea is that we have a global gang submit fence representing when
the gang leader, which is pushed last, is finally scheduled to run on the
hardware.

Jobs submitted as a gang are never re-submitted in case of a GPU reset since
this won't work and would just deadlock the hardware again immediately.
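
As a simplified illustration of how the pieces below fit together (not taken
verbatim from the code), every job of a gang points at the leader's scheduled
fence, and the scheduler dependency callback keeps a new gang blocked until the
previous gang has fully reached the hardware:

	/* Both jobs already armed with drm_sched_job_arm(); the GFX job is the leader. */
	amdgpu_job_set_gang_leader(compute_job, gfx_job);
	amdgpu_job_set_gang_leader(gfx_job, gfx_job);

	/*
	 * amdgpu_job_dependency() later calls amdgpu_device_switch_gang() and
	 * returns the old gang leader fence for as long as it hasn't signaled,
	 * so the new gang is only scheduled once the previous gang has been
	 * completely pushed to the hardware.
	 */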

v2: fix logic inversion, improve documentation, fix rcu

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 35 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 28 +++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 ++
 4 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 79bb6fd83094..ae9371b172e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -885,6 +885,7 @@ struct amdgpu_device {
 	u64				fence_context;
 	unsigned			num_rings;
 	struct amdgpu_ring		*rings[AMDGPU_MAX_RINGS];
+	struct dma_fence __rcu		*gang_submit;
 	bool				ib_pool_ready;
 	struct amdgpu_sa_manager	ib_pools[AMDGPU_IB_POOL_MAX];
 	struct amdgpu_sched		gpu_sched[AMDGPU_HW_IP_NUM][AMDGPU_RING_PRIO_MAX];
@@ -1294,6 +1295,8 @@ u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
 				u32 reg);
 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
 				u32 reg, u32 v);
+struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
+					    struct dma_fence *gang);
 
 /* atpx handler */
 #if defined(CONFIG_VGA_SWITCHEROO)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 943c9e750575..b6f6445e50d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3503,6 +3503,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	adev->gmc.gart_size = 512 * 1024 * 1024;
 	adev->accel_working = false;
 	adev->num_rings = 0;
+	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
 	adev->mman.buffer_funcs = NULL;
 	adev->mman.buffer_funcs_ring = NULL;
 	adev->vm_manager.vm_pte_funcs = NULL;
@@ -3984,6 +3985,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 	release_firmware(adev->firmware.gpu_info_fw);
 	adev->firmware.gpu_info_fw = NULL;
 	adev->accel_working = false;
+	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
 
 	amdgpu_reset_fini(adev);
 
@@ -5919,3 +5921,36 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
 	(void)RREG32(data);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 }
+
+/**
+ * amdgpu_device_switch_gang - switch to a new gang
+ * @adev: amdgpu_device pointer
+ * @gang: the gang to switch to
+ *
+ * Try to switch to a new gang.
+ * Returns: NULL if we switched to the new gang or a reference to the current
+ * gang leader.
+ */
+struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
+					    struct dma_fence *gang)
+{
+	struct dma_fence *old = NULL;
+
+	do {
+		dma_fence_put(old);
+		rcu_read_lock();
+		old = dma_fence_get_rcu_safe(&adev->gang_submit);
+		rcu_read_unlock();
+
+		if (old == gang)
+			break;
+
+		if (!dma_fence_is_signaled(old))
+			return old;
+
+	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
+			 old, gang) != old);
+
+	dma_fence_put(old);
+	return NULL;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 877b3c22a8a8..cfbe19cfe9af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -173,11 +173,29 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
 	dma_fence_put(&job->hw_fence);
 }
 
+void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
+				struct amdgpu_job *leader)
+{
+	struct dma_fence *fence = &leader->base.s_fence->scheduled;
+
+	WARN_ON(job->gang_submit);
+
+	/*
+	 * Don't add a reference when we are the gang leader to avoid circular
+	 * dependency.
+	 */
+	if (job != leader)
+		dma_fence_get(fence);
+	job->gang_submit = fence;
+}
+
 void amdgpu_job_free(struct amdgpu_job *job)
 {
 	amdgpu_job_free_resources(job);
 	amdgpu_sync_free(&job->sync);
 	amdgpu_sync_free(&job->sched_sync);
+	if (job->gang_submit != &job->base.s_fence->scheduled)
+		dma_fence_put(job->gang_submit);
 
 	if (!job->hw_fence.ops)
 		kfree(job);
@@ -247,12 +265,16 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
 		fence = amdgpu_sync_get_fence(&job->sync);
 	}
 
+	if (!fence && job->gang_submit)
+		fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);
+
 	return fence;
 }
 
 static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 {
 	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
+	struct amdgpu_device *adev = ring->adev;
 	struct dma_fence *fence = NULL, *finished;
 	struct amdgpu_job *job;
 	int r = 0;
@@ -264,8 +286,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 
 	trace_amdgpu_sched_run_job(job);
 
-	if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
-		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
+	/* Skip job if VRAM is lost and never resubmit gangs */
+	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) ||
+	    (job->job_run_counter && job->gang_submit))
+		dma_fence_set_error(finished, -ECANCELED);
 
 	if (finished->error < 0) {
 		DRM_INFO("Skip scheduling IBs!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 866d35bbe073..ab7b150e5d50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -50,6 +50,7 @@ struct amdgpu_job {
 	struct amdgpu_sync	sync;
 	struct amdgpu_sync	sched_sync;
 	struct dma_fence	hw_fence;
+	struct dma_fence	*gang_submit;
 	uint32_t		preamble_status;
 	uint32_t                preemption_status;
 	bool                    vm_needs_flush;
@@ -84,6 +85,8 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
 void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds,
 			      struct amdgpu_bo *gws, struct amdgpu_bo *oa);
 void amdgpu_job_free_resources(struct amdgpu_job *job);
+void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
+				struct amdgpu_job *leader);
 void amdgpu_job_free(struct amdgpu_job *job);
 int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
 		      void *owner, struct dma_fence **f);
-- 
2.25.1



* [PATCH 8/8] drm/amdgpu: add gang submit frontend v5
  2022-09-16  9:08 Final gang submit patches Christian König
                   ` (6 preceding siblings ...)
  2022-09-16  9:08 ` [PATCH 7/8] drm/amdgpu: add gang submit backend v2 Christian König
@ 2022-09-16  9:08 ` Christian König
  2022-09-16 19:21   ` Alex Deucher
  2022-09-16 19:22 ` Final gang submit patches Alex Deucher
  8 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2022-09-16  9:08 UTC (permalink / raw)
  To: amd-gfx, alexander.deucher; +Cc: bas, Christian König, timur.kristof

Allow submitting jobs as a gang which needs to run on multiple engines at the
same time.

All members of the gang get the same implicit, explicit and VM dependencies, so
no gang member will start running until everything else is ready.

The last job is considered the gang leader (usually a submission to the GFX
ring) and is used for signaling output dependencies.

Each job is remembered individually as a user of a buffer object, so there is
no joining of work at the end.
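
From the userspace side a gang is simply more than one IB chunk targeting
different engines in a single CS ioctl. Roughly, as a sketch using the uapi
structures from amdgpu_drm.h (context, BO list and fence handling elided; the
addresses and sizes are placeholders):

	struct drm_amdgpu_cs_chunk_ib compute_ib = {
		.ip_type  = AMDGPU_HW_IP_COMPUTE,
		.va_start = compute_ib_va,
		.ib_bytes = compute_ib_size,
	};
	struct drm_amdgpu_cs_chunk_ib gfx_ib = {
		.ip_type  = AMDGPU_HW_IP_GFX,
		.va_start = gfx_ib_va,
		.ib_bytes = gfx_ib_size,
	};
	struct drm_amdgpu_cs_chunk chunks[] = {
		{ AMDGPU_CHUNK_ID_IB, sizeof(compute_ib) / 4, (uintptr_t)&compute_ib },
		/* the last distinct engine becomes the gang leader, here GFX */
		{ AMDGPU_CHUNK_ID_IB, sizeof(gfx_ib) / 4, (uintptr_t)&gfx_ib },
	};

	/* chunks[] is then passed through the DRM_IOCTL_AMDGPU_CS ioctl as usual. */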

v2: rebase and fix review comments from Andrey and Yogesh
v3: use READ instead of BOOKKEEP for now because of VM unmaps, set gang
    leader only when necessary
v4: fix order of pushing jobs and adding fences found by Trigger.
v5: fix job index calculation and adding IBs to jobs

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 269 ++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h     |  10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  |  12 +-
 5 files changed, 195 insertions(+), 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 4f5bd96000ec..c7b1a2dfde13 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -686,6 +686,7 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
 	ib->length_dw = ib_len;
 	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
 	job->vmid = vmid;
+	job->num_ibs = 1;
 
 	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 58088c663125..964052377991 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -64,11 +64,11 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
 	return 0;
 }
 
-static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
-			   struct drm_amdgpu_cs_chunk_ib *chunk_ib,
-			   unsigned int *num_ibs)
+static int amdgpu_cs_job_idx(struct amdgpu_cs_parser *p,
+			     struct drm_amdgpu_cs_chunk_ib *chunk_ib)
 {
 	struct drm_sched_entity *entity;
+	unsigned int i;
 	int r;
 
 	r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
@@ -77,17 +77,38 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
 	if (r)
 		return r;
 
-	/* Abort if there is no run queue associated with this entity.
-	 * Possibly because of disabled HW IP*/
+	/*
+	 * Abort if there is no run queue associated with this entity.
+	 * Possibly because of disabled HW IP.
+	 */
 	if (entity->rq == NULL)
 		return -EINVAL;
 
-	/* Currently we don't support submitting to multiple entities */
-	if (p->entity && p->entity != entity)
+	/* Check if we can add this IB to some existing job */
+	for (i = 0; i < p->gang_size; ++i)
+		if (p->entities[i] == entity)
+			return i;
+
+	/* If not increase the gang size if possible */
+	if (i == AMDGPU_CS_GANG_SIZE)
 		return -EINVAL;
 
-	p->entity = entity;
-	++(*num_ibs);
+	p->entities[i] = entity;
+	p->gang_size = i + 1;
+	return i;
+}
+
+static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
+			   struct drm_amdgpu_cs_chunk_ib *chunk_ib,
+			   unsigned int *num_ibs)
+{
+	int r;
+
+	r = amdgpu_cs_job_idx(p, chunk_ib);
+	if (r < 0)
+		return r;
+
+	++(num_ibs[r]);
 	return 0;
 }
 
@@ -161,11 +182,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 			   union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
 	struct amdgpu_vm *vm = &fpriv->vm;
 	uint64_t *chunk_array_user;
 	uint64_t *chunk_array;
-	unsigned size, num_ibs = 0;
 	uint32_t uf_offset = 0;
+	unsigned int size;
 	int ret;
 	int i;
 
@@ -228,7 +250,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 			if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
 				goto free_partial_kdata;
 
-			ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs);
+			ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
 			if (ret)
 				goto free_partial_kdata;
 			break;
@@ -265,21 +287,28 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 		}
 	}
 
-	ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
-	if (ret)
-		goto free_all_kdata;
+	if (!p->gang_size)
+		return -EINVAL;
 
-	ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm);
-	if (ret)
-		goto free_all_kdata;
+	for (i = 0; i < p->gang_size; ++i) {
+		ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm);
+		if (ret)
+			goto free_all_kdata;
 
-	if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
+		ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i],
+					 &fpriv->vm);
+		if (ret)
+			goto free_all_kdata;
+	}
+	p->gang_leader = p->jobs[p->gang_size - 1];
+
+	if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
 		ret = -ECANCELED;
 		goto free_all_kdata;
 	}
 
 	if (p->uf_entry.tv.bo)
-		p->job->uf_addr = uf_offset;
+		p->gang_leader->uf_addr = uf_offset;
 	kvfree(chunk_array);
 
 	/* Use this opportunity to fill in task info for the vm */
@@ -303,17 +332,25 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 
 static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
 			   struct amdgpu_cs_chunk *chunk,
-			   unsigned int *num_ibs,
 			   unsigned int *ce_preempt,
 			   unsigned int *de_preempt)
 {
 	struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
-	struct amdgpu_ring *ring = amdgpu_job_ring(p->job);
-	struct amdgpu_ib *ib = &p->job->ibs[*num_ibs];
 	struct amdgpu_vm *vm = &fpriv->vm;
+	struct amdgpu_ring *ring;
+	struct amdgpu_job *job;
+	struct amdgpu_ib *ib;
 	int r;
 
+	r = amdgpu_cs_job_idx(p, chunk_ib);
+	if (r < 0)
+		return r;
+
+	job = p->jobs[r];
+	ring = amdgpu_job_ring(job);
+	ib = &job->ibs[job->num_ibs++];
+
 	/* MM engine doesn't support user fences */
 	if (p->uf_entry.tv.bo && ring->funcs->no_user_fence)
 		return -EINVAL;
@@ -332,7 +369,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
 	}
 
 	if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
-		p->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
+		job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
 
 	r =  amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
 			   chunk_ib->ib_bytes : 0,
@@ -345,8 +382,6 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
 	ib->gpu_addr = chunk_ib->va_start;
 	ib->length_dw = chunk_ib->ib_bytes / 4;
 	ib->flags = chunk_ib->flags;
-
-	(*num_ibs)++;
 	return 0;
 }
 
@@ -395,7 +430,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
 			dma_fence_put(old);
 		}
 
-		r = amdgpu_sync_fence(&p->job->sync, fence);
+		r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
 		dma_fence_put(fence);
 		if (r)
 			return r;
@@ -417,7 +452,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
 		return r;
 	}
 
-	r = amdgpu_sync_fence(&p->job->sync, fence);
+	r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
 	dma_fence_put(fence);
 
 	return r;
@@ -540,7 +575,7 @@ static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
 
 static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
 {
-	unsigned int num_ibs = 0, ce_preempt = 0, de_preempt = 0;
+	unsigned int ce_preempt = 0, de_preempt = 0;
 	int i, r;
 
 	for (i = 0; i < p->nchunks; ++i) {
@@ -550,8 +585,7 @@ static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
 
 		switch (chunk->chunk_id) {
 		case AMDGPU_CHUNK_ID_IB:
-			r = amdgpu_cs_p2_ib(p, chunk, &num_ibs,
-					    &ce_preempt, &de_preempt);
+			r = amdgpu_cs_p2_ib(p, chunk, &ce_preempt, &de_preempt);
 			if (r)
 				return r;
 			break;
@@ -822,6 +856,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	struct amdgpu_vm *vm = &fpriv->vm;
 	struct amdgpu_bo_list_entry *e;
 	struct list_head duplicates;
+	unsigned int i;
 	int r;
 
 	INIT_LIST_HEAD(&p->validated);
@@ -905,16 +940,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		e->bo_va = amdgpu_vm_bo_find(vm, bo);
 	}
 
-	/* Move fence waiting after getting reservation lock of
-	 * PD root. Then there is no need on a ctx mutex lock.
-	 */
-	r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
-	if (unlikely(r != 0)) {
-		if (r != -ERESTARTSYS)
-			DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
-		goto error_validate;
-	}
-
 	amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
 					  &p->bytes_moved_vis_threshold);
 	p->bytes_moved = 0;
@@ -942,13 +967,16 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		if (r)
 			goto error_validate;
 
-		p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
+		p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(uf);
 	}
 
 	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
 				     p->bytes_moved_vis);
-	amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
-				 p->bo_list->gws_obj, p->bo_list->oa_obj);
+
+	for (i = 0; i < p->gang_size; ++i)
+		amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
+					 p->bo_list->gws_obj,
+					 p->bo_list->oa_obj);
 	return 0;
 
 error_validate:
@@ -967,20 +995,24 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	return r;
 }
 
-static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
+static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
 {
-	int i;
+	int i, j;
 
 	if (!trace_amdgpu_cs_enabled())
 		return;
 
-	for (i = 0; i < parser->job->num_ibs; i++)
-		trace_amdgpu_cs(parser, i);
+	for (i = 0; i < p->gang_size; ++i) {
+		struct amdgpu_job *job = p->jobs[i];
+
+		for (j = 0; j < job->num_ibs; ++j)
+			trace_amdgpu_cs(p, job, &job->ibs[j]);
+	}
 }
 
-static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
+static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p,
+			       struct amdgpu_job *job)
 {
-	struct amdgpu_job *job = p->job;
 	struct amdgpu_ring *ring = amdgpu_job_ring(job);
 	unsigned int i;
 	int r;
@@ -1021,12 +1053,12 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
 			memcpy(ib->ptr, kptr, ib->length_dw * 4);
 			amdgpu_bo_kunmap(aobj);
 
-			r = amdgpu_ring_parse_cs(ring, p, p->job, ib);
+			r = amdgpu_ring_parse_cs(ring, p, job, ib);
 			if (r)
 				return r;
 		} else {
 			ib->ptr = (uint32_t *)kptr;
-			r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib);
+			r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib);
 			amdgpu_bo_kunmap(aobj);
 			if (r)
 				return r;
@@ -1036,19 +1068,31 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
 	return 0;
 }
 
+static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p)
+{
+	unsigned int i;
+	int r;
+
+	for (i = 0; i < p->gang_size; ++i) {
+		r = amdgpu_cs_patch_ibs(p, p->jobs[i]);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
 static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	struct amdgpu_job *job = p->gang_leader;
 	struct amdgpu_device *adev = p->adev;
 	struct amdgpu_vm *vm = &fpriv->vm;
 	struct amdgpu_bo_list_entry *e;
 	struct amdgpu_bo_va *bo_va;
 	struct amdgpu_bo *bo;
+	unsigned int i;
 	int r;
 
-	if (!p->job->vm)
-		return 0;
-
 	r = amdgpu_vm_clear_freed(adev, vm, NULL);
 	if (r)
 		return r;
@@ -1057,7 +1101,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 	if (r)
 		return r;
 
-	r = amdgpu_sync_fence(&p->job->sync, fpriv->prt_va->last_pt_update);
+	r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
 	if (r)
 		return r;
 
@@ -1068,7 +1112,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 		if (r)
 			return r;
 
-		r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
+		r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
 		if (r)
 			return r;
 	}
@@ -1087,7 +1131,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 		if (r)
 			return r;
 
-		r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
+		r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
 		if (r)
 			return r;
 	}
@@ -1100,11 +1144,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 	if (r)
 		return r;
 
-	r = amdgpu_sync_fence(&p->job->sync, vm->last_update);
+	r = amdgpu_sync_fence(&job->sync, vm->last_update);
 	if (r)
 		return r;
 
-	p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
+	for (i = 0; i < p->gang_size; ++i) {
+		job = p->jobs[i];
+
+		if (!job->vm)
+			continue;
+
+		job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
+	}
 
 	if (amdgpu_vm_debug) {
 		/* Invalidate all BOs to test for userspace bugs */
@@ -1125,7 +1176,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	struct amdgpu_job *leader = p->gang_leader;
 	struct amdgpu_bo_list_entry *e;
+	unsigned int i;
 	int r;
 
 	list_for_each_entry(e, &p->validated, tv.head) {
@@ -1135,12 +1188,23 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
 
 		sync_mode = amdgpu_bo_explicit_sync(bo) ?
 			AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
-		r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode,
+		r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
 				     &fpriv->vm);
 		if (r)
 			return r;
 	}
-	return 0;
+
+	for (i = 0; i < p->gang_size - 1; ++i) {
+		r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
+		if (r)
+			return r;
+	}
+
+	r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_size - 1]);
+	if (r && r != -ERESTARTSYS)
+		DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
+
+	return r;
 }
 
 static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
@@ -1164,16 +1228,28 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 			    union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
-	struct drm_sched_entity *entity = p->entity;
+	struct amdgpu_job *leader = p->gang_leader;
 	struct amdgpu_bo_list_entry *e;
-	struct amdgpu_job *job;
+	unsigned int i;
 	uint64_t seq;
 	int r;
 
-	job = p->job;
-	p->job = NULL;
+	for (i = 0; i < p->gang_size; ++i)
+		drm_sched_job_arm(&p->jobs[i]->base);
 
-	drm_sched_job_arm(&job->base);
+	for (i = 0; i < (p->gang_size - 1); ++i) {
+		struct dma_fence *fence;
+
+		fence = &p->jobs[i]->base.s_fence->scheduled;
+		r = amdgpu_sync_fence(&leader->sync, fence);
+		if (r)
+			goto error_cleanup;
+	}
+
+	if (p->gang_size > 1) {
+		for (i = 0; i < p->gang_size; ++i)
+			amdgpu_job_set_gang_leader(p->jobs[i], leader);
+	}
 
 	/* No memory allocation is allowed while holding the notifier lock.
 	 * The lock is held until amdgpu_cs_submit is finished and fence is
@@ -1191,45 +1267,57 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	}
 	if (r) {
 		r = -EAGAIN;
-		goto error_abort;
+		goto error_unlock;
 	}
 
-	p->fence = dma_fence_get(&job->base.s_fence->finished);
+	p->fence = dma_fence_get(&leader->base.s_fence->finished);
+	list_for_each_entry(e, &p->validated, tv.head) {
+
+		/* Everybody except for the gang leader uses READ */
+		for (i = 0; i < (p->gang_size - 1); ++i) {
+			dma_resv_add_fence(e->tv.bo->base.resv,
+					   &p->jobs[i]->base.s_fence->finished,
+					   DMA_RESV_USAGE_READ);
+		}
+
+		/* The gang leader as remembered as writer */
+		e->tv.num_shared = 0;
+	}
 
-	seq = amdgpu_ctx_add_fence(p->ctx, entity, p->fence);
+	seq = amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_size - 1],
+				   p->fence);
 	amdgpu_cs_post_dependencies(p);
 
-	if ((job->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
+	if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
 	    !p->ctx->preamble_presented) {
-		job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
+		leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
 		p->ctx->preamble_presented = true;
 	}
 
 	cs->out.handle = seq;
-	job->uf_sequence = seq;
+	leader->uf_sequence = seq;
 
-	amdgpu_job_free_resources(job);
-
-	trace_amdgpu_cs_ioctl(job);
 	amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->ticket);
-	drm_sched_entity_push_job(&job->base);
+	for (i = 0; i < p->gang_size; ++i) {
+		amdgpu_job_free_resources(p->jobs[i]);
+		trace_amdgpu_cs_ioctl(p->jobs[i]);
+		drm_sched_entity_push_job(&p->jobs[i]->base);
+		p->jobs[i] = NULL;
+	}
 
 	amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
-
-	/* Make sure all BOs are remembered as writers */
-	amdgpu_bo_list_for_each_entry(e, p->bo_list)
-		e->tv.num_shared = 0;
-
 	ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
+
 	mutex_unlock(&p->adev->notifier_lock);
 	mutex_unlock(&p->bo_list->bo_list_mutex);
-
 	return 0;
 
-error_abort:
-	drm_sched_job_cleanup(&job->base);
+error_unlock:
 	mutex_unlock(&p->adev->notifier_lock);
-	amdgpu_job_free(job);
+
+error_cleanup:
+	for (i = 0; i < p->gang_size; ++i)
+		drm_sched_job_cleanup(&p->jobs[i]->base);
 	return r;
 }
 
@@ -1246,17 +1334,18 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
 
 	dma_fence_put(parser->fence);
 
-	if (parser->ctx) {
+	if (parser->ctx)
 		amdgpu_ctx_put(parser->ctx);
-	}
 	if (parser->bo_list)
 		amdgpu_bo_list_put(parser->bo_list);
 
 	for (i = 0; i < parser->nchunks; i++)
 		kvfree(parser->chunks[i].kdata);
 	kvfree(parser->chunks);
-	if (parser->job)
-		amdgpu_job_free(parser->job);
+	for (i = 0; i < parser->gang_size; ++i) {
+		if (parser->jobs[i])
+			amdgpu_job_free(parser->jobs[i]);
+	}
 	if (parser->uf_entry.tv.bo) {
 		struct amdgpu_bo *uf = ttm_to_amdgpu_bo(parser->uf_entry.tv.bo);
 
@@ -1300,7 +1389,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 		goto error_fini;
 	}
 
-	r = amdgpu_cs_patch_ibs(&parser);
+	r = amdgpu_cs_patch_jobs(&parser);
 	if (r)
 		goto error_backoff;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
index 30ecc4917f81..cbaa19b2b8a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
@@ -27,6 +27,8 @@
 #include "amdgpu_bo_list.h"
 #include "amdgpu_ring.h"
 
+#define AMDGPU_CS_GANG_SIZE	4
+
 struct amdgpu_bo_va_mapping;
 
 struct amdgpu_cs_chunk {
@@ -50,9 +52,11 @@ struct amdgpu_cs_parser {
 	unsigned		nchunks;
 	struct amdgpu_cs_chunk	*chunks;
 
-	/* scheduler job object */
-	struct amdgpu_job	*job;
-	struct drm_sched_entity	*entity;
+	/* scheduler job objects */
+	unsigned int		gang_size;
+	struct drm_sched_entity	*entities[AMDGPU_CS_GANG_SIZE];
+	struct amdgpu_job	*jobs[AMDGPU_CS_GANG_SIZE];
+	struct amdgpu_job	*gang_leader;
 
 	/* buffer objects */
 	struct ww_acquire_ctx		ticket;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index cfbe19cfe9af..46c99331d7f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -105,7 +105,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
 	 */
 	(*job)->base.sched = &adev->rings[0]->sched;
 	(*job)->vm = vm;
-	(*job)->num_ibs = num_ibs;
 
 	amdgpu_sync_create(&(*job)->sync);
 	amdgpu_sync_create(&(*job)->sched_sync);
@@ -125,6 +124,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
 	if (r)
 		return r;
 
+	(*job)->num_ibs = 1;
 	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
 	if (r)
 		kfree(*job);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 06dfcf297a8d..5e6ddc7e101c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -140,8 +140,10 @@ TRACE_EVENT(amdgpu_bo_create,
 );
 
 TRACE_EVENT(amdgpu_cs,
-	    TP_PROTO(struct amdgpu_cs_parser *p, int i),
-	    TP_ARGS(p, i),
+	    TP_PROTO(struct amdgpu_cs_parser *p,
+		     struct amdgpu_job *job,
+		     struct amdgpu_ib *ib),
+	    TP_ARGS(p, job, ib),
 	    TP_STRUCT__entry(
 			     __field(struct amdgpu_bo_list *, bo_list)
 			     __field(u32, ring)
@@ -151,10 +153,10 @@ TRACE_EVENT(amdgpu_cs,
 
 	    TP_fast_assign(
 			   __entry->bo_list = p->bo_list;
-			   __entry->ring = to_amdgpu_ring(p->entity->rq->sched)->idx;
-			   __entry->dw = p->job->ibs[i].length_dw;
+			   __entry->ring = to_amdgpu_ring(job->base.sched)->idx;
+			   __entry->dw = ib->length_dw;
 			   __entry->fences = amdgpu_fence_count_emitted(
-				to_amdgpu_ring(p->entity->rq->sched));
+				to_amdgpu_ring(job->base.sched));
 			   ),
 	    TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
 		      __entry->bo_list, __entry->ring, __entry->dw,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 3/8] drm/amdgpu: fix user fence CS check
  2022-09-16  9:08 ` [PATCH 3/8] drm/amdgpu: fix user fence CS check Christian König
@ 2022-09-16 18:01   ` Alex Deucher
  2022-09-16 18:24     ` Christian König
  0 siblings, 1 reply; 13+ messages in thread
From: Alex Deucher @ 2022-09-16 18:01 UTC (permalink / raw)
  To: Christian König
  Cc: alexander.deucher, timur.kristof, Christian König, amd-gfx, bas

On Fri, Sep 16, 2022 at 5:09 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> An offset of zero is valid, check if the BO is present or not.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>

Since this is a bug fix, should we make this the first patch in the series?

Alex

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 5c37dde97ff2..265ed2118a80 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -313,7 +313,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
>
>         ring = to_amdgpu_ring(entity->rq->sched);
>         /* MM engine doesn't support user fences */
> -       if (p->job->uf_addr && ring->funcs->no_user_fence)
> +       if (p->uf_entry.tv.bo && ring->funcs->no_user_fence)
>                 return -EINVAL;
>
>         if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 3/8] drm/amdgpu: fix user fence CS check
  2022-09-16 18:01   ` Alex Deucher
@ 2022-09-16 18:24     ` Christian König
  0 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2022-09-16 18:24 UTC (permalink / raw)
  To: Alex Deucher, Christian König
  Cc: alexander.deucher, timur.kristof, amd-gfx, bas

Am 16.09.22 um 20:01 schrieb Alex Deucher:
> On Fri, Sep 16, 2022 at 5:09 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> An offset of zero is valid, check if the BO is present or not.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
> Since this is a bug fix, should we make this the first patch in the series?

Good point, going to re-arrange this before pushing.

Christian.

>
> Alex
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 5c37dde97ff2..265ed2118a80 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -313,7 +313,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
>>
>>          ring = to_amdgpu_ring(entity->rq->sched);
>>          /* MM engine doesn't support user fences */
>> -       if (p->job->uf_addr && ring->funcs->no_user_fence)
>> +       if (p->uf_entry.tv.bo && ring->funcs->no_user_fence)
>>                  return -EINVAL;
>>
>>          if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
>> --
>> 2.25.1
>>


^ permalink raw reply	[flat|nested] 13+ messages in thread
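
A note on the fix above: the user fence reaches the kernel as an AMDGPU_CHUNK_ID_FENCE chunk, and a fence word placed at byte offset 0 of its buffer object is perfectly legal, which is why the check has to test for the presence of the BO rather than for a non-zero address. A rough sketch of the userspace side (BO creation and error handling omitted; the helper name and handle parameter are illustrative, not taken from the series):

#include <stdint.h>
#include <string.h>
#include <amdgpu_drm.h>	/* amdgpu UAPI header as shipped by libdrm */

/* Fill in a user fence chunk for the CS ioctl.  The fence word lives at
 * byte offset 0 of the fence buffer object, a valid location, so the
 * kernel must check for the presence of the BO instead of testing the
 * resulting GPU address against zero. */
static void fill_user_fence_chunk(struct drm_amdgpu_cs_chunk *chunk,
				  struct drm_amdgpu_cs_chunk_fence *fence,
				  uint32_t fence_bo_handle)
{
	memset(fence, 0, sizeof(*fence));
	fence->handle = fence_bo_handle;	/* GEM handle of the fence BO */
	fence->offset = 0;			/* offset 0 is valid */

	chunk->chunk_id   = AMDGPU_CHUNK_ID_FENCE;
	chunk->length_dw  = sizeof(*fence) / 4;
	chunk->chunk_data = (uintptr_t)fence;
}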

* Re: [PATCH 8/8] drm/amdgpu: add gang submit frontend v5
  2022-09-16  9:08 ` [PATCH 8/8] drm/amdgpu: add gang submit frontend v5 Christian König
@ 2022-09-16 19:21   ` Alex Deucher
  0 siblings, 0 replies; 13+ messages in thread
From: Alex Deucher @ 2022-09-16 19:21 UTC (permalink / raw)
  To: Christian König
  Cc: alexander.deucher, timur.kristof, Christian König, amd-gfx, bas

On Fri, Sep 16, 2022 at 5:09 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Allows submitting jobs as gang which needs to run on multiple engines at the
> same time.
>
> All members of the gang get the same implicit, explicit and VM dependencies. So
> no gang member will start running until everything else is ready.
>
> The last job is considered the gang leader (usually a submission to the GFX
> ring) and used for signaling output dependencies.
>
> Each job is remembered individually as user of a buffer object, so there is no
> joining of work at the end.
>
> v2: rebase and fix review comments from Andrey and Yogesh
> v3: use READ instead of BOOKKEEP for now because of VM unmaps, set gang
>     leader only when necessary
> v4: fix order of pushing jobs and adding fences found by Trigger.
> v5: fix job index calculation and adding IBs to jobs
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 269 ++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h     |  10 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  |  12 +-
>  5 files changed, 195 insertions(+), 99 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 4f5bd96000ec..c7b1a2dfde13 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -686,6 +686,7 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
>         ib->length_dw = ib_len;
>         /* This works for NO_HWS. TODO: need to handle without knowing VMID */
>         job->vmid = vmid;
> +       job->num_ibs = 1;
>
>         ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 58088c663125..964052377991 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -64,11 +64,11 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
>         return 0;
>  }
>
> -static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
> -                          struct drm_amdgpu_cs_chunk_ib *chunk_ib,
> -                          unsigned int *num_ibs)
> +static int amdgpu_cs_job_idx(struct amdgpu_cs_parser *p,
> +                            struct drm_amdgpu_cs_chunk_ib *chunk_ib)
>  {
>         struct drm_sched_entity *entity;
> +       unsigned int i;
>         int r;
>
>         r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
> @@ -77,17 +77,38 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
>         if (r)
>                 return r;
>
> -       /* Abort if there is no run queue associated with this entity.
> -        * Possibly because of disabled HW IP*/
> +       /*
> +        * Abort if there is no run queue associated with this entity.
> +        * Possibly because of disabled HW IP.
> +        */
>         if (entity->rq == NULL)
>                 return -EINVAL;
>
> -       /* Currently we don't support submitting to multiple entities */
> -       if (p->entity && p->entity != entity)
> +       /* Check if we can add this IB to some existing job */
> +       for (i = 0; i < p->gang_size; ++i)
> +               if (p->entities[i] == entity)
> +                       return i;
> +
> +       /* If not increase the gang size if possible */
> +       if (i == AMDGPU_CS_GANG_SIZE)
>                 return -EINVAL;
>
> -       p->entity = entity;
> -       ++(*num_ibs);
> +       p->entities[i] = entity;
> +       p->gang_size = i + 1;
> +       return i;
> +}
> +
> +static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
> +                          struct drm_amdgpu_cs_chunk_ib *chunk_ib,
> +                          unsigned int *num_ibs)
> +{
> +       int r;
> +
> +       r = amdgpu_cs_job_idx(p, chunk_ib);
> +       if (r < 0)
> +               return r;
> +
> +       ++(num_ibs[r]);
>         return 0;
>  }
>
> @@ -161,11 +182,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>                            union drm_amdgpu_cs *cs)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> +       unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
>         struct amdgpu_vm *vm = &fpriv->vm;
>         uint64_t *chunk_array_user;
>         uint64_t *chunk_array;
> -       unsigned size, num_ibs = 0;
>         uint32_t uf_offset = 0;
> +       unsigned int size;
>         int ret;
>         int i;
>
> @@ -228,7 +250,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>                         if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
>                                 goto free_partial_kdata;
>
> -                       ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs);
> +                       ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
>                         if (ret)
>                                 goto free_partial_kdata;
>                         break;
> @@ -265,21 +287,28 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>                 }
>         }
>
> -       ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
> -       if (ret)
> -               goto free_all_kdata;
> +       if (!p->gang_size)
> +               return -EINVAL;
>
> -       ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm);
> -       if (ret)
> -               goto free_all_kdata;
> +       for (i = 0; i < p->gang_size; ++i) {
> +               ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm);
> +               if (ret)
> +                       goto free_all_kdata;
>
> -       if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
> +               ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i],
> +                                        &fpriv->vm);
> +               if (ret)
> +                       goto free_all_kdata;
> +       }
> +       p->gang_leader = p->jobs[p->gang_size - 1];
> +
> +       if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
>                 ret = -ECANCELED;
>                 goto free_all_kdata;
>         }
>
>         if (p->uf_entry.tv.bo)
> -               p->job->uf_addr = uf_offset;
> +               p->gang_leader->uf_addr = uf_offset;
>         kvfree(chunk_array);
>
>         /* Use this opportunity to fill in task info for the vm */
> @@ -303,17 +332,25 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>
>  static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
>                            struct amdgpu_cs_chunk *chunk,
> -                          unsigned int *num_ibs,
>                            unsigned int *ce_preempt,
>                            unsigned int *de_preempt)
>  {
>         struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> -       struct amdgpu_ring *ring = amdgpu_job_ring(p->job);
> -       struct amdgpu_ib *ib = &p->job->ibs[*num_ibs];
>         struct amdgpu_vm *vm = &fpriv->vm;
> +       struct amdgpu_ring *ring;
> +       struct amdgpu_job *job;
> +       struct amdgpu_ib *ib;
>         int r;
>
> +       r = amdgpu_cs_job_idx(p, chunk_ib);
> +       if (r < 0)
> +               return r;
> +
> +       job = p->jobs[r];
> +       ring = amdgpu_job_ring(job);
> +       ib = &job->ibs[job->num_ibs++];
> +
>         /* MM engine doesn't support user fences */
>         if (p->uf_entry.tv.bo && ring->funcs->no_user_fence)
>                 return -EINVAL;
> @@ -332,7 +369,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
>         }
>
>         if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
> -               p->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
> +               job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
>
>         r =  amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
>                            chunk_ib->ib_bytes : 0,
> @@ -345,8 +382,6 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
>         ib->gpu_addr = chunk_ib->va_start;
>         ib->length_dw = chunk_ib->ib_bytes / 4;
>         ib->flags = chunk_ib->flags;
> -
> -       (*num_ibs)++;
>         return 0;
>  }
>
> @@ -395,7 +430,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
>                         dma_fence_put(old);
>                 }
>
> -               r = amdgpu_sync_fence(&p->job->sync, fence);
> +               r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
>                 dma_fence_put(fence);
>                 if (r)
>                         return r;
> @@ -417,7 +452,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
>                 return r;
>         }
>
> -       r = amdgpu_sync_fence(&p->job->sync, fence);
> +       r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
>         dma_fence_put(fence);
>
>         return r;
> @@ -540,7 +575,7 @@ static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
>
>  static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
>  {
> -       unsigned int num_ibs = 0, ce_preempt = 0, de_preempt = 0;
> +       unsigned int ce_preempt = 0, de_preempt = 0;
>         int i, r;
>
>         for (i = 0; i < p->nchunks; ++i) {
> @@ -550,8 +585,7 @@ static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
>
>                 switch (chunk->chunk_id) {
>                 case AMDGPU_CHUNK_ID_IB:
> -                       r = amdgpu_cs_p2_ib(p, chunk, &num_ibs,
> -                                           &ce_preempt, &de_preempt);
> +                       r = amdgpu_cs_p2_ib(p, chunk, &ce_preempt, &de_preempt);
>                         if (r)
>                                 return r;
>                         break;
> @@ -822,6 +856,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>         struct amdgpu_vm *vm = &fpriv->vm;
>         struct amdgpu_bo_list_entry *e;
>         struct list_head duplicates;
> +       unsigned int i;
>         int r;
>
>         INIT_LIST_HEAD(&p->validated);
> @@ -905,16 +940,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>                 e->bo_va = amdgpu_vm_bo_find(vm, bo);
>         }
>
> -       /* Move fence waiting after getting reservation lock of
> -        * PD root. Then there is no need on a ctx mutex lock.
> -        */
> -       r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
> -       if (unlikely(r != 0)) {
> -               if (r != -ERESTARTSYS)
> -                       DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> -               goto error_validate;
> -       }
> -
>         amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
>                                           &p->bytes_moved_vis_threshold);
>         p->bytes_moved = 0;
> @@ -942,13 +967,16 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>                 if (r)
>                         goto error_validate;
>
> -               p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
> +               p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(uf);
>         }
>
>         amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>                                      p->bytes_moved_vis);
> -       amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
> -                                p->bo_list->gws_obj, p->bo_list->oa_obj);
> +
> +       for (i = 0; i < p->gang_size; ++i)
> +               amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
> +                                        p->bo_list->gws_obj,
> +                                        p->bo_list->oa_obj);
>         return 0;
>
>  error_validate:
> @@ -967,20 +995,24 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>         return r;
>  }
>
> -static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
> +static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
>  {
> -       int i;
> +       int i, j;
>
>         if (!trace_amdgpu_cs_enabled())
>                 return;
>
> -       for (i = 0; i < parser->job->num_ibs; i++)
> -               trace_amdgpu_cs(parser, i);
> +       for (i = 0; i < p->gang_size; ++i) {
> +               struct amdgpu_job *job = p->jobs[i];
> +
> +               for (j = 0; j < job->num_ibs; ++j)
> +                       trace_amdgpu_cs(p, job, &job->ibs[j]);
> +       }
>  }
>
> -static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
> +static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p,
> +                              struct amdgpu_job *job)
>  {
> -       struct amdgpu_job *job = p->job;
>         struct amdgpu_ring *ring = amdgpu_job_ring(job);
>         unsigned int i;
>         int r;
> @@ -1021,12 +1053,12 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
>                         memcpy(ib->ptr, kptr, ib->length_dw * 4);
>                         amdgpu_bo_kunmap(aobj);
>
> -                       r = amdgpu_ring_parse_cs(ring, p, p->job, ib);
> +                       r = amdgpu_ring_parse_cs(ring, p, job, ib);
>                         if (r)
>                                 return r;
>                 } else {
>                         ib->ptr = (uint32_t *)kptr;
> -                       r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib);
> +                       r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib);
>                         amdgpu_bo_kunmap(aobj);
>                         if (r)
>                                 return r;
> @@ -1036,19 +1068,31 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
>         return 0;
>  }
>
> +static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p)
> +{
> +       unsigned int i;
> +       int r;
> +
> +       for (i = 0; i < p->gang_size; ++i) {
> +               r = amdgpu_cs_patch_ibs(p, p->jobs[i]);
> +               if (r)
> +                       return r;
> +       }
> +       return 0;
> +}
> +
>  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> +       struct amdgpu_job *job = p->gang_leader;
>         struct amdgpu_device *adev = p->adev;
>         struct amdgpu_vm *vm = &fpriv->vm;
>         struct amdgpu_bo_list_entry *e;
>         struct amdgpu_bo_va *bo_va;
>         struct amdgpu_bo *bo;
> +       unsigned int i;
>         int r;
>
> -       if (!p->job->vm)
> -               return 0;
> -
>         r = amdgpu_vm_clear_freed(adev, vm, NULL);
>         if (r)
>                 return r;
> @@ -1057,7 +1101,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>         if (r)
>                 return r;
>
> -       r = amdgpu_sync_fence(&p->job->sync, fpriv->prt_va->last_pt_update);
> +       r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
>         if (r)
>                 return r;
>
> @@ -1068,7 +1112,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>                 if (r)
>                         return r;
>
> -               r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
> +               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
>                 if (r)
>                         return r;
>         }
> @@ -1087,7 +1131,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>                 if (r)
>                         return r;
>
> -               r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
> +               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
>                 if (r)
>                         return r;
>         }
> @@ -1100,11 +1144,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>         if (r)
>                 return r;
>
> -       r = amdgpu_sync_fence(&p->job->sync, vm->last_update);
> +       r = amdgpu_sync_fence(&job->sync, vm->last_update);
>         if (r)
>                 return r;
>
> -       p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
> +       for (i = 0; i < p->gang_size; ++i) {
> +               job = p->jobs[i];
> +
> +               if (!job->vm)
> +                       continue;
> +
> +               job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
> +       }
>
>         if (amdgpu_vm_debug) {
>                 /* Invalidate all BOs to test for userspace bugs */
> @@ -1125,7 +1176,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> +       struct amdgpu_job *leader = p->gang_leader;
>         struct amdgpu_bo_list_entry *e;
> +       unsigned int i;
>         int r;
>
>         list_for_each_entry(e, &p->validated, tv.head) {
> @@ -1135,12 +1188,23 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>
>                 sync_mode = amdgpu_bo_explicit_sync(bo) ?
>                         AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
> -               r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode,
> +               r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
>                                      &fpriv->vm);
>                 if (r)
>                         return r;
>         }
> -       return 0;
> +
> +       for (i = 0; i < p->gang_size - 1; ++i) {
> +               r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
> +               if (r)
> +                       return r;
> +       }
> +
> +       r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_size - 1]);
> +       if (r && r != -ERESTARTSYS)
> +               DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> +
> +       return r;
>  }
>
>  static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
> @@ -1164,16 +1228,28 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>                             union drm_amdgpu_cs *cs)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> -       struct drm_sched_entity *entity = p->entity;
> +       struct amdgpu_job *leader = p->gang_leader;
>         struct amdgpu_bo_list_entry *e;
> -       struct amdgpu_job *job;
> +       unsigned int i;
>         uint64_t seq;
>         int r;
>
> -       job = p->job;
> -       p->job = NULL;
> +       for (i = 0; i < p->gang_size; ++i)
> +               drm_sched_job_arm(&p->jobs[i]->base);
>
> -       drm_sched_job_arm(&job->base);
> +       for (i = 0; i < (p->gang_size - 1); ++i) {
> +               struct dma_fence *fence;
> +
> +               fence = &p->jobs[i]->base.s_fence->scheduled;
> +               r = amdgpu_sync_fence(&leader->sync, fence);
> +               if (r)
> +                       goto error_cleanup;
> +       }
> +
> +       if (p->gang_size > 1) {
> +               for (i = 0; i < p->gang_size; ++i)
> +                       amdgpu_job_set_gang_leader(p->jobs[i], leader);
> +       }
>
>         /* No memory allocation is allowed while holding the notifier lock.
>          * The lock is held until amdgpu_cs_submit is finished and fence is
> @@ -1191,45 +1267,57 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>         }
>         if (r) {
>                 r = -EAGAIN;
> -               goto error_abort;
> +               goto error_unlock;
>         }
>
> -       p->fence = dma_fence_get(&job->base.s_fence->finished);
> +       p->fence = dma_fence_get(&leader->base.s_fence->finished);
> +       list_for_each_entry(e, &p->validated, tv.head) {
> +
> +               /* Everybody except for the gang leader uses READ */
> +               for (i = 0; i < (p->gang_size - 1); ++i) {
> +                       dma_resv_add_fence(e->tv.bo->base.resv,
> +                                          &p->jobs[i]->base.s_fence->finished,
> +                                          DMA_RESV_USAGE_READ);
> +               }
> +
> +               /* The gang leader as remembered as writer */

typo:
The gang leader IS remembered as writer

> +               e->tv.num_shared = 0;
> +       }
>
> -       seq = amdgpu_ctx_add_fence(p->ctx, entity, p->fence);
> +       seq = amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_size - 1],
> +                                  p->fence);
>         amdgpu_cs_post_dependencies(p);
>
> -       if ((job->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
> +       if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
>             !p->ctx->preamble_presented) {
> -               job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
> +               leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
>                 p->ctx->preamble_presented = true;
>         }
>
>         cs->out.handle = seq;
> -       job->uf_sequence = seq;
> +       leader->uf_sequence = seq;
>
> -       amdgpu_job_free_resources(job);
> -
> -       trace_amdgpu_cs_ioctl(job);
>         amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->ticket);
> -       drm_sched_entity_push_job(&job->base);
> +       for (i = 0; i < p->gang_size; ++i) {
> +               amdgpu_job_free_resources(p->jobs[i]);
> +               trace_amdgpu_cs_ioctl(p->jobs[i]);
> +               drm_sched_entity_push_job(&p->jobs[i]->base);
> +               p->jobs[i] = NULL;
> +       }
>
>         amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
> -
> -       /* Make sure all BOs are remembered as writers */
> -       amdgpu_bo_list_for_each_entry(e, p->bo_list)
> -               e->tv.num_shared = 0;
> -
>         ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
> +
>         mutex_unlock(&p->adev->notifier_lock);
>         mutex_unlock(&p->bo_list->bo_list_mutex);
> -
>         return 0;
>
> -error_abort:
> -       drm_sched_job_cleanup(&job->base);
> +error_unlock:
>         mutex_unlock(&p->adev->notifier_lock);
> -       amdgpu_job_free(job);
> +
> +error_cleanup:
> +       for (i = 0; i < p->gang_size; ++i)
> +               drm_sched_job_cleanup(&p->jobs[i]->base);
>         return r;
>  }
>
> @@ -1246,17 +1334,18 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
>
>         dma_fence_put(parser->fence);
>
> -       if (parser->ctx) {
> +       if (parser->ctx)
>                 amdgpu_ctx_put(parser->ctx);
> -       }
>         if (parser->bo_list)
>                 amdgpu_bo_list_put(parser->bo_list);
>
>         for (i = 0; i < parser->nchunks; i++)
>                 kvfree(parser->chunks[i].kdata);
>         kvfree(parser->chunks);
> -       if (parser->job)
> -               amdgpu_job_free(parser->job);
> +       for (i = 0; i < parser->gang_size; ++i) {
> +               if (parser->jobs[i])
> +                       amdgpu_job_free(parser->jobs[i]);
> +       }
>         if (parser->uf_entry.tv.bo) {
>                 struct amdgpu_bo *uf = ttm_to_amdgpu_bo(parser->uf_entry.tv.bo);
>
> @@ -1300,7 +1389,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>                 goto error_fini;
>         }
>
> -       r = amdgpu_cs_patch_ibs(&parser);
> +       r = amdgpu_cs_patch_jobs(&parser);
>         if (r)
>                 goto error_backoff;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> index 30ecc4917f81..cbaa19b2b8a3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> @@ -27,6 +27,8 @@
>  #include "amdgpu_bo_list.h"
>  #include "amdgpu_ring.h"
>
> +#define AMDGPU_CS_GANG_SIZE    4
> +
>  struct amdgpu_bo_va_mapping;
>
>  struct amdgpu_cs_chunk {
> @@ -50,9 +52,11 @@ struct amdgpu_cs_parser {
>         unsigned                nchunks;
>         struct amdgpu_cs_chunk  *chunks;
>
> -       /* scheduler job object */
> -       struct amdgpu_job       *job;
> -       struct drm_sched_entity *entity;
> +       /* scheduler job objects */
> +       unsigned int            gang_size;
> +       struct drm_sched_entity *entities[AMDGPU_CS_GANG_SIZE];
> +       struct amdgpu_job       *jobs[AMDGPU_CS_GANG_SIZE];
> +       struct amdgpu_job       *gang_leader;
>
>         /* buffer objects */
>         struct ww_acquire_ctx           ticket;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index cfbe19cfe9af..46c99331d7f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -105,7 +105,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>          */
>         (*job)->base.sched = &adev->rings[0]->sched;
>         (*job)->vm = vm;
> -       (*job)->num_ibs = num_ibs;
>
>         amdgpu_sync_create(&(*job)->sync);
>         amdgpu_sync_create(&(*job)->sched_sync);
> @@ -125,6 +124,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
>         if (r)
>                 return r;
>
> +       (*job)->num_ibs = 1;
>         r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
>         if (r)
>                 kfree(*job);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> index 06dfcf297a8d..5e6ddc7e101c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> @@ -140,8 +140,10 @@ TRACE_EVENT(amdgpu_bo_create,
>  );
>
>  TRACE_EVENT(amdgpu_cs,
> -           TP_PROTO(struct amdgpu_cs_parser *p, int i),
> -           TP_ARGS(p, i),
> +           TP_PROTO(struct amdgpu_cs_parser *p,
> +                    struct amdgpu_job *job,
> +                    struct amdgpu_ib *ib),
> +           TP_ARGS(p, job, ib),
>             TP_STRUCT__entry(
>                              __field(struct amdgpu_bo_list *, bo_list)
>                              __field(u32, ring)
> @@ -151,10 +153,10 @@ TRACE_EVENT(amdgpu_cs,
>
>             TP_fast_assign(
>                            __entry->bo_list = p->bo_list;
> -                          __entry->ring = to_amdgpu_ring(p->entity->rq->sched)->idx;
> -                          __entry->dw = p->job->ibs[i].length_dw;
> +                          __entry->ring = to_amdgpu_ring(job->base.sched)->idx;
> +                          __entry->dw = ib->length_dw;
>                            __entry->fences = amdgpu_fence_count_emitted(
> -                               to_amdgpu_ring(p->entity->rq->sched));
> +                               to_amdgpu_ring(job->base.sched));
>                            ),
>             TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
>                       __entry->bo_list, __entry->ring, __entry->dw,
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 13+ messages in thread
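
To make the frontend change concrete from the userspace side: a gang is simply a single CS ioctl whose IB chunks resolve to more than one scheduler entity. A rough sketch, assuming a valid context, BO list and already mapped IBs; the function, handles, addresses and sizes are placeholders and error handling is omitted:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <amdgpu_drm.h>	/* amdgpu UAPI header as shipped by libdrm */

/* Submit a two member gang in a single CS ioctl: one COMPUTE IB and one
 * GFX IB.  The kernel groups the IB chunks by entity, so this becomes
 * two jobs that are scheduled to run together. */
static int submit_gang(int fd, uint32_t ctx_id, uint32_t bo_list_handle,
		       uint64_t comp_va, uint32_t comp_bytes,
		       uint64_t gfx_va, uint32_t gfx_bytes)
{
	struct drm_amdgpu_cs_chunk_ib ibs[2];
	struct drm_amdgpu_cs_chunk chunks[2];
	uint64_t chunk_ptrs[2];
	union drm_amdgpu_cs cs;
	unsigned int i;

	memset(ibs, 0, sizeof(ibs));
	ibs[0].ip_type  = AMDGPU_HW_IP_COMPUTE;
	ibs[0].va_start = comp_va;
	ibs[0].ib_bytes = comp_bytes;
	ibs[1].ip_type  = AMDGPU_HW_IP_GFX;
	ibs[1].va_start = gfx_va;
	ibs[1].ib_bytes = gfx_bytes;

	for (i = 0; i < 2; i++) {
		chunks[i].chunk_id   = AMDGPU_CHUNK_ID_IB;
		chunks[i].length_dw  = sizeof(ibs[i]) / 4;
		chunks[i].chunk_data = (uintptr_t)&ibs[i];
		/* cs.in.chunks points to an array of pointers to chunks */
		chunk_ptrs[i] = (uintptr_t)&chunks[i];
	}

	memset(&cs, 0, sizeof(cs));
	cs.in.ctx_id         = ctx_id;
	cs.in.bo_list_handle = bo_list_handle;
	cs.in.num_chunks     = 2;
	cs.in.chunks         = (uintptr_t)chunk_ptrs;

	return ioctl(fd, DRM_IOCTL_AMDGPU_CS, &cs);
}

With this ordering the GFX IB is the last entity to appear, so its job ends up as the gang leader that signals the output dependencies, matching the "usually a submission to the GFX ring" note in the commit message.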

* Re: Final gang submit patches
  2022-09-16  9:08 Final gang submit patches Christian König
                   ` (7 preceding siblings ...)
  2022-09-16  9:08 ` [PATCH 8/8] drm/amdgpu: add gang submit frontend v5 Christian König
@ 2022-09-16 19:22 ` Alex Deucher
  8 siblings, 0 replies; 13+ messages in thread
From: Alex Deucher @ 2022-09-16 19:22 UTC (permalink / raw)
  To: Christian König; +Cc: alexander.deucher, timur.kristof, amd-gfx, bas

On Fri, Sep 16, 2022 at 5:08 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Hey guys,
>
> so thanks to Ruijing I was finally able to hammer out all the known VCN
> regressions from this patch set.
>
> Alex can you review those? AFAIK you are pretty much the only other
> person deep enough in the CS IOCTL for that.

These look good to me.  Just a couple of comments on some of the
patches.  We probably also want a patch to bump the driver version so
userspace knows when this is available.  With those addressed, the
series is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

>
> Going to takle the userptr issue Bas stumbled over next.
>
> Thanks,
> Christian.
>
>

^ permalink raw reply	[flat|nested] 13+ messages in thread
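
On the driver version bump mentioned above: userspace would typically gate the gang submit path on the DRM version reported by the kernel. A minimal sketch using libdrm's drmGetVersion(); the minor number below is a placeholder, the real value would come from the separate version bump patch rather than from anything in this series:

#include <stdbool.h>
#include <xf86drm.h>	/* libdrm: drmGetVersion()/drmFreeVersion() */

/* Placeholder for the first driver minor version that accepts gang
 * submits; the real number comes from the version bump patch. */
#define GANG_SUBMIT_MINOR	49

static bool amdgpu_has_gang_submit(int fd)
{
	drmVersionPtr ver = drmGetVersion(fd);
	bool ok;

	if (!ver)
		return false;

	ok = ver->version_major > 3 ||
	     (ver->version_major == 3 && ver->version_minor >= GANG_SUBMIT_MINOR);

	drmFreeVersion(ver);
	return ok;
}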

end of thread, other threads:[~2022-09-16 19:22 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-16  9:08 Final gang submit patches Christian König
2022-09-16  9:08 ` [PATCH 1/8] drm/amdgpu: cleanup CS pass2 v5 Christian König
2022-09-16  9:08 ` [PATCH 2/8] drm/amdgpu: cleanup error handling in amdgpu_cs_parser_bos Christian König
2022-09-16  9:08 ` [PATCH 3/8] drm/amdgpu: fix user fence CS check Christian König
2022-09-16 18:01   ` Alex Deucher
2022-09-16 18:24     ` Christian König
2022-09-16  9:08 ` [PATCH 4/8] drm/amdgpu: move entity selection and job init earlier during CS Christian König
2022-09-16  9:08 ` [PATCH 5/8] drm/amdgpu: revert "fix limiting AV1 to the first instance on VCN3" v3 Christian König
2022-09-16  9:08 ` [PATCH 6/8] drm/amdgpu: cleanup instance limit on VCN4 v3 Christian König
2022-09-16  9:08 ` [PATCH 7/8] drm/amdgpu: add gang submit backend v2 Christian König
2022-09-16  9:08 ` [PATCH 8/8] drm/amdgpu: add gang submit frontend v5 Christian König
2022-09-16 19:21   ` Alex Deucher
2022-09-16 19:22 ` Final gang submit patches Alex Deucher
