From: Matthew Brost <matthew.brost@intel.com>
To: <intel-gfx@lists.freedesktop.org>, <dri-devel@lists.freedesktop.org>
Subject: [RFC PATCH 23/42] drm/i915/guc: Implement multi-lrc submission
Date: Tue, 20 Jul 2021 13:57:43 -0700
Message-ID: <20210720205802.39610-24-matthew.brost@intel.com>
In-Reply-To: <20210720205802.39610-1-matthew.brost@intel.com>

Implement multi-lrc submission via a single workqueue entry and a
single H2G. The workqueue entry contains an updated tail value for
each request, one per context in the multi-lrc submission, so all of
the ring tails are updated simultaneously. As such, the tasklet and
bypass path have been updated to coalesce requests into a single
submission.
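
For reference, the work queue item built by __guc_wq_item_append()
below has the following layout (a sketch for illustration; the dword
names are informal and not part of the GuC interface definitions):

	/*
	 * Multi-lrc WQ item, (ce->guc_number_children + 4) dwords total:
	 *
	 * dw0:  WQ_TYPE_MULTI_LRC | ((wqi_size / sizeof(u32) - 1) << WQ_LEN_SHIFT)
	 * dw1:  lrca of the parent context (ce->lrc.lrca)
	 * dw2:  (guc_id << WQ_GUC_ID_SHIFT) |
	 *       ((parent ring tail in qwords) << WQ_RING_TAIL_SHIFT)
	 * dw3:  fence_id (always zero for now)
	 * dw4+: ring tail in qwords, one dword per child context
	 */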

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   6 +-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 224 ++++++++++++++++--
 drivers/gpu/drm/i915/i915_request.h           |   8 +
 3 files changed, 222 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index 86fb90a4bcfa..820cb6b5d2d0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -64,12 +64,14 @@
 #define   WQ_TYPE_PSEUDO		(0x2 << WQ_TYPE_SHIFT)
 #define   WQ_TYPE_INORDER		(0x3 << WQ_TYPE_SHIFT)
 #define   WQ_TYPE_NOOP			(0x4 << WQ_TYPE_SHIFT)
-#define WQ_TARGET_SHIFT			10
+#define   WQ_TYPE_MULTI_LRC		(0x5 << WQ_TYPE_SHIFT)
+#define WQ_TARGET_SHIFT			8
 #define WQ_LEN_SHIFT			16
 #define WQ_NO_WCFLUSH_WAIT		(1 << 27)
 #define WQ_PRESENT_WORKLOAD		(1 << 28)
 
-#define WQ_RING_TAIL_SHIFT		20
+#define WQ_GUC_ID_SHIFT			0
+#define WQ_RING_TAIL_SHIFT		18
 #define WQ_RING_TAIL_MAX		0x7FF	/* 2^11 QWords */
 #define WQ_RING_TAIL_MASK		(WQ_RING_TAIL_MAX << WQ_RING_TAIL_SHIFT)
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 429473b4d46c..29a7616d3bcf 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -490,6 +490,29 @@ __get_process_desc(struct intel_context *ce)
 		   LRC_STATE_OFFSET) / sizeof(u32)));
 }
 
+static inline u32 *get_wq_pointer(struct guc_process_desc *desc,
+				  struct intel_context *ce,
+				  u32 wqi_size)
+{
+	/*
+	 * Check for space in the work queue. Cache a value of the head pointer
+	 * in the intel_context structure in order to reduce the number of
+	 * accesses to shared GPU memory, which may be across a PCIe bus.
+	 */
+#define AVAILABLE_SPACE	\
+	CIRC_SPACE(ce->guc_wqi_tail, ce->guc_wqi_head, GUC_WQ_SIZE)
+	if (AVAILABLE_SPACE < wqi_size) {
+		ce->guc_wqi_head = READ_ONCE(desc->head);
+
+		if (AVAILABLE_SPACE < wqi_size)
+			return NULL;
+	}
+#undef AVAILABLE_SPACE
+
+	return ((u32 *)__get_process_desc(ce)) +
+		((WQ_OFFSET + ce->guc_wqi_tail) / sizeof(u32));
+}
+
 static u32 __get_lrc_desc_offset(struct intel_guc *guc, int index)
 {
 	GEM_BUG_ON(index >= guc->lrcd_reg.max_idx);
@@ -640,7 +663,7 @@ static inline bool request_has_no_guc_id(struct i915_request *rq)
 static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 {
 	int err = 0;
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
 	u32 action[3];
 	int len = 0;
 	u32 g2h_len_dw = 0;
@@ -690,6 +713,18 @@ static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 		trace_intel_context_sched_enable(ce);
 		atomic_inc(&guc->outstanding_submission_g2h);
 		set_context_enabled(ce);
+
+		/*
+		 * Without multi-lrc, the KMD does the submission step (moving
+		 * the lrc tail), so enabling scheduling is sufficient to
+		 * submit the context. With multi-lrc submission the GuC
+		 * needs to move the tails instead, hence the need for another
+		 * H2G to submit a multi-lrc context after enabling scheduling.
+		 */
+		if (intel_context_is_parent(ce)) {
+			action[0] = INTEL_GUC_ACTION_SCHED_CONTEXT;
+			err = intel_guc_send_nb(guc, action, len - 1, 0);
+		}
 	} else if (!enabled) {
 		clr_context_pending_enable(ce);
 		intel_context_put(ce);
@@ -764,7 +799,6 @@ static int tasklet_register_context(struct guc_submit_engine *gse,
 	return ret;
 }
 
-
 static inline void guc_set_lrc_tail(struct i915_request *rq)
 {
 	rq->context->lrc_reg_state[CTX_RING_TAIL] =
@@ -776,6 +810,131 @@ static inline int rq_prio(const struct i915_request *rq)
 	return rq->sched.attr.priority;
 }
 
+static inline bool is_multi_lrc_rq(struct i915_request *rq)
+{
+	return intel_context_is_child(rq->context) ||
+		intel_context_is_parent(rq->context);
+}
+
+/*
+ * Multi-lrc requests are not submitted to the GuC until all requests in
+ * the set are ready. With the exception of the last request in the set,
+ * submitting a multi-lrc request is therefore just a status update on
+ * the driver side and can be safely merged with other requests. When the
+ * last multi-lrc request in a set is detected, we break out of the
+ * submission loop and submit the whole set; thus we never attempt to
+ * merge that one with other requests.
+ */
+static inline bool can_merge_rq(struct i915_request *rq,
+				struct i915_request *last)
+{
+	return is_multi_lrc_rq(last) || rq->context == last->context;
+}
+
+static inline u32 wq_space_until_wrap(struct intel_context *ce)
+{
+	return (GUC_WQ_SIZE - ce->guc_wqi_tail);
+}
+
+static inline void write_wqi(struct guc_process_desc *desc,
+			     struct intel_context *ce,
+			     u32 wqi_size)
+{
+	ce->guc_wqi_tail = (ce->guc_wqi_tail + wqi_size) & (GUC_WQ_SIZE - 1);
+	WRITE_ONCE(desc->tail, ce->guc_wqi_tail);
+}
+
+static inline int guc_wq_noop_append(struct intel_context *ce)
+{
+	struct guc_process_desc *desc = __get_process_desc(ce);
+	u32 *wqi = get_wq_pointer(desc, ce, wq_space_until_wrap(ce));
+
+	if (!wqi)
+		return -EBUSY;
+
+	*wqi = WQ_TYPE_NOOP |
+		((wq_space_until_wrap(ce) / sizeof(u32) - 1) << WQ_LEN_SHIFT);
+	ce->guc_wqi_tail = 0;
+
+	return 0;
+}
+
+static int __guc_wq_item_append(struct i915_request *rq)
+{
+	struct intel_context *ce = request_to_scheduling_context(rq);
+	struct intel_context *child;
+	struct guc_process_desc *desc = __get_process_desc(ce);
+	unsigned int wqi_size = (ce->guc_number_children + 4) *
+		sizeof(u32);
+	u32 *wqi;
+	int ret;
+
+	/* Ensure context is in correct state before updating work queue */
+	GEM_BUG_ON(ce->guc_num_rq_submit_no_id);
+	GEM_BUG_ON(request_has_no_guc_id(rq));
+	GEM_BUG_ON(!atomic_read(&ce->guc_id_ref));
+	GEM_BUG_ON(context_guc_id_invalid(ce));
+	GEM_BUG_ON(context_pending_disable(ce));
+	GEM_BUG_ON(context_wait_for_deregister_to_register(ce));
+
+	/* Insert NOOP if this work queue item will wrap the tail pointer. */
+	if (wqi_size > wq_space_until_wrap(ce)) {
+		ret = guc_wq_noop_append(ce);
+		if (ret)
+			return ret;
+	}
+
+	wqi = get_wq_pointer(desc, ce, wqi_size);
+	if (!wqi)
+		return -EBUSY;
+
+	*wqi++ = WQ_TYPE_MULTI_LRC |
+		((wqi_size / sizeof(u32) - 1) << WQ_LEN_SHIFT);
+	*wqi++ = ce->lrc.lrca;
+	*wqi++ = (ce->guc_id << WQ_GUC_ID_SHIFT) |
+		 ((ce->ring->tail / sizeof(u64)) << WQ_RING_TAIL_SHIFT);
+	*wqi++ = 0;	/* fence_id */
+	for_each_child(ce, child)
+		*wqi++ = child->ring->tail / sizeof(u64);
+
+	write_wqi(desc, ce, wqi_size);
+
+	return 0;
+}
+
+static int gse_wq_item_append(struct guc_submit_engine *gse,
+			      struct i915_request *rq)
+{
+	struct intel_context *ce = request_to_scheduling_context(rq);
+	int ret = 0;
+
+	if (likely(!intel_context_is_banned(ce))) {
+		ret = __guc_wq_item_append(rq);
+
+		if (unlikely(ret == -EBUSY)) {
+			gse->stalled_rq = rq;
+			gse->submission_stall_reason = STALL_MOVE_LRC_TAIL;
+		}
+	}
+
+	return ret;
+}
+
+static inline bool multi_lrc_submit(struct i915_request *rq)
+{
+	struct intel_context *ce = request_to_scheduling_context(rq);
+	intel_ring_set_tail(rq->ring, rq->tail);
+
+	/*
+	 * We expect the front end (execbuf IOCTL) to set this flag on the last
+	 * request generated from a multi-BB submission. This indicates to the
+	 * backend (GuC interface) that we should submit this context, thus
+	 * submitting all the requests generated in parallel.
+	 */
+	return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, &rq->fence.flags) ||
+		intel_context_is_banned(ce);
+}
+
 static void kick_retire_wq(struct guc_submit_engine *gse)
 {
 	queue_work(system_unbound_wq, &gse->retire_worker);
@@ -819,7 +978,7 @@ static int gse_dequeue_one_context(struct guc_submit_engine *gse)
 			struct i915_request *rq, *rn;
 
 			priolist_for_each_request_consume(rq, rn, p) {
-				if (last && rq->context != last->context)
+				if (last && !can_merge_rq(rq, last))
 					goto done;
 
 				list_del_init(&rq->sched.link);
@@ -828,7 +987,22 @@ static int gse_dequeue_one_context(struct guc_submit_engine *gse)
 
 				trace_i915_request_in(rq, 0);
 				last = rq;
-				submit = true;
+
+				if (is_multi_lrc_rq(rq)) {
+					/*
+					 * We need to coalesce all multi-lrc
+					 * requests in a relationship into a
+					 * single H2G. We are guaranteed that
+					 * all of these requests will be
+					 * submitted sequentially.
+					 */
+					if (multi_lrc_submit(rq)) {
+						submit = true;
+						goto done;
+					}
+				} else {
+					submit = true;
+				}
 			}
 
 			rb_erase_cached(&p->node, &sched_engine->queue);
@@ -838,7 +1012,7 @@ static int gse_dequeue_one_context(struct guc_submit_engine *gse)
 
 done:
 	if (submit) {
-		struct intel_context *ce = last->context;
+		struct intel_context *ce = request_to_scheduling_context(last);
 
 		if (ce->guc_num_rq_submit_no_id) {
 			ret = tasklet_pin_guc_id(gse, last);
@@ -860,7 +1034,17 @@ static int gse_dequeue_one_context(struct guc_submit_engine *gse)
 		}
 
 move_lrc_tail:
-		guc_set_lrc_tail(last);
+		if (is_multi_lrc_rq(last)) {
+			ret = gse_wq_item_append(gse, last);
+			if (ret == -EBUSY)
+				goto schedule_tasklet;
+			else if (ret != 0) {
+				GEM_WARN_ON(ret);	/* Unexpected */
+				goto deadlk;
+			}
+		} else {
+			guc_set_lrc_tail(last);
+		}
 
 add_request:
 		ret = gse_add_request(gse, last);
@@ -1565,14 +1749,22 @@ static bool need_tasklet(struct guc_submit_engine *gse, struct intel_context *ce
 static int gse_bypass_tasklet_submit(struct guc_submit_engine *gse,
 				     struct i915_request *rq)
 {
-	int ret;
+	int ret = 0;
 
 	__i915_request_submit(rq);
 
 	trace_i915_request_in(rq, 0);
 
-	guc_set_lrc_tail(rq);
-	ret = gse_add_request(gse, rq);
+	if (is_multi_lrc_rq(rq)) {
+		if (multi_lrc_submit(rq)) {
+			ret = gse_wq_item_append(gse, rq);
+			if (!ret)
+				ret = gse_add_request(gse, rq);
+		}
+	} else {
+		guc_set_lrc_tail(rq);
+		ret = gse_add_request(gse, rq);
+	}
 
 	if (unlikely(ret == -EPIPE))
 		disable_submission(gse->sched_engine.private_data);
@@ -1589,7 +1781,7 @@ static void guc_submit_request(struct i915_request *rq)
 	/* Will be called from irq-context when using foreign fences. */
 	spin_lock_irqsave(&sched_engine->lock, flags);
 
-	if (need_tasklet(gse, rq->context))
+	if (need_tasklet(gse, request_to_scheduling_context(rq)))
 		queue_request(sched_engine, rq, rq_prio(rq));
 	else if (gse_bypass_tasklet_submit(gse, rq) == -EBUSY)
 		kick_tasklet(gse);
@@ -2957,9 +3149,10 @@ static inline bool new_guc_prio_higher(u8 old_guc_prio, u8 new_guc_prio)
 
 static void add_to_context(struct i915_request *rq)
 {
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
 	u8 new_guc_prio = map_i915_prio_to_guc_prio(rq_prio(rq));
 
+	GEM_BUG_ON(intel_context_is_child(ce));
 	GEM_BUG_ON(rq->guc_prio == GUC_PRIO_FINI);
 
 	spin_lock(&ce->guc_active.lock);
@@ -2993,7 +3186,9 @@ static void guc_prio_fini(struct i915_request *rq, struct intel_context *ce)
 
 static void remove_from_context(struct i915_request *rq)
 {
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
+
+	GEM_BUG_ON(intel_context_is_child(ce));
 
 	spin_lock_irq(&ce->guc_active.lock);
 
@@ -3197,7 +3392,8 @@ static int tasklet_pin_guc_id(struct guc_submit_engine *gse,
 	GEM_BUG_ON(gse->total_num_rq_with_no_guc_id < 0);
 
 	list_for_each_entry_reverse(rq, &ce->guc_active.requests, sched.link)
-		if (request_has_no_guc_id(rq)) {
+		if (request_has_no_guc_id(rq) &&
+		    request_to_scheduling_context(rq) == ce) {
 			--ce->guc_num_rq_submit_no_id;
 			clear_bit(I915_FENCE_FLAG_GUC_ID_NOT_PINNED,
 				  &rq->fence.flags);
@@ -3517,7 +3713,7 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq,
 
 static void guc_retire_inflight_request_prio(struct i915_request *rq)
 {
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
 
 	spin_lock(&ce->guc_active.lock);
 	guc_prio_fini(rq, ce);
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 5f304fd02071..ad3ec638d28b 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -145,6 +145,14 @@ enum {
 	 * tasklet that the guc_id isn't pinned.
 	 */
 	I915_FENCE_FLAG_GUC_ID_NOT_PINNED,
+
+	/*
+	 * I915_FENCE_FLAG_SUBMIT_PARALLEL - request with a context in a
+	 * parent-child relationship (parallel submission, multi-lrc) should
+	 * trigger a submission to the GuC rather than just moving the context
+	 * tail.
+	 */
+	I915_FENCE_FLAG_SUBMIT_PARALLEL,
 };
 
 /**
-- 
2.28.0
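
The space check in get_wq_pointer() above deserves a note: the driver
caches the consumer head in the intel_context and only re-reads the
shared process descriptor (which may sit across a PCIe bus) when the
cached value suggests the queue is full. Below is a minimal standalone
sketch of that pattern; the fake_desc/fake_ctx types, wq_has_space(),
WQ_SIZE, and the locally defined CIRC_SPACE are stand-ins for
illustration, not the driver's actual structures.

	#include <stdint.h>
	#include <stdio.h>

	#define WQ_SIZE 0x1000	/* must be a power of two */

	/* Space left in a circular buffer, per linux/circ_buf.h semantics */
	#define CIRC_SPACE(head, tail, size) \
		(((tail) - ((head) + 1)) & ((size) - 1))

	struct fake_desc { uint32_t head; };	/* shared with the consumer */
	struct fake_ctx {
		uint32_t wqi_head;	/* cached copy of desc->head */
		uint32_t wqi_tail;	/* local producer index */
	};

	/* Return 1 if wqi_size bytes fit, refreshing the cached head on demand */
	static int wq_has_space(struct fake_ctx *ce, struct fake_desc *desc,
				uint32_t wqi_size)
	{
		if (CIRC_SPACE(ce->wqi_tail, ce->wqi_head, WQ_SIZE) < wqi_size) {
			/* Cached head is stale; pay for one shared-memory read */
			ce->wqi_head = desc->head;
			if (CIRC_SPACE(ce->wqi_tail, ce->wqi_head, WQ_SIZE) < wqi_size)
				return 0;
		}
		return 1;
	}

	int main(void)
	{
		struct fake_desc desc = { .head = 0 };
		struct fake_ctx ce = { .wqi_head = 0, .wqi_tail = 0 };

		printf("%d\n", wq_has_space(&ce, &desc, 64));	/* 1: queue empty */
		ce.wqi_tail = WQ_SIZE - 32;			/* nearly full */
		printf("%d\n", wq_has_space(&ce, &desc, 64));	/* 0: no room yet */
		desc.head = 128;				/* consumer progressed */
		printf("%d\n", wq_has_space(&ce, &desc, 64));	/* 1: after refresh */
		return 0;
	}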




Thread overview: 88+ messages
2021-07-20 20:57 [RFC PATCH 00/42] Parallel submission aka multi-bb execbuf Matthew Brost
2021-07-20 20:57 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for " Patchwork
2021-07-20 20:57 ` [RFC PATCH 01/42] drm/i915/guc: GuC submission squashed into single patch Matthew Brost
2021-07-28 12:57   ` kernel test robot
2021-07-20 20:57 ` [RFC PATCH 02/42] drm/i915/guc: Allow flexible number of context ids Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 03/42] drm/i915/guc: Connect the number of guc_ids to debugfs Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 04/42] drm/i915/guc: Don't return -EAGAIN to user when guc_ids exhausted Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 05/42] drm/i915/guc: Don't allow requests not ready to consume all guc_ids Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 06/42] drm/i915/guc: Introduce guc_submit_engine object Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 07/42] drm/i915/guc: Check return of __xa_store when registering a context Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 08/42] drm/i915/guc: Non-static lrc descriptor registration buffer Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 09/42] drm/i915/guc: Take GT PM ref when deregistering context Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 10/42] drm/i915: Add GT PM unpark worker Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 11/42] drm/i915/guc: Take engine PM when a context is pinned with GuC submission Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 12/42] drm/i915/guc: Don't call switch_to_kernel_context " Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 13/42] drm/i915/guc: Selftest for GuC flow control Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 14/42] drm/i915: Add logical engine mapping Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 15/42] drm/i915: Expose logical engine instance to user Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 16/42] drm/i915/guc: Introduce context parent-child relationship Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 17/42] drm/i915/guc: Implement GuC parent-child context pin / unpin functions Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 18/42] drm/i915/guc: Add multi-lrc context registration Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 19/42] drm/i915/guc: Ensure GuC schedule operations do not operate on child contexts Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 20/42] drm/i915/guc: Assign contexts in parent-child relationship consecutive guc_ids Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 21/42] drm/i915/guc: Add hang check to GuC submit engine Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 22/42] drm/i915/guc: Add guc_child_context_destroy Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 23/42] drm/i915/guc: Implement multi-lrc submission Matthew Brost [this message]
2021-07-20 20:57 ` [RFC PATCH 24/42] drm/i915/guc: Insert submit fences between requests in parent-child relationship Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 25/42] drm/i915/guc: Implement multi-lrc reset Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 26/42] drm/i915/guc: Update debugfs for GuC multi-lrc Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 27/42] drm/i915: Connect UAPI to GuC multi-lrc interface Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 28/42] drm/i915/guc: Add basic GuC multi-lrc selftest Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 29/42] drm/i915/guc: Implement BB boundary preemption for multi-lrc Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 30/42] i915/drm: Move secure execbuf check to execbuf2 Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 31/42] drm/i915: Move input/exec fence handling to i915_gem_execbuffer2 Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 32/42] drm/i915: Move output " Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 33/42] drm/i915: Return output fence from i915_gem_do_execbuffer Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 34/42] drm/i915: Store batch index in struct i915_execbuffer Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 35/42] drm/i915: Allow callers of i915_gem_do_execbuffer to override the batch index Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 36/42] drm/i915: Teach execbuf there can be more than one batch in the objects list Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 37/42] drm/i915: Only track object dependencies on first request Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 38/42] drm/i915: Force parallel contexts to use copy engine for reloc Matthew Brost
2021-07-20 20:57 ` [RFC PATCH 39/42] drm/i915: Multi-batch execbuffer2 Matthew Brost
2021-07-20 20:58 ` [RFC PATCH 40/42] drm/i915: Eliminate unnecessary VMA calls for multi-BB submission Matthew Brost
2021-07-20 20:58 ` [RFC PATCH 41/42] drm/i915: Enable multi-bb execbuf Matthew Brost
2021-07-20 20:58 ` [RFC PATCH 42/42] drm/i915/execlists: Parallel submission support for execlists Matthew Brost
