* [PATCH] drm/i915/preemption: Allow preemption between submission ports
@ 2018-02-22 14:11 Chris Wilson
  2018-02-22 14:22 ` Chris Wilson
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Chris Wilson @ 2018-02-22 14:11 UTC (permalink / raw)
  To: intel-gfx

Sometimes we need to boost the priority of an in-flight request, which
may lead to the situation where the second submission port then contains
a higher priority context than the first and so we need to inject a
preemption event. To do so we must always check inside
execlists_dequeue() whether there is a priority inversion between the
ports themselves as well as the head of the priority sorted queue, and we
cannot just skip dequeuing if the queue is empty.
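
As a minimal userspace sketch of the idea (illustration only, not part of the patch; the simplified need_preempt()/submit_queue() below stand in for the driver functions of the same name, and 1023 is just an arbitrary boosted priority): the highest pending priority is kept in a single integer, any priority boost raises it, and a preemption event is injected whenever it beats the request occupying the first port, clamped at the default priority so ordinary requests do not preempt one another.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static int queue_priority = INT_MIN;	/* highest pending priority, INT_MIN when idle */

/*
 * Mirrors the patch's need_preempt(): preempt only when the pending
 * work outranks the request in port[0], clamped at the default
 * priority (0). The driver additionally checks that a preempt
 * context exists.
 */
static bool need_preempt(int port0_prio)
{
	return queue_priority > (port0_prio > 0 ? port0_prio : 0);
}

/*
 * Mirrors submit_queue(): remember the new priority and, in the
 * driver, kick the submission tasklet so execlists_dequeue() runs.
 */
static void submit_queue(int prio)
{
	if (prio > queue_priority)
		queue_priority = prio;
}

int main(void)
{
	submit_queue(0);				/* both ports filled at the default priority */
	printf("preempt? %d\n", need_preempt(0));	/* 0: no inversion */

	submit_queue(1023);				/* the request in port[1] is boosted */
	printf("preempt? %d\n", need_preempt(0));	/* 1: inject a preemption event */

	return 0;
}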

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_engine_cs.c      |   2 +
 drivers/gpu/drm/i915/intel_guc_submission.c |  17 +--
 drivers/gpu/drm/i915/intel_lrc.c            | 161 ++++++++++++++++------------
 drivers/gpu/drm/i915/intel_ringbuffer.h     |   5 +
 4 files changed, 107 insertions(+), 78 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c31544406974..ce7fcf55ba18 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -423,6 +423,7 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
 	BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
 	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
 
+	execlists->queue_priority = INT_MIN;
 	execlists->queue = RB_ROOT;
 	execlists->first = NULL;
 }
@@ -1903,6 +1904,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	spin_lock_irq(&engine->timeline->lock);
 	list_for_each_entry(rq, &engine->timeline->requests, link)
 		print_request(m, rq, "\t\tE ");
+	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
 	for (rb = execlists->first; rb; rb = rb_next(rb)) {
 		struct i915_priolist *p =
 			rb_entry(rb, typeof(*p), node);
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 649113c7a3c2..586dde579903 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -75,6 +75,11 @@
  *
  */
 
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
 static inline bool is_high_priority(struct intel_guc_client *client)
 {
 	return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
@@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_engine_cs *engine)
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
 
-	if (!rb)
-		goto unlock;
-
 	if (port_isset(port)) {
 		if (engine->i915->preempt_context) {
 			struct guc_preempt_work *preempt_work =
 				&engine->i915->guc.preempt_work[engine->id];
 
-			if (rb_entry(rb, struct i915_priolist, node)->priority >
+			if (execlists->queue_priority >
 			    max(port_request(port)->priotree.priority, 0)) {
 				execlists_set_active(execlists,
 						     EXECLISTS_ACTIVE_PREEMPT);
@@ -706,8 +708,8 @@ static void guc_dequeue(struct intel_engine_cs *engine)
 	}
 	GEM_BUG_ON(port_isset(port));
 
-	do {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+	while (rb) {
+		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
@@ -736,8 +738,9 @@ static void guc_dequeue(struct intel_engine_cs *engine)
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	} while (rb);
+	}
 done:
+	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
 	execlists->first = rb;
 	if (submit) {
 		port_assign(port, last);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e781c912f197..8a98da7a5c83 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -169,6 +169,23 @@ static void execlists_init_reg_state(u32 *reg_state,
 				     struct intel_engine_cs *engine,
 				     struct intel_ring *ring);
 
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
+static inline int rq_prio(const struct i915_request *rq)
+{
+	return rq->priotree.priority;
+}
+
+static inline bool need_preempt(const struct intel_engine_cs *engine,
+                                const struct i915_request *last,
+                                int prio)
+{
+	return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
+}
+
 /**
  * intel_lr_context_descriptor_update() - calculate & cache the descriptor
  * 					  descriptor for a pinned context
@@ -224,7 +241,7 @@ lookup_priolist(struct intel_engine_cs *engine,
 	parent = &execlists->queue.rb_node;
 	while (*parent) {
 		rb = *parent;
-		p = rb_entry(rb, typeof(*p), node);
+		p = to_priolist(rb);
 		if (prio > p->priority) {
 			parent = &rb->rb_left;
 		} else if (prio < p->priority) {
@@ -264,7 +281,7 @@ lookup_priolist(struct intel_engine_cs *engine,
 	if (first)
 		execlists->first = &p->node;
 
-	return ptr_pack_bits(p, first, 1);
+	return p;
 }
 
 static void unwind_wa_tail(struct i915_request *rq)
@@ -290,14 +307,10 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		__i915_request_unsubmit(rq);
 		unwind_wa_tail(rq);
 
-		GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
-		if (rq->priotree.priority != last_prio) {
-			p = lookup_priolist(engine,
-					    &rq->priotree,
-					    rq->priotree.priority);
-			p = ptr_mask_bits(p, 1);
-
-			last_prio = rq->priotree.priority;
+		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
+		if (rq_prio(rq) != last_prio) {
+			last_prio = rq_prio(rq);
+			p = lookup_priolist(engine, &rq->priotree, last_prio);
 		}
 
 		list_add(&rq->priotree.link, &p->requests);
@@ -397,10 +410,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
 			desc = execlists_update_context(rq);
 			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
 
-			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
+			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x, prio=%d\n",
 				  engine->name, n,
 				  port[n].context_id, count,
-				  rq->global_seqno);
+				  rq->global_seqno,
+				  rq_prio(rq));
 		} else {
 			GEM_BUG_ON(!n);
 			desc = 0;
@@ -453,12 +467,17 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
 		   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
 				      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
 
+	/*
+	 * Switch to our empty preempt context so
+	 * the state of the GPU is known (idle).
+	 */
 	GEM_TRACE("%s\n", engine->name);
 	for (n = execlists_num_ports(&engine->execlists); --n; )
 		elsp_write(0, engine->execlists.elsp);
 
 	elsp_write(ce->lrc_desc, engine->execlists.elsp);
 	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
+	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
 }
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
@@ -495,8 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	spin_lock_irq(&engine->timeline->lock);
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-	if (!rb)
-		goto unlock;
 
 	if (last) {
 		/*
@@ -519,54 +536,48 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 			goto unlock;
 
-		if (engine->i915->preempt_context &&
-		    rb_entry(rb, struct i915_priolist, node)->priority >
-		    max(last->priotree.priority, 0)) {
-			/*
-			 * Switch to our empty preempt context so
-			 * the state of the GPU is known (idle).
-			 */
+		if (need_preempt(engine, last, execlists->queue_priority)) {
 			inject_preempt_context(engine);
-			execlists_set_active(execlists,
-					     EXECLISTS_ACTIVE_PREEMPT);
 			goto unlock;
-		} else {
-			/*
-			 * In theory, we could coalesce more requests onto
-			 * the second port (the first port is active, with
-			 * no preemptions pending). However, that means we
-			 * then have to deal with the possible lite-restore
-			 * of the second port (as we submit the ELSP, there
-			 * may be a context-switch) but also we may complete
-			 * the resubmission before the context-switch. Ergo,
-			 * coalescing onto the second port will cause a
-			 * preemption event, but we cannot predict whether
-			 * that will affect port[0] or port[1].
-			 *
-			 * If the second port is already active, we can wait
-			 * until the next context-switch before contemplating
-			 * new requests. The GPU will be busy and we should be
-			 * able to resubmit the new ELSP before it idles,
-			 * avoiding pipeline bubbles (momentary pauses where
-			 * the driver is unable to keep up the supply of new
-			 * work).
-			 */
-			if (port_count(&port[1]))
-				goto unlock;
-
-			/* WaIdleLiteRestore:bdw,skl
-			 * Apply the wa NOOPs to prevent
-			 * ring:HEAD == rq:TAIL as we resubmit the
-			 * request. See gen8_emit_breadcrumb() for
-			 * where we prepare the padding after the
-			 * end of the request.
-			 */
-			last->tail = last->wa_tail;
 		}
+
+		/*
+		 * In theory, we could coalesce more requests onto
+		 * the second port (the first port is active, with
+		 * no preemptions pending). However, that means we
+		 * then have to deal with the possible lite-restore
+		 * of the second port (as we submit the ELSP, there
+		 * may be a context-switch) but also we may complete
+		 * the resubmission before the context-switch. Ergo,
+		 * coalescing onto the second port will cause a
+		 * preemption event, but we cannot predict whether
+		 * that will affect port[0] or port[1].
+		 *
+		 * If the second port is already active, we can wait
+		 * until the next context-switch before contemplating
+		 * new requests. The GPU will be busy and we should be
+		 * able to resubmit the new ELSP before it idles,
+		 * avoiding pipeline bubbles (momentary pauses where
+		 * the driver is unable to keep up the supply of new
+		 * work). However, we have to double check that the
+		 * priorities of the ports haven't been switched.
+		 */
+		if (port_count(&port[1]))
+			goto unlock;
+
+		/*
+		 * WaIdleLiteRestore:bdw,skl
+		 * Apply the wa NOOPs to prevent
+		 * ring:HEAD == rq:TAIL as we resubmit the
+		 * request. See gen8_emit_breadcrumb() for
+		 * where we prepare the padding after the
+		 * end of the request.
+		 */
+		last->tail = last->wa_tail;
 	}
 
-	do {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+	while (rb) {
+		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
@@ -628,8 +639,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	} while (rb);
+	}
 done:
+	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
 	execlists->first = rb;
 	if (submit)
 		port_assign(port, last);
@@ -690,7 +702,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	/* Flush the queued requests to the timeline list (for retiring). */
 	rb = execlists->first;
 	while (rb) {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+		struct i915_priolist *p = to_priolist(rb);
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
 			INIT_LIST_HEAD(&rq->priotree.link);
@@ -708,7 +720,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
 	/* Remaining _unready_ requests will be nop'ed when submitted */
 
-
+	execlists->queue_priority = INT_MIN;
 	execlists->queue = RB_ROOT;
 	execlists->first = NULL;
 	GEM_BUG_ON(port_isset(execlists->port));
@@ -867,10 +879,11 @@ static void execlists_submission_tasklet(unsigned long data)
 			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
 
 			rq = port_unpack(port, &count);
-			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
+			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
 				  engine->name,
 				  port->context_id, count,
-				  rq ? rq->global_seqno : 0);
+				  rq ? rq->global_seqno : 0,
+				  rq ? rq_prio(rq) : 0);
 			GEM_BUG_ON(count == 0);
 			if (--count == 0) {
 				GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
@@ -908,15 +921,19 @@ static void execlists_submission_tasklet(unsigned long data)
 		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
 }
 
-static void insert_request(struct intel_engine_cs *engine,
-			   struct i915_priotree *pt,
-			   int prio)
+static void queue_request(struct intel_engine_cs *engine,
+			  struct i915_priotree *pt,
+			  int prio)
 {
-	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
+	list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
+}
 
-	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
-	if (ptr_unmask_bits(p, 1))
+static void submit_queue(struct intel_engine_cs *engine, int prio)
+{
+	if (prio > engine->execlists.queue_priority) {
+		engine->execlists.queue_priority = prio;
 		tasklet_hi_schedule(&engine->execlists.tasklet);
+	}
 }
 
 static void execlists_submit_request(struct i915_request *request)
@@ -927,7 +944,8 @@ static void execlists_submit_request(struct i915_request *request)
 	/* Will be called from irq-context when using foreign fences. */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 
-	insert_request(engine, &request->priotree, request->priotree.priority);
+	queue_request(engine, &request->priotree, rq_prio(request));
+	submit_queue(engine, rq_prio(request));
 
 	GEM_BUG_ON(!engine->execlists.first);
 	GEM_BUG_ON(list_empty(&request->priotree.link));
@@ -983,7 +1001,7 @@ static void execlists_schedule(struct i915_request *request, int prio)
 	 * static void update_priorities(struct i915_priotree *pt, prio) {
 	 *	list_for_each_entry(dep, &pt->signalers_list, signal_link)
 	 *		update_priorities(dep->signal, prio)
-	 *	insert_request(pt);
+	 *	queue_request(pt);
 	 * }
 	 * but that may have unlimited recursion depth and so runs a very
 	 * real risk of overunning the kernel stack. Instead, we build
@@ -1046,8 +1064,9 @@ static void execlists_schedule(struct i915_request *request, int prio)
 		pt->priority = prio;
 		if (!list_empty(&pt->link)) {
 			__list_del_entry(&pt->link);
-			insert_request(engine, pt, prio);
+			queue_request(engine, pt, prio);
 		}
+		submit_queue(engine, prio);
 	}
 
 	spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index a9b83bf7e837..c4e9022b34e3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -257,6 +257,11 @@ struct intel_engine_execlists {
 	 */
 	unsigned int port_mask;
 
+	/**
+	 * @queue_priority: Highest pending priority.
+	 */
+	int queue_priority;
+
 	/**
 	 * @queue: queue of requests, in priority lists
 	 */
-- 
2.16.1

* [PATCH] drm/i915/preemption: Allow preemption between submission ports
  2018-02-22 14:11 [PATCH] drm/i915/preemption: Allow preemption between submission ports Chris Wilson
@ 2018-02-22 14:22 ` Chris Wilson
  2018-02-22 14:36   ` Chris Wilson
                     ` (2 more replies)
  2018-02-22 14:59 ` ✓ Fi.CI.BAT: success for drm/i915/preemption: Allow preemption between submission ports (rev2) Patchwork
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 10+ messages in thread
From: Chris Wilson @ 2018-02-22 14:22 UTC (permalink / raw)
  To: intel-gfx

Sometimes we need to boost the priority of an in-flight request, which
may lead to the situation where the second submission port then contains
a higher priority context than the first and so we need to inject a
preemption event. To do so we must always check inside
execlists_dequeue() whether there is a priority inversion between the
ports themselves as well as the head of the priority sorted queue, and we
cannot just skip dequeuing if the queue is empty.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
Rebase for conflicts
-Chris
---
 drivers/gpu/drm/i915/intel_engine_cs.c      |   2 +
 drivers/gpu/drm/i915/intel_guc_submission.c |  17 +--
 drivers/gpu/drm/i915/intel_lrc.c            | 161 ++++++++++++++++------------
 drivers/gpu/drm/i915/intel_ringbuffer.h     |   5 +
 4 files changed, 107 insertions(+), 78 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c31544406974..ce7fcf55ba18 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -423,6 +423,7 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
 	BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
 	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
 
+	execlists->queue_priority = INT_MIN;
 	execlists->queue = RB_ROOT;
 	execlists->first = NULL;
 }
@@ -1903,6 +1904,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	spin_lock_irq(&engine->timeline->lock);
 	list_for_each_entry(rq, &engine->timeline->requests, link)
 		print_request(m, rq, "\t\tE ");
+	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
 	for (rb = execlists->first; rb; rb = rb_next(rb)) {
 		struct i915_priolist *p =
 			rb_entry(rb, typeof(*p), node);
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 649113c7a3c2..586dde579903 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -75,6 +75,11 @@
  *
  */
 
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
 static inline bool is_high_priority(struct intel_guc_client *client)
 {
 	return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
@@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_engine_cs *engine)
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
 
-	if (!rb)
-		goto unlock;
-
 	if (port_isset(port)) {
 		if (engine->i915->preempt_context) {
 			struct guc_preempt_work *preempt_work =
 				&engine->i915->guc.preempt_work[engine->id];
 
-			if (rb_entry(rb, struct i915_priolist, node)->priority >
+			if (execlists->queue_priority >
 			    max(port_request(port)->priotree.priority, 0)) {
 				execlists_set_active(execlists,
 						     EXECLISTS_ACTIVE_PREEMPT);
@@ -706,8 +708,8 @@ static void guc_dequeue(struct intel_engine_cs *engine)
 	}
 	GEM_BUG_ON(port_isset(port));
 
-	do {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+	while (rb) {
+		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
@@ -736,8 +738,9 @@ static void guc_dequeue(struct intel_engine_cs *engine)
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	} while (rb);
+	}
 done:
+	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
 	execlists->first = rb;
 	if (submit) {
 		port_assign(port, last);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 964885b5d7cb..4bc72fbaf793 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -169,6 +169,23 @@ static void execlists_init_reg_state(u32 *reg_state,
 				     struct intel_engine_cs *engine,
 				     struct intel_ring *ring);
 
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
+static inline int rq_prio(const struct i915_request *rq)
+{
+	return rq->priotree.priority;
+}
+
+static inline bool need_preempt(const struct intel_engine_cs *engine,
+                                const struct i915_request *last,
+                                int prio)
+{
+	return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
+}
+
 /**
  * intel_lr_context_descriptor_update() - calculate & cache the descriptor
  * 					  descriptor for a pinned context
@@ -224,7 +241,7 @@ lookup_priolist(struct intel_engine_cs *engine,
 	parent = &execlists->queue.rb_node;
 	while (*parent) {
 		rb = *parent;
-		p = rb_entry(rb, typeof(*p), node);
+		p = to_priolist(rb);
 		if (prio > p->priority) {
 			parent = &rb->rb_left;
 		} else if (prio < p->priority) {
@@ -264,7 +281,7 @@ lookup_priolist(struct intel_engine_cs *engine,
 	if (first)
 		execlists->first = &p->node;
 
-	return ptr_pack_bits(p, first, 1);
+	return p;
 }
 
 static void unwind_wa_tail(struct i915_request *rq)
@@ -290,14 +307,10 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		__i915_request_unsubmit(rq);
 		unwind_wa_tail(rq);
 
-		GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
-		if (rq->priotree.priority != last_prio) {
-			p = lookup_priolist(engine,
-					    &rq->priotree,
-					    rq->priotree.priority);
-			p = ptr_mask_bits(p, 1);
-
-			last_prio = rq->priotree.priority;
+		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
+		if (rq_prio(rq) != last_prio) {
+			last_prio = rq_prio(rq);
+			p = lookup_priolist(engine, &rq->priotree, last_prio);
 		}
 
 		list_add(&rq->priotree.link, &p->requests);
@@ -397,10 +410,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
 			desc = execlists_update_context(rq);
 			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
 
-			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
+			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x, prio=%d\n",
 				  engine->name, n,
 				  port[n].context_id, count,
-				  rq->global_seqno);
+				  rq->global_seqno,
+				  rq_prio(rq));
 		} else {
 			GEM_BUG_ON(!n);
 			desc = 0;
@@ -453,12 +467,17 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
 		   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
 				      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
 
+	/*
+	 * Switch to our empty preempt context so
+	 * the state of the GPU is known (idle).
+	 */
 	GEM_TRACE("%s\n", engine->name);
 	for (n = execlists_num_ports(&engine->execlists); --n; )
 		elsp_write(0, engine->execlists.elsp);
 
 	elsp_write(ce->lrc_desc, engine->execlists.elsp);
 	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
+	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
 }
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
@@ -495,8 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	spin_lock_irq(&engine->timeline->lock);
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-	if (!rb)
-		goto unlock;
 
 	if (last) {
 		/*
@@ -519,54 +536,48 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 			goto unlock;
 
-		if (engine->i915->preempt_context &&
-		    rb_entry(rb, struct i915_priolist, node)->priority >
-		    max(last->priotree.priority, 0)) {
-			/*
-			 * Switch to our empty preempt context so
-			 * the state of the GPU is known (idle).
-			 */
+		if (need_preempt(engine, last, execlists->queue_priority)) {
 			inject_preempt_context(engine);
-			execlists_set_active(execlists,
-					     EXECLISTS_ACTIVE_PREEMPT);
 			goto unlock;
-		} else {
-			/*
-			 * In theory, we could coalesce more requests onto
-			 * the second port (the first port is active, with
-			 * no preemptions pending). However, that means we
-			 * then have to deal with the possible lite-restore
-			 * of the second port (as we submit the ELSP, there
-			 * may be a context-switch) but also we may complete
-			 * the resubmission before the context-switch. Ergo,
-			 * coalescing onto the second port will cause a
-			 * preemption event, but we cannot predict whether
-			 * that will affect port[0] or port[1].
-			 *
-			 * If the second port is already active, we can wait
-			 * until the next context-switch before contemplating
-			 * new requests. The GPU will be busy and we should be
-			 * able to resubmit the new ELSP before it idles,
-			 * avoiding pipeline bubbles (momentary pauses where
-			 * the driver is unable to keep up the supply of new
-			 * work).
-			 */
-			if (port_count(&port[1]))
-				goto unlock;
-
-			/* WaIdleLiteRestore:bdw,skl
-			 * Apply the wa NOOPs to prevent
-			 * ring:HEAD == rq:TAIL as we resubmit the
-			 * request. See gen8_emit_breadcrumb() for
-			 * where we prepare the padding after the
-			 * end of the request.
-			 */
-			last->tail = last->wa_tail;
 		}
+
+		/*
+		 * In theory, we could coalesce more requests onto
+		 * the second port (the first port is active, with
+		 * no preemptions pending). However, that means we
+		 * then have to deal with the possible lite-restore
+		 * of the second port (as we submit the ELSP, there
+		 * may be a context-switch) but also we may complete
+		 * the resubmission before the context-switch. Ergo,
+		 * coalescing onto the second port will cause a
+		 * preemption event, but we cannot predict whether
+		 * that will affect port[0] or port[1].
+		 *
+		 * If the second port is already active, we can wait
+		 * until the next context-switch before contemplating
+		 * new requests. The GPU will be busy and we should be
+		 * able to resubmit the new ELSP before it idles,
+		 * avoiding pipeline bubbles (momentary pauses where
+		 * the driver is unable to keep up the supply of new
+		 * work). However, we have to double check that the
+		 * priorities of the ports haven't been switched.
+		 */
+		if (port_count(&port[1]))
+			goto unlock;
+
+		/*
+		 * WaIdleLiteRestore:bdw,skl
+		 * Apply the wa NOOPs to prevent
+		 * ring:HEAD == rq:TAIL as we resubmit the
+		 * request. See gen8_emit_breadcrumb() for
+		 * where we prepare the padding after the
+		 * end of the request.
+		 */
+		last->tail = last->wa_tail;
 	}
 
-	do {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+	while (rb) {
+		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
@@ -628,8 +639,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	} while (rb);
+	}
 done:
+	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
 	execlists->first = rb;
 	if (submit)
 		port_assign(port, last);
@@ -690,7 +702,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	/* Flush the queued requests to the timeline list (for retiring). */
 	rb = execlists->first;
 	while (rb) {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+		struct i915_priolist *p = to_priolist(rb);
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
 			INIT_LIST_HEAD(&rq->priotree.link);
@@ -708,7 +720,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
 	/* Remaining _unready_ requests will be nop'ed when submitted */
 
-
+	execlists->queue_priority = INT_MIN;
 	execlists->queue = RB_ROOT;
 	execlists->first = NULL;
 	GEM_BUG_ON(port_isset(execlists->port));
@@ -864,10 +876,11 @@ static void execlists_submission_tasklet(unsigned long data)
 							EXECLISTS_ACTIVE_USER));
 
 			rq = port_unpack(port, &count);
-			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
+			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
 				  engine->name,
 				  port->context_id, count,
-				  rq ? rq->global_seqno : 0);
+				  rq ? rq->global_seqno : 0,
+				  rq ? rq_prio(rq) : 0);
 
 			/* Check the context/desc id for this event matches */
 			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
@@ -912,15 +925,19 @@ static void execlists_submission_tasklet(unsigned long data)
 		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
 }
 
-static void insert_request(struct intel_engine_cs *engine,
-			   struct i915_priotree *pt,
-			   int prio)
+static void queue_request(struct intel_engine_cs *engine,
+			  struct i915_priotree *pt,
+			  int prio)
 {
-	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
+	list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
+}
 
-	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
-	if (ptr_unmask_bits(p, 1))
+static void submit_queue(struct intel_engine_cs *engine, int prio)
+{
+	if (prio > engine->execlists.queue_priority) {
+		engine->execlists.queue_priority = prio;
 		tasklet_hi_schedule(&engine->execlists.tasklet);
+	}
 }
 
 static void execlists_submit_request(struct i915_request *request)
@@ -931,7 +948,8 @@ static void execlists_submit_request(struct i915_request *request)
 	/* Will be called from irq-context when using foreign fences. */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 
-	insert_request(engine, &request->priotree, request->priotree.priority);
+	queue_request(engine, &request->priotree, rq_prio(request));
+	submit_queue(engine, rq_prio(request));
 
 	GEM_BUG_ON(!engine->execlists.first);
 	GEM_BUG_ON(list_empty(&request->priotree.link));
@@ -987,7 +1005,7 @@ static void execlists_schedule(struct i915_request *request, int prio)
 	 * static void update_priorities(struct i915_priotree *pt, prio) {
 	 *	list_for_each_entry(dep, &pt->signalers_list, signal_link)
 	 *		update_priorities(dep->signal, prio)
-	 *	insert_request(pt);
+	 *	queue_request(pt);
 	 * }
 	 * but that may have unlimited recursion depth and so runs a very
 	 * real risk of overunning the kernel stack. Instead, we build
@@ -1050,8 +1068,9 @@ static void execlists_schedule(struct i915_request *request, int prio)
 		pt->priority = prio;
 		if (!list_empty(&pt->link)) {
 			__list_del_entry(&pt->link);
-			insert_request(engine, pt, prio);
+			queue_request(engine, pt, prio);
 		}
+		submit_queue(engine, prio);
 	}
 
 	spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index a9b83bf7e837..c4e9022b34e3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -257,6 +257,11 @@ struct intel_engine_execlists {
 	 */
 	unsigned int port_mask;
 
+	/**
+	 * @queue_priority: Highest pending priority.
+	 */
+	int queue_priority;
+
 	/**
 	 * @queue: queue of requests, in priority lists
 	 */
-- 
2.16.1

* Re: [PATCH] drm/i915/preemption: Allow preemption between submission ports
  2018-02-22 14:22 ` Chris Wilson
@ 2018-02-22 14:36   ` Chris Wilson
  2018-02-23 14:06   ` Mika Kuoppala
  2018-02-23 15:18   ` Mika Kuoppala
  2 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2018-02-22 14:36 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2018-02-22 14:22:29)
> Sometimes we need to boost the priority of an in-flight request, which
> may lead to the situation where the second submission port then contains
> a higher priority context than the first and so we need to inject a
> preemption event. To do so we must always check inside
> execlists_dequeue() whether there is a priority inversion between the
> ports themselves as well as the head of the priority sorted queue, and we
> cannot just skip dequeuing if the queue is empty.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Note this probably merits a Fixes tag. Outside of igt causing priority
inversion between GPU hogs, is it a big enough problem to deserve
sending to stable@?
-Chris

* ✓ Fi.CI.BAT: success for drm/i915/preemption: Allow preemption between submission ports (rev2)
  2018-02-22 14:11 [PATCH] drm/i915/preemption: Allow preemption between submission ports Chris Wilson
  2018-02-22 14:22 ` Chris Wilson
@ 2018-02-22 14:59 ` Patchwork
  2018-02-22 19:48 ` ✓ Fi.CI.IGT: " Patchwork
  2018-02-23 13:35 ` [PATCH] drm/i915/preemption: Allow preemption between submission ports Chris Wilson
  3 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2018-02-22 14:59 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/preemption: Allow preemption between submission ports (rev2)
URL   : https://patchwork.freedesktop.org/series/38774/
State : success

== Summary ==

Series 38774v2 drm/i915/preemption: Allow preemption between submission ports
https://patchwork.freedesktop.org/api/1.0/series/38774/revisions/2/mbox/

Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-c:
                pass       -> DMESG-WARN (fi-skl-6700k2) fdo#104108

fdo#104108 https://bugs.freedesktop.org/show_bug.cgi?id=104108

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:421s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:424s
fi-blb-e6850     total:288  pass:223  dwarn:1   dfail:0   fail:0   skip:64  time:374s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:479s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:286s
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:479s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:465s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:456s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:564s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:414s
fi-gdg-551       total:288  pass:179  dwarn:0   dfail:0   fail:1   skip:108 time:288s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:503s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:386s
fi-ilk-650       total:288  pass:227  dwarn:0   dfail:0   fail:1   skip:60  time:406s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:453s
fi-ivb-3770      total:288  pass:255  dwarn:0   dfail:0   fail:0   skip:33  time:413s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:450s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:495s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:454s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:494s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:589s
fi-skl-6260u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:429s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:504s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:519s
fi-skl-6700k2    total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:482s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:472s
fi-skl-guc       total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:406s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:431s
fi-snb-2520m     total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:523s
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:404s
Blacklisted hosts:
fi-cfl-8700k     total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:390s
fi-bxt-dsi failed to collect. IGT log at Patchwork_8127/fi-bxt-dsi/run0.log

77655eb53ee4b22bff8505da1d8141b93d5b864b drm-tip: 2018y-02m-22d-14h-14m-34s UTC integration manifest
376cc66b8d72 drm/i915/preemption: Allow preemption between submission ports

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8127/issues.html

* ✓ Fi.CI.IGT: success for drm/i915/preemption: Allow preemption between submission ports (rev2)
  2018-02-22 14:11 [PATCH] drm/i915/preemption: Allow preemption between submission ports Chris Wilson
  2018-02-22 14:22 ` Chris Wilson
  2018-02-22 14:59 ` ✓ Fi.CI.BAT: success for drm/i915/preemption: Allow preemption between submission ports (rev2) Patchwork
@ 2018-02-22 19:48 ` Patchwork
  2018-02-23 13:35 ` [PATCH] drm/i915/preemption: Allow preemption between submission ports Chris Wilson
  3 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2018-02-22 19:48 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/preemption: Allow preemption between submission ports (rev2)
URL   : https://patchwork.freedesktop.org/series/38774/
State : success

== Summary ==

Test gem_eio:
        Subgroup in-flight-contexts:
                incomplete -> PASS       (shard-apl) fdo#104945
Test kms_cursor_crc:
        Subgroup cursor-64x64-suspend:
                pass       -> SKIP       (shard-snb) fdo#102365
Test kms_chv_cursor_fail:
        Subgroup pipe-b-128x128-right-edge:
                dmesg-warn -> PASS       (shard-snb)
        Subgroup pipe-b-64x64-bottom-edge:
                dmesg-warn -> PASS       (shard-snb) fdo#105185
Test perf:
        Subgroup enable-disable:
                pass       -> FAIL       (shard-apl) fdo#103715
        Subgroup oa-exponents:
                incomplete -> PASS       (shard-apl) fdo#102254
Test kms_flip:
        Subgroup plain-flip-fb-recreate-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#100368 +1
Test kms_frontbuffer_tracking:
        Subgroup fbc-shrfb-scaledprimary:
                pass       -> DMESG-FAIL (shard-apl) fdo#103167 +2
Test kms_plane_lowres:
        Subgroup pipe-b-tiling-y:
                pass       -> FAIL       (shard-apl) fdo#103166

fdo#104945 https://bugs.freedesktop.org/show_bug.cgi?id=104945
fdo#102365 https://bugs.freedesktop.org/show_bug.cgi?id=102365
fdo#105185 https://bugs.freedesktop.org/show_bug.cgi?id=105185
fdo#103715 https://bugs.freedesktop.org/show_bug.cgi?id=103715
fdo#102254 https://bugs.freedesktop.org/show_bug.cgi?id=102254
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103167 https://bugs.freedesktop.org/show_bug.cgi?id=103167
fdo#103166 https://bugs.freedesktop.org/show_bug.cgi?id=103166

shard-apl        total:3465 pass:1816 dwarn:1   dfail:1   fail:16  skip:1631 time:12444s
shard-hsw        total:3465 pass:1767 dwarn:1   dfail:0   fail:3   skip:1693 time:11711s
shard-snb        total:3465 pass:1356 dwarn:1   dfail:0   fail:2   skip:2106 time:6489s
Blacklisted hosts:
shard-kbl        total:3448 pass:1950 dwarn:1   dfail:0   fail:14  skip:1482 time:9117s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8127/shards.html

* Re: [PATCH] drm/i915/preemption: Allow preemption between submission ports
  2018-02-22 14:11 [PATCH] drm/i915/preemption: Allow preemption between submission ports Chris Wilson
                   ` (2 preceding siblings ...)
  2018-02-22 19:48 ` ✓ Fi.CI.IGT: " Patchwork
@ 2018-02-23 13:35 ` Chris Wilson
  3 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2018-02-23 13:35 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2018-02-22 14:11:54)
> Sometimes we need to boost the priority of an in-flight request, which
> may lead to the situation where the second submission port then contains
> a higher priority context than the first and so we need to inject a
> preemption event. To do so we must always check inside
> execlists_dequeue() whether there is a priority inversion between the
> ports themselves as well as the head of the priority sorted queue, and we
> cannot just skip dequeuing if the queue is empty.

Michał noted this doesn't extend past 2-port submission as simply as I
thought it might. Nevertheless it does solve the problem we have today
of priority inversion within ELSP[2]. Extending the submission model as
a whole to more ports is left as an exercise to the reader. :|
-Chris
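
As a toy illustration of the ELSP[2] case that is covered (userspace C with made-up names, not driver code): once the preempt-to-idle completes, the two in-flight requests are unwound back onto the priority-sorted queue and the ports are refilled highest-priority first, so the boosted request ends up in port[0].

#include <stdio.h>
#include <stdlib.h>

struct req { const char *name; int prio; };

/* Sort descending by priority, standing in for the priolist rbtree. */
static int cmp_prio_desc(const void *a, const void *b)
{
	return ((const struct req *)b)->prio - ((const struct req *)a)->prio;
}

int main(void)
{
	/*
	 * A was running in port[0] at priority 0; B sat in port[1] and
	 * has just been boosted above it.
	 */
	struct req unwound[] = { { "A", 0 }, { "B", 1023 } };

	qsort(unwound, 2, sizeof(unwound[0]), cmp_prio_desc);
	printf("port[0] <- %s, port[1] <- %s\n",
	       unwound[0].name, unwound[1].name);	/* B then A */

	return 0;
}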

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_engine_cs.c      |   2 +
>  drivers/gpu/drm/i915/intel_guc_submission.c |  17 +--
>  drivers/gpu/drm/i915/intel_lrc.c            | 161 ++++++++++++++++------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h     |   5 +
>  4 files changed, 107 insertions(+), 78 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c31544406974..ce7fcf55ba18 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -423,6 +423,7 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
>         BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
>         GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
>  
> +       execlists->queue_priority = INT_MIN;
>         execlists->queue = RB_ROOT;
>         execlists->first = NULL;
>  }
> @@ -1903,6 +1904,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>         spin_lock_irq(&engine->timeline->lock);
>         list_for_each_entry(rq, &engine->timeline->requests, link)
>                 print_request(m, rq, "        E ");
> +       drm_printf(m, "        Queue priority: %d\n", execlists->queue_priority);
>         for (rb = execlists->first; rb; rb = rb_next(rb)) {
>                 struct i915_priolist *p =
>                         rb_entry(rb, typeof(*p), node);
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 649113c7a3c2..586dde579903 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -75,6 +75,11 @@
>   *
>   */
>  
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +       return rb_entry(rb, struct i915_priolist, node);
> +}
> +
>  static inline bool is_high_priority(struct intel_guc_client *client)
>  {
>         return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
> @@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>         rb = execlists->first;
>         GEM_BUG_ON(rb_first(&execlists->queue) != rb);
>  
> -       if (!rb)
> -               goto unlock;
> -
>         if (port_isset(port)) {
>                 if (engine->i915->preempt_context) {
>                         struct guc_preempt_work *preempt_work =
>                                 &engine->i915->guc.preempt_work[engine->id];
>  
> -                       if (rb_entry(rb, struct i915_priolist, node)->priority >
> +                       if (execlists->queue_priority >
>                             max(port_request(port)->priotree.priority, 0)) {
>                                 execlists_set_active(execlists,
>                                                      EXECLISTS_ACTIVE_PREEMPT);
> @@ -706,8 +708,8 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>         }
>         GEM_BUG_ON(port_isset(port));
>  
> -       do {
> -               struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +       while (rb) {
> +               struct i915_priolist *p = to_priolist(rb);
>                 struct i915_request *rq, *rn;
>  
>                 list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
> @@ -736,8 +738,9 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>                 INIT_LIST_HEAD(&p->requests);
>                 if (p->priority != I915_PRIORITY_NORMAL)
>                         kmem_cache_free(engine->i915->priorities, p);
> -       } while (rb);
> +       }
>  done:
> +       execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
>         execlists->first = rb;
>         if (submit) {
>                 port_assign(port, last);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e781c912f197..8a98da7a5c83 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -169,6 +169,23 @@ static void execlists_init_reg_state(u32 *reg_state,
>                                      struct intel_engine_cs *engine,
>                                      struct intel_ring *ring);
>  
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +       return rb_entry(rb, struct i915_priolist, node);
> +}
> +
> +static inline int rq_prio(const struct i915_request *rq)
> +{
> +       return rq->priotree.priority;
> +}
> +
> +static inline bool need_preempt(const struct intel_engine_cs *engine,
> +                                const struct i915_request *last,
> +                                int prio)
> +{
> +       return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
> +}
> +
>  /**
>   * intel_lr_context_descriptor_update() - calculate & cache the descriptor
>   *                                       descriptor for a pinned context
> @@ -224,7 +241,7 @@ lookup_priolist(struct intel_engine_cs *engine,
>         parent = &execlists->queue.rb_node;
>         while (*parent) {
>                 rb = *parent;
> -               p = rb_entry(rb, typeof(*p), node);
> +               p = to_priolist(rb);
>                 if (prio > p->priority) {
>                         parent = &rb->rb_left;
>                 } else if (prio < p->priority) {
> @@ -264,7 +281,7 @@ lookup_priolist(struct intel_engine_cs *engine,
>         if (first)
>                 execlists->first = &p->node;
>  
> -       return ptr_pack_bits(p, first, 1);
> +       return p;
>  }
>  
>  static void unwind_wa_tail(struct i915_request *rq)
> @@ -290,14 +307,10 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>                 __i915_request_unsubmit(rq);
>                 unwind_wa_tail(rq);
>  
> -               GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
> -               if (rq->priotree.priority != last_prio) {
> -                       p = lookup_priolist(engine,
> -                                           &rq->priotree,
> -                                           rq->priotree.priority);
> -                       p = ptr_mask_bits(p, 1);
> -
> -                       last_prio = rq->priotree.priority;
> +               GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> +               if (rq_prio(rq) != last_prio) {
> +                       last_prio = rq_prio(rq);
> +                       p = lookup_priolist(engine, &rq->priotree, last_prio);
>                 }
>  
>                 list_add(&rq->priotree.link, &p->requests);
> @@ -397,10 +410,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>                         desc = execlists_update_context(rq);
>                         GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
>  
> -                       GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
> +                       GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x, prio=%d\n",
>                                   engine->name, n,
>                                   port[n].context_id, count,
> -                                 rq->global_seqno);
> +                                 rq->global_seqno,
> +                                 rq_prio(rq));
>                 } else {
>                         GEM_BUG_ON(!n);
>                         desc = 0;
> @@ -453,12 +467,17 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>                    _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
>                                       CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
>  
> +       /*
> +        * Switch to our empty preempt context so
> +        * the state of the GPU is known (idle).
> +        */
>         GEM_TRACE("%s\n", engine->name);
>         for (n = execlists_num_ports(&engine->execlists); --n; )
>                 elsp_write(0, engine->execlists.elsp);
>  
>         elsp_write(ce->lrc_desc, engine->execlists.elsp);
>         execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
> +       execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
>  }
>  
>  static void execlists_dequeue(struct intel_engine_cs *engine)
> @@ -495,8 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>         spin_lock_irq(&engine->timeline->lock);
>         rb = execlists->first;
>         GEM_BUG_ON(rb_first(&execlists->queue) != rb);
> -       if (!rb)
> -               goto unlock;
>  
>         if (last) {
>                 /*
> @@ -519,54 +536,48 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>                 if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
>                         goto unlock;
>  
> -               if (engine->i915->preempt_context &&
> -                   rb_entry(rb, struct i915_priolist, node)->priority >
> -                   max(last->priotree.priority, 0)) {
> -                       /*
> -                        * Switch to our empty preempt context so
> -                        * the state of the GPU is known (idle).
> -                        */
> +               if (need_preempt(engine, last, execlists->queue_priority)) {
>                         inject_preempt_context(engine);
> -                       execlists_set_active(execlists,
> -                                            EXECLISTS_ACTIVE_PREEMPT);
>                         goto unlock;
> -               } else {
> -                       /*
> -                        * In theory, we could coalesce more requests onto
> -                        * the second port (the first port is active, with
> -                        * no preemptions pending). However, that means we
> -                        * then have to deal with the possible lite-restore
> -                        * of the second port (as we submit the ELSP, there
> -                        * may be a context-switch) but also we may complete
> -                        * the resubmission before the context-switch. Ergo,
> -                        * coalescing onto the second port will cause a
> -                        * preemption event, but we cannot predict whether
> -                        * that will affect port[0] or port[1].
> -                        *
> -                        * If the second port is already active, we can wait
> -                        * until the next context-switch before contemplating
> -                        * new requests. The GPU will be busy and we should be
> -                        * able to resubmit the new ELSP before it idles,
> -                        * avoiding pipeline bubbles (momentary pauses where
> -                        * the driver is unable to keep up the supply of new
> -                        * work).
> -                        */
> -                       if (port_count(&port[1]))
> -                               goto unlock;
> -
> -                       /* WaIdleLiteRestore:bdw,skl
> -                        * Apply the wa NOOPs to prevent
> -                        * ring:HEAD == rq:TAIL as we resubmit the
> -                        * request. See gen8_emit_breadcrumb() for
> -                        * where we prepare the padding after the
> -                        * end of the request.
> -                        */
> -                       last->tail = last->wa_tail;
>                 }
> +
> +               /*
> +                * In theory, we could coalesce more requests onto
> +                * the second port (the first port is active, with
> +                * no preemptions pending). However, that means we
> +                * then have to deal with the possible lite-restore
> +                * of the second port (as we submit the ELSP, there
> +                * may be a context-switch) but also we may complete
> +                * the resubmission before the context-switch. Ergo,
> +                * coalescing onto the second port will cause a
> +                * preemption event, but we cannot predict whether
> +                * that will affect port[0] or port[1].
> +                *
> +                * If the second port is already active, we can wait
> +                * until the next context-switch before contemplating
> +                * new requests. The GPU will be busy and we should be
> +                * able to resubmit the new ELSP before it idles,
> +                * avoiding pipeline bubbles (momentary pauses where
> +                * the driver is unable to keep up the supply of new
> +                * work). However, we have to double check that the
> +                * priorities of the ports haven't been switched.
> +                */
> +               if (port_count(&port[1]))
> +                       goto unlock;
> +
> +               /*
> +                * WaIdleLiteRestore:bdw,skl
> +                * Apply the wa NOOPs to prevent
> +                * ring:HEAD == rq:TAIL as we resubmit the
> +                * request. See gen8_emit_breadcrumb() for
> +                * where we prepare the padding after the
> +                * end of the request.
> +                */
> +               last->tail = last->wa_tail;
>         }
>  
> -       do {
> -               struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +       while (rb) {
> +               struct i915_priolist *p = to_priolist(rb);
>                 struct i915_request *rq, *rn;
>  
>                 list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
> @@ -628,8 +639,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>                 INIT_LIST_HEAD(&p->requests);
>                 if (p->priority != I915_PRIORITY_NORMAL)
>                         kmem_cache_free(engine->i915->priorities, p);
> -       } while (rb);
> +       }
>  done:
> +       execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
>         execlists->first = rb;
>         if (submit)
>                 port_assign(port, last);
> @@ -690,7 +702,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>         /* Flush the queued requests to the timeline list (for retiring). */
>         rb = execlists->first;
>         while (rb) {
> -               struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +               struct i915_priolist *p = to_priolist(rb);
>  
>                 list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
>                         INIT_LIST_HEAD(&rq->priotree.link);
> @@ -708,7 +720,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  
>         /* Remaining _unready_ requests will be nop'ed when submitted */
>  
> -
> +       execlists->queue_priority = INT_MIN;
>         execlists->queue = RB_ROOT;
>         execlists->first = NULL;
>         GEM_BUG_ON(port_isset(execlists->port));
> @@ -867,10 +879,11 @@ static void execlists_submission_tasklet(unsigned long data)
>                         GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
>  
>                         rq = port_unpack(port, &count);
> -                       GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
> +                       GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
>                                   engine->name,
>                                   port->context_id, count,
> -                                 rq ? rq->global_seqno : 0);
> +                                 rq ? rq->global_seqno : 0,
> +                                 rq ? rq_prio(rq) : 0);
>                         GEM_BUG_ON(count == 0);
>                         if (--count == 0) {
>                                 GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> @@ -908,15 +921,19 @@ static void execlists_submission_tasklet(unsigned long data)
>                 intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
>  }
>  
> -static void insert_request(struct intel_engine_cs *engine,
> -                          struct i915_priotree *pt,
> -                          int prio)
> +static void queue_request(struct intel_engine_cs *engine,
> +                         struct i915_priotree *pt,
> +                         int prio)
>  {
> -       struct i915_priolist *p = lookup_priolist(engine, pt, prio);
> +       list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
> +}
>  
> -       list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
> -       if (ptr_unmask_bits(p, 1))
> +static void submit_queue(struct intel_engine_cs *engine, int prio)
> +{
> +       if (prio > engine->execlists.queue_priority) {
> +               engine->execlists.queue_priority = prio;
>                 tasklet_hi_schedule(&engine->execlists.tasklet);
> +       }
>  }
>  
>  static void execlists_submit_request(struct i915_request *request)
> @@ -927,7 +944,8 @@ static void execlists_submit_request(struct i915_request *request)
>         /* Will be called from irq-context when using foreign fences. */
>         spin_lock_irqsave(&engine->timeline->lock, flags);
>  
> -       insert_request(engine, &request->priotree, request->priotree.priority);
> +       queue_request(engine, &request->priotree, rq_prio(request));
> +       submit_queue(engine, rq_prio(request));
>  
>         GEM_BUG_ON(!engine->execlists.first);
>         GEM_BUG_ON(list_empty(&request->priotree.link));
> @@ -983,7 +1001,7 @@ static void execlists_schedule(struct i915_request *request, int prio)
>          * static void update_priorities(struct i915_priotree *pt, prio) {
>          *      list_for_each_entry(dep, &pt->signalers_list, signal_link)
>          *              update_priorities(dep->signal, prio)
> -        *      insert_request(pt);
> +        *      queue_request(pt);
>          * }
>          * but that may have unlimited recursion depth and so runs a very
>  	 * real risk of overrunning the kernel stack. Instead, we build
> @@ -1046,8 +1064,9 @@ static void execlists_schedule(struct i915_request *request, int prio)
>                 pt->priority = prio;
>                 if (!list_empty(&pt->link)) {
>                         __list_del_entry(&pt->link);
> -                       insert_request(engine, pt, prio);
> +                       queue_request(engine, pt, prio);
>                 }
> +               submit_queue(engine, prio);
>         }
>  
>         spin_unlock_irq(&engine->timeline->lock);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index a9b83bf7e837..c4e9022b34e3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -257,6 +257,11 @@ struct intel_engine_execlists {
>          */
>         unsigned int port_mask;
>  
> +       /**
> +        * @queue_priority: Highest pending priority.
> +        */
> +       int queue_priority;
> +
>         /**
>          * @queue: queue of requests, in priority lists
>          */
> -- 
> 2.16.1
> 

* Re: [PATCH] drm/i915/preemption: Allow preemption between submission ports
  2018-02-22 14:22 ` Chris Wilson
  2018-02-22 14:36   ` Chris Wilson
@ 2018-02-23 14:06   ` Mika Kuoppala
  2018-02-23 14:23     ` Chris Wilson
  2018-02-23 15:18   ` Mika Kuoppala
  2 siblings, 1 reply; 10+ messages in thread
From: Mika Kuoppala @ 2018-02-23 14:06 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Sometimes we need to boost the priority of an in-flight request, which
> may lead to the situation where the second submission port then contains
> a higher priority context than the first and so we need to inject a
> preemption event. To do so we must always check inside
> execlists_dequeue() whether there is a priority inversion between the
> ports themselves as well as the head of the priority sorted queue, and we
> cannot just skip dequeuing if the queue is empty.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
> Rebase for conflicts
> -Chris
> ---
>  drivers/gpu/drm/i915/intel_engine_cs.c      |   2 +
>  drivers/gpu/drm/i915/intel_guc_submission.c |  17 +--
>  drivers/gpu/drm/i915/intel_lrc.c            | 161 ++++++++++++++++------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h     |   5 +
>  4 files changed, 107 insertions(+), 78 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c31544406974..ce7fcf55ba18 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -423,6 +423,7 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
>  	BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
>  	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
>  
> +	execlists->queue_priority = INT_MIN;
>  	execlists->queue = RB_ROOT;
>  	execlists->first = NULL;
>  }
> @@ -1903,6 +1904,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>  	spin_lock_irq(&engine->timeline->lock);
>  	list_for_each_entry(rq, &engine->timeline->requests, link)
>  		print_request(m, rq, "\t\tE ");
> +	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
>  	for (rb = execlists->first; rb; rb = rb_next(rb)) {
>  		struct i915_priolist *p =
>  			rb_entry(rb, typeof(*p), node);
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 649113c7a3c2..586dde579903 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -75,6 +75,11 @@
>   *
>   */
>  
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +	return rb_entry(rb, struct i915_priolist, node);
> +}
> +
>  static inline bool is_high_priority(struct intel_guc_client *client)
>  {
>  	return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
> @@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  	rb = execlists->first;
>  	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
>  
> -	if (!rb)
> -		goto unlock;
> -
>  	if (port_isset(port)) {
>  		if (engine->i915->preempt_context) {
>  			struct guc_preempt_work *preempt_work =
>  				&engine->i915->guc.preempt_work[engine->id];
>  
> -			if (rb_entry(rb, struct i915_priolist, node)->priority >
> +			if (execlists->queue_priority >
>  			    max(port_request(port)->priotree.priority, 0)) {
>  				execlists_set_active(execlists,
>  						     EXECLISTS_ACTIVE_PREEMPT);

This is the priority inversion part? We preempt and clear the ports
to rearrange if the last port has a higher priority request?

> @@ -706,8 +708,8 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  	}
>  	GEM_BUG_ON(port_isset(port));
>  
> -	do {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	while (rb) {
> +		struct i915_priolist *p = to_priolist(rb);
>  		struct i915_request *rq, *rn;
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
> @@ -736,8 +738,9 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  		INIT_LIST_HEAD(&p->requests);
>  		if (p->priority != I915_PRIORITY_NORMAL)
>  			kmem_cache_free(engine->i915->priorities, p);
> -	} while (rb);
> +	}
>  done:
> +	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
>  	execlists->first = rb;
>  	if (submit) {
>  		port_assign(port, last);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 964885b5d7cb..4bc72fbaf793 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -169,6 +169,23 @@ static void execlists_init_reg_state(u32 *reg_state,
>  				     struct intel_engine_cs *engine,
>  				     struct intel_ring *ring);
>  
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +	return rb_entry(rb, struct i915_priolist, node);
> +}
> +
> +static inline int rq_prio(const struct i915_request *rq)
> +{
> +	return rq->priotree.priority;
> +}
> +
> +static inline bool need_preempt(const struct intel_engine_cs *engine,
> +                                const struct i915_request *last,
> +                                int prio)
> +{
> +	return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
> +}
> +
>  /**
>   * intel_lr_context_descriptor_update() - calculate & cache the descriptor
>   * 					  descriptor for a pinned context
> @@ -224,7 +241,7 @@ lookup_priolist(struct intel_engine_cs *engine,
>  	parent = &execlists->queue.rb_node;
>  	while (*parent) {
>  		rb = *parent;
> -		p = rb_entry(rb, typeof(*p), node);
> +		p = to_priolist(rb);
>  		if (prio > p->priority) {
>  			parent = &rb->rb_left;
>  		} else if (prio < p->priority) {
> @@ -264,7 +281,7 @@ lookup_priolist(struct intel_engine_cs *engine,
>  	if (first)
>  		execlists->first = &p->node;
>  
> -	return ptr_pack_bits(p, first, 1);
> +	return p;

Hmm, there is no longer any need to decode 'first', as we always resubmit
the queue depending on the prio?


>  }
>  
>  static void unwind_wa_tail(struct i915_request *rq)
> @@ -290,14 +307,10 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>  		__i915_request_unsubmit(rq);
>  		unwind_wa_tail(rq);
>  
> -		GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
> -		if (rq->priotree.priority != last_prio) {
> -			p = lookup_priolist(engine,
> -					    &rq->priotree,
> -					    rq->priotree.priority);
> -			p = ptr_mask_bits(p, 1);
> -
> -			last_prio = rq->priotree.priority;
> +		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> +		if (rq_prio(rq) != last_prio) {
> +			last_prio = rq_prio(rq);
> +			p = lookup_priolist(engine, &rq->priotree, last_prio);
>  		}
>  
>  		list_add(&rq->priotree.link, &p->requests);
> @@ -397,10 +410,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>  			desc = execlists_update_context(rq);
>  			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
>  
> -			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
> +			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x, prio=%d\n",
>  				  engine->name, n,
>  				  port[n].context_id, count,
> -				  rq->global_seqno);
> +				  rq->global_seqno,
> +				  rq_prio(rq));
>  		} else {
>  			GEM_BUG_ON(!n);
>  			desc = 0;
> @@ -453,12 +467,17 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>  		   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
>  				      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
>  
> +	/*
> +	 * Switch to our empty preempt context so
> +	 * the state of the GPU is known (idle).
> +	 */
>  	GEM_TRACE("%s\n", engine->name);
>  	for (n = execlists_num_ports(&engine->execlists); --n; )
>  		elsp_write(0, engine->execlists.elsp);
>  
>  	elsp_write(ce->lrc_desc, engine->execlists.elsp);
>  	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
> +	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);

Surely a better place. Now, looking at this, would it be more prudent to
move both the clear and the set right before the last elsp write?

Well I guess it really doesn't matter as we hold the timeline lock.

-Mika

>  }
>  
>  static void execlists_dequeue(struct intel_engine_cs *engine)
> @@ -495,8 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	spin_lock_irq(&engine->timeline->lock);
>  	rb = execlists->first;
>  	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
> -	if (!rb)
> -		goto unlock;
>  
>  	if (last) {
>  		/*
> @@ -519,54 +536,48 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
>  			goto unlock;
>  
> -		if (engine->i915->preempt_context &&
> -		    rb_entry(rb, struct i915_priolist, node)->priority >
> -		    max(last->priotree.priority, 0)) {
> -			/*
> -			 * Switch to our empty preempt context so
> -			 * the state of the GPU is known (idle).
> -			 */
> +		if (need_preempt(engine, last, execlists->queue_priority)) {
>  			inject_preempt_context(engine);
> -			execlists_set_active(execlists,
> -					     EXECLISTS_ACTIVE_PREEMPT);
>  			goto unlock;
> -		} else {
> -			/*
> -			 * In theory, we could coalesce more requests onto
> -			 * the second port (the first port is active, with
> -			 * no preemptions pending). However, that means we
> -			 * then have to deal with the possible lite-restore
> -			 * of the second port (as we submit the ELSP, there
> -			 * may be a context-switch) but also we may complete
> -			 * the resubmission before the context-switch. Ergo,
> -			 * coalescing onto the second port will cause a
> -			 * preemption event, but we cannot predict whether
> -			 * that will affect port[0] or port[1].
> -			 *
> -			 * If the second port is already active, we can wait
> -			 * until the next context-switch before contemplating
> -			 * new requests. The GPU will be busy and we should be
> -			 * able to resubmit the new ELSP before it idles,
> -			 * avoiding pipeline bubbles (momentary pauses where
> -			 * the driver is unable to keep up the supply of new
> -			 * work).
> -			 */
> -			if (port_count(&port[1]))
> -				goto unlock;
> -
> -			/* WaIdleLiteRestore:bdw,skl
> -			 * Apply the wa NOOPs to prevent
> -			 * ring:HEAD == rq:TAIL as we resubmit the
> -			 * request. See gen8_emit_breadcrumb() for
> -			 * where we prepare the padding after the
> -			 * end of the request.
> -			 */
> -			last->tail = last->wa_tail;
>  		}
> +
> +		/*
> +		 * In theory, we could coalesce more requests onto
> +		 * the second port (the first port is active, with
> +		 * no preemptions pending). However, that means we
> +		 * then have to deal with the possible lite-restore
> +		 * of the second port (as we submit the ELSP, there
> +		 * may be a context-switch) but also we may complete
> +		 * the resubmission before the context-switch. Ergo,
> +		 * coalescing onto the second port will cause a
> +		 * preemption event, but we cannot predict whether
> +		 * that will affect port[0] or port[1].
> +		 *
> +		 * If the second port is already active, we can wait
> +		 * until the next context-switch before contemplating
> +		 * new requests. The GPU will be busy and we should be
> +		 * able to resubmit the new ELSP before it idles,
> +		 * avoiding pipeline bubbles (momentary pauses where
> +		 * the driver is unable to keep up the supply of new
> +		 * work). However, we have to double check that the
> +		 * priorities of the ports haven't been switched.
> +		 */
> +		if (port_count(&port[1]))
> +			goto unlock;
> +
> +		/*
> +		 * WaIdleLiteRestore:bdw,skl
> +		 * Apply the wa NOOPs to prevent
> +		 * ring:HEAD == rq:TAIL as we resubmit the
> +		 * request. See gen8_emit_breadcrumb() for
> +		 * where we prepare the padding after the
> +		 * end of the request.
> +		 */
> +		last->tail = last->wa_tail;
>  	}
>  
> -	do {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	while (rb) {
> +		struct i915_priolist *p = to_priolist(rb);
>  		struct i915_request *rq, *rn;
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
> @@ -628,8 +639,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		INIT_LIST_HEAD(&p->requests);
>  		if (p->priority != I915_PRIORITY_NORMAL)
>  			kmem_cache_free(engine->i915->priorities, p);
> -	} while (rb);
> +	}
>  done:
> +	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
>  	execlists->first = rb;
>  	if (submit)
>  		port_assign(port, last);
> @@ -690,7 +702,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  	/* Flush the queued requests to the timeline list (for retiring). */
>  	rb = execlists->first;
>  	while (rb) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +		struct i915_priolist *p = to_priolist(rb);
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
>  			INIT_LIST_HEAD(&rq->priotree.link);
> @@ -708,7 +720,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  
>  	/* Remaining _unready_ requests will be nop'ed when submitted */
>  
> -
> +	execlists->queue_priority = INT_MIN;
>  	execlists->queue = RB_ROOT;
>  	execlists->first = NULL;
>  	GEM_BUG_ON(port_isset(execlists->port));
> @@ -864,10 +876,11 @@ static void execlists_submission_tasklet(unsigned long data)
>  							EXECLISTS_ACTIVE_USER));
>  
>  			rq = port_unpack(port, &count);
> -			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
> +			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
>  				  engine->name,
>  				  port->context_id, count,
> -				  rq ? rq->global_seqno : 0);
> +				  rq ? rq->global_seqno : 0,
> +				  rq ? rq_prio(rq) : 0);
>  
>  			/* Check the context/desc id for this event matches */
>  			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> @@ -912,15 +925,19 @@ static void execlists_submission_tasklet(unsigned long data)
>  		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
>  }
>  
> -static void insert_request(struct intel_engine_cs *engine,
> -			   struct i915_priotree *pt,
> -			   int prio)
> +static void queue_request(struct intel_engine_cs *engine,
> +			  struct i915_priotree *pt,
> +			  int prio)
>  {
> -	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
> +	list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
> +}
>  
> -	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
> -	if (ptr_unmask_bits(p, 1))
> +static void submit_queue(struct intel_engine_cs *engine, int prio)
> +{
> +	if (prio > engine->execlists.queue_priority) {
> +		engine->execlists.queue_priority = prio;
>  		tasklet_hi_schedule(&engine->execlists.tasklet);
> +	}
>  }
>  
>  static void execlists_submit_request(struct i915_request *request)
> @@ -931,7 +948,8 @@ static void execlists_submit_request(struct i915_request *request)
>  	/* Will be called from irq-context when using foreign fences. */
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
>  
> -	insert_request(engine, &request->priotree, request->priotree.priority);
> +	queue_request(engine, &request->priotree, rq_prio(request));
> +	submit_queue(engine, rq_prio(request));
>  
>  	GEM_BUG_ON(!engine->execlists.first);
>  	GEM_BUG_ON(list_empty(&request->priotree.link));
> @@ -987,7 +1005,7 @@ static void execlists_schedule(struct i915_request *request, int prio)
>  	 * static void update_priorities(struct i915_priotree *pt, prio) {
>  	 *	list_for_each_entry(dep, &pt->signalers_list, signal_link)
>  	 *		update_priorities(dep->signal, prio)
> -	 *	insert_request(pt);
> +	 *	queue_request(pt);
>  	 * }
>  	 * but that may have unlimited recursion depth and so runs a very
>  	 * real risk of overrunning the kernel stack. Instead, we build
> @@ -1050,8 +1068,9 @@ static void execlists_schedule(struct i915_request *request, int prio)
>  		pt->priority = prio;
>  		if (!list_empty(&pt->link)) {
>  			__list_del_entry(&pt->link);
> -			insert_request(engine, pt, prio);
> +			queue_request(engine, pt, prio);
>  		}
> +		submit_queue(engine, prio);
>  	}
>  
>  	spin_unlock_irq(&engine->timeline->lock);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index a9b83bf7e837..c4e9022b34e3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -257,6 +257,11 @@ struct intel_engine_execlists {
>  	 */
>  	unsigned int port_mask;
>  
> +	/**
> +	 * @queue_priority: Highest pending priority.
> +	 */
> +	int queue_priority;
> +
>  	/**
>  	 * @queue: queue of requests, in priority lists
>  	 */
> -- 
> 2.16.1

* Re: [PATCH] drm/i915/preemption: Allow preemption between submission ports
  2018-02-23 14:06   ` Mika Kuoppala
@ 2018-02-23 14:23     ` Chris Wilson
  0 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2018-02-23 14:23 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-02-23 14:06:06)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >  static inline bool is_high_priority(struct intel_guc_client *client)
> >  {
> >       return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
> > @@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_engine_cs *engine)
> >       rb = execlists->first;
> >       GEM_BUG_ON(rb_first(&execlists->queue) != rb);
> >  
> > -     if (!rb)
> > -             goto unlock;
> > -
> >       if (port_isset(port)) {
> >               if (engine->i915->preempt_context) {
> >                       struct guc_preempt_work *preempt_work =
> >                               &engine->i915->guc.preempt_work[engine->id];
> >  
> > -                     if (rb_entry(rb, struct i915_priolist, node)->priority >
> > +                     if (execlists->queue_priority >
> >                           max(port_request(port)->priotree.priority, 0)) {
> >                               execlists_set_active(execlists,
> >                                                    EXECLISTS_ACTIVE_PREEMPT);
> 
> This is the priority inversion part? We preempt and clear the ports
> to rearrange if the last port has a higher priority request?

Yes, along with a strong kick from

> > @@ -1050,8 +1068,9 @@ static void execlists_schedule(struct i915_request *request, int prio)
> >               pt->priority = prio;
> >               if (!list_empty(&pt->link)) {
> >                       __list_del_entry(&pt->link);
> > -                     insert_request(engine, pt, prio);
> > +                     queue_request(engine, pt, prio);
> >               }
> > +             submit_queue(engine, prio);

So that we re-evaluate ELSP if the active prio changes.
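
To make the inversion concrete, here is a rough sequence (the priority
values are made up for illustration):

	port[0]: ctx A, last request prio 0   (executing)
	port[1]: ctx B, last request prio 0
	queue:   empty, so queue_priority == INT_MIN

	execlists_schedule() then boosts B's in-flight request to prio 1024.
	Its priotree.link is already empty (the request was submitted), so
	only submit_queue(engine, 1024) runs; that raises queue_priority and
	kicks the tasklet. execlists_dequeue() now evaluates
	need_preempt(engine, last, 1024), and 1024 > max(rq_prio(last), 0)
	holds, so we inject the preempt context; once the preemption
	completes, the unwound requests are requeued and resubmitted with B
	ahead of A.

Previously nothing kicked the tasklet for an already-submitted request,
and even if something had, the empty rbtree meant execlists_dequeue()
bailed out at the old "if (!rb) goto unlock;".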

> > @@ -264,7 +281,7 @@ lookup_priolist(struct intel_engine_cs *engine,
> >       if (first)
> >               execlists->first = &p->node;
> >  
> > -     return ptr_pack_bits(p, first, 1);
> > +     return p;
> 
> Hmm, there is no longer any need to decode 'first', as we always resubmit
> the queue depending on the prio?

Right, the first bit is now checked against queue_priority instead. If
we are higher priority than the queue, we must rerun the tasklet.
Whereas before we knew we only had to do it if we inserted into the
start of the queue.
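
As a side-by-side sketch (pulled from the hunks above, with the priolist
lookup elided):

	/* before: kick only if the request was inserted at the head of
	 * the rbtree (the bit packed into the lookup_priolist() return)
	 */
	if (ptr_unmask_bits(p, 1))
		tasklet_hi_schedule(&engine->execlists.tasklet);

	/* after: kick whenever the submitted (or boosted) prio beats the
	 * cached rbtree head; queue_priority is INT_MIN when the queue is
	 * empty, so any new work still reschedules the tasklet
	 */
	if (prio > engine->execlists.queue_priority) {
		engine->execlists.queue_priority = prio;
		tasklet_hi_schedule(&engine->execlists.tasklet);
	}

which is what lets a boost on a request that only lives in the ELSP ports
(and not in the rbtree at all) still force a re-run of execlists_dequeue().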

> > @@ -453,12 +467,17 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
> >                  _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
> >                                     CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
> >  
> > +     /*
> > +      * Switch to our empty preempt context so
> > +      * the state of the GPU is known (idle).
> > +      */
> >       GEM_TRACE("%s\n", engine->name);
> >       for (n = execlists_num_ports(&engine->execlists); --n; )
> >               elsp_write(0, engine->execlists.elsp);
> >  
> >       elsp_write(ce->lrc_desc, engine->execlists.elsp);
> >       execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
> > +     execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
> 
> Surely a better place. Now, looking at this, would it be more prudent to
> move both the clear and the set right before the last elsp write?
> 
> Well I guess it really doesn't matter as we hold the timeline lock.

And we only process ELSP from a single cpu, so it's all sequential,
right.
-Chris

* Re: [PATCH] drm/i915/preemption: Allow preemption between submission ports
  2018-02-22 14:22 ` Chris Wilson
  2018-02-22 14:36   ` Chris Wilson
  2018-02-23 14:06   ` Mika Kuoppala
@ 2018-02-23 15:18   ` Mika Kuoppala
  2018-02-23 16:41     ` Chris Wilson
  2 siblings, 1 reply; 10+ messages in thread
From: Mika Kuoppala @ 2018-02-23 15:18 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Sometimes we need to boost the priority of an in-flight request, which
> may lead to the situation where the second submission port then contains
> a higher priority context than the first and so we need to inject a
> preemption event. To do so we must always check inside
> execlists_dequeue() whether there is a priority inversion between the
> ports themselves as well as the head of the priority sorted queue, and we
> cannot just skip dequeuing if the queue is empty.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
> Rebase for conflicts
> -Chris
> ---
>  drivers/gpu/drm/i915/intel_engine_cs.c      |   2 +
>  drivers/gpu/drm/i915/intel_guc_submission.c |  17 +--
>  drivers/gpu/drm/i915/intel_lrc.c            | 161 ++++++++++++++++------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h     |   5 +
>  4 files changed, 107 insertions(+), 78 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c31544406974..ce7fcf55ba18 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -423,6 +423,7 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
>  	BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
>  	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
>  
> +	execlists->queue_priority = INT_MIN;
>  	execlists->queue = RB_ROOT;
>  	execlists->first = NULL;
>  }
> @@ -1903,6 +1904,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>  	spin_lock_irq(&engine->timeline->lock);
>  	list_for_each_entry(rq, &engine->timeline->requests, link)
>  		print_request(m, rq, "\t\tE ");
> +	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
>  	for (rb = execlists->first; rb; rb = rb_next(rb)) {
>  		struct i915_priolist *p =
>  			rb_entry(rb, typeof(*p), node);
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 649113c7a3c2..586dde579903 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -75,6 +75,11 @@
>   *
>   */
>  
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +	return rb_entry(rb, struct i915_priolist, node);
> +}
> +
>  static inline bool is_high_priority(struct intel_guc_client *client)
>  {
>  	return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
> @@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  	rb = execlists->first;
>  	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
>  
> -	if (!rb)
> -		goto unlock;
> -
>  	if (port_isset(port)) {
>  		if (engine->i915->preempt_context) {
>  			struct guc_preempt_work *preempt_work =
>  				&engine->i915->guc.preempt_work[engine->id];
>  
> -			if (rb_entry(rb, struct i915_priolist, node)->priority >
> +			if (execlists->queue_priority >
>  			    max(port_request(port)->priotree.priority, 0)) {
>  				execlists_set_active(execlists,
>  						     EXECLISTS_ACTIVE_PREEMPT);
> @@ -706,8 +708,8 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  	}
>  	GEM_BUG_ON(port_isset(port));
>  
> -	do {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	while (rb) {
> +		struct i915_priolist *p = to_priolist(rb);
>  		struct i915_request *rq, *rn;
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
> @@ -736,8 +738,9 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  		INIT_LIST_HEAD(&p->requests);
>  		if (p->priority != I915_PRIORITY_NORMAL)
>  			kmem_cache_free(engine->i915->priorities, p);
> -	} while (rb);
> +	}
>  done:
> +	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
>  	execlists->first = rb;
>  	if (submit) {
>  		port_assign(port, last);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 964885b5d7cb..4bc72fbaf793 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -169,6 +169,23 @@ static void execlists_init_reg_state(u32 *reg_state,
>  				     struct intel_engine_cs *engine,
>  				     struct intel_ring *ring);
>  
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +	return rb_entry(rb, struct i915_priolist, node);
> +}
> +
> +static inline int rq_prio(const struct i915_request *rq)
> +{
> +	return rq->priotree.priority;
> +}
> +
> +static inline bool need_preempt(const struct intel_engine_cs *engine,
> +                                const struct i915_request *last,
> +                                int prio)
> +{
> +	return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
> +}
> +
>  /**
>   * intel_lr_context_descriptor_update() - calculate & cache the descriptor
>   * 					  descriptor for a pinned context
> @@ -224,7 +241,7 @@ lookup_priolist(struct intel_engine_cs *engine,
>  	parent = &execlists->queue.rb_node;
>  	while (*parent) {
>  		rb = *parent;
> -		p = rb_entry(rb, typeof(*p), node);
> +		p = to_priolist(rb);
>  		if (prio > p->priority) {
>  			parent = &rb->rb_left;
>  		} else if (prio < p->priority) {
> @@ -264,7 +281,7 @@ lookup_priolist(struct intel_engine_cs *engine,
>  	if (first)
>  		execlists->first = &p->node;
>  
> -	return ptr_pack_bits(p, first, 1);
> +	return p;
>  }
>  
>  static void unwind_wa_tail(struct i915_request *rq)
> @@ -290,14 +307,10 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>  		__i915_request_unsubmit(rq);
>  		unwind_wa_tail(rq);
>  
> -		GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
> -		if (rq->priotree.priority != last_prio) {
> -			p = lookup_priolist(engine,
> -					    &rq->priotree,
> -					    rq->priotree.priority);
> -			p = ptr_mask_bits(p, 1);
> -
> -			last_prio = rq->priotree.priority;
> +		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> +		if (rq_prio(rq) != last_prio) {
> +			last_prio = rq_prio(rq);
> +			p = lookup_priolist(engine, &rq->priotree, last_prio);
>  		}
>  
>  		list_add(&rq->priotree.link, &p->requests);
> @@ -397,10 +410,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>  			desc = execlists_update_context(rq);
>  			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
>  
> -			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
> +			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x, prio=%d\n",
>  				  engine->name, n,
>  				  port[n].context_id, count,
> -				  rq->global_seqno);
> +				  rq->global_seqno,
> +				  rq_prio(rq));
>  		} else {
>  			GEM_BUG_ON(!n);
>  			desc = 0;
> @@ -453,12 +467,17 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>  		   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
>  				      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
>  
> +	/*
> +	 * Switch to our empty preempt context so
> +	 * the state of the GPU is known (idle).
> +	 */
>  	GEM_TRACE("%s\n", engine->name);
>  	for (n = execlists_num_ports(&engine->execlists); --n; )
>  		elsp_write(0, engine->execlists.elsp);
>  
>  	elsp_write(ce->lrc_desc, engine->execlists.elsp);
>  	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
> +	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
>  }
>  
>  static void execlists_dequeue(struct intel_engine_cs *engine)
> @@ -495,8 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	spin_lock_irq(&engine->timeline->lock);
>  	rb = execlists->first;
>  	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
> -	if (!rb)
> -		goto unlock;
>  
>  	if (last) {
>  		/*
> @@ -519,54 +536,48 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
>  			goto unlock;
>  
> -		if (engine->i915->preempt_context &&
> -		    rb_entry(rb, struct i915_priolist, node)->priority >
> -		    max(last->priotree.priority, 0)) {
> -			/*
> -			 * Switch to our empty preempt context so
> -			 * the state of the GPU is known (idle).
> -			 */
> +		if (need_preempt(engine, last, execlists->queue_priority)) {
>  			inject_preempt_context(engine);
> -			execlists_set_active(execlists,
> -					     EXECLISTS_ACTIVE_PREEMPT);
>  			goto unlock;
> -		} else {
> -			/*
> -			 * In theory, we could coalesce more requests onto
> -			 * the second port (the first port is active, with
> -			 * no preemptions pending). However, that means we
> -			 * then have to deal with the possible lite-restore
> -			 * of the second port (as we submit the ELSP, there
> -			 * may be a context-switch) but also we may complete
> -			 * the resubmission before the context-switch. Ergo,
> -			 * coalescing onto the second port will cause a
> -			 * preemption event, but we cannot predict whether
> -			 * that will affect port[0] or port[1].
> -			 *
> -			 * If the second port is already active, we can wait
> -			 * until the next context-switch before contemplating
> -			 * new requests. The GPU will be busy and we should be
> -			 * able to resubmit the new ELSP before it idles,
> -			 * avoiding pipeline bubbles (momentary pauses where
> -			 * the driver is unable to keep up the supply of new
> -			 * work).
> -			 */
> -			if (port_count(&port[1]))
> -				goto unlock;
> -
> -			/* WaIdleLiteRestore:bdw,skl
> -			 * Apply the wa NOOPs to prevent
> -			 * ring:HEAD == rq:TAIL as we resubmit the
> -			 * request. See gen8_emit_breadcrumb() for
> -			 * where we prepare the padding after the
> -			 * end of the request.
> -			 */
> -			last->tail = last->wa_tail;
>  		}
> +
> +		/*
> +		 * In theory, we could coalesce more requests onto
> +		 * the second port (the first port is active, with
> +		 * no preemptions pending). However, that means we
> +		 * then have to deal with the possible lite-restore
> +		 * of the second port (as we submit the ELSP, there
> +		 * may be a context-switch) but also we may complete
> +		 * the resubmission before the context-switch. Ergo,
> +		 * coalescing onto the second port will cause a
> +		 * preemption event, but we cannot predict whether
> +		 * that will affect port[0] or port[1].
> +		 *
> +		 * If the second port is already active, we can wait
> +		 * until the next context-switch before contemplating
> +		 * new requests. The GPU will be busy and we should be
> +		 * able to resubmit the new ELSP before it idles,
> +		 * avoiding pipeline bubbles (momentary pauses where
> +		 * the driver is unable to keep up the supply of new
> +		 * work). However, we have to double check that the
> +		 * priorities of the ports haven't been switched.
> +		 */
> +		if (port_count(&port[1]))
> +			goto unlock;
> +
> +		/*
> +		 * WaIdleLiteRestore:bdw,skl
> +		 * Apply the wa NOOPs to prevent
> +		 * ring:HEAD == rq:TAIL as we resubmit the
> +		 * request. See gen8_emit_breadcrumb() for
> +		 * where we prepare the padding after the
> +		 * end of the request.
> +		 */
> +		last->tail = last->wa_tail;
>  	}
>  
> -	do {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	while (rb) {
> +		struct i915_priolist *p = to_priolist(rb);
>  		struct i915_request *rq, *rn;
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
> @@ -628,8 +639,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		INIT_LIST_HEAD(&p->requests);
>  		if (p->priority != I915_PRIORITY_NORMAL)
>  			kmem_cache_free(engine->i915->priorities, p);
> -	} while (rb);
> +	}
>  done:
> +	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
>  	execlists->first = rb;
>  	if (submit)
>  		port_assign(port, last);
> @@ -690,7 +702,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  	/* Flush the queued requests to the timeline list (for retiring). */
>  	rb = execlists->first;
>  	while (rb) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +		struct i915_priolist *p = to_priolist(rb);
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
>  			INIT_LIST_HEAD(&rq->priotree.link);
> @@ -708,7 +720,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  
>  	/* Remaining _unready_ requests will be nop'ed when submitted */
>  
> -
> +	execlists->queue_priority = INT_MIN;
>  	execlists->queue = RB_ROOT;
>  	execlists->first = NULL;
>  	GEM_BUG_ON(port_isset(execlists->port));
> @@ -864,10 +876,11 @@ static void execlists_submission_tasklet(unsigned long data)
>  							EXECLISTS_ACTIVE_USER));
>  
>  			rq = port_unpack(port, &count);
> -			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
> +			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
>  				  engine->name,
>  				  port->context_id, count,
> -				  rq ? rq->global_seqno : 0);
> +				  rq ? rq->global_seqno : 0,
> +				  rq ? rq_prio(rq) : 0);
>  
>  			/* Check the context/desc id for this event matches */
>  			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> @@ -912,15 +925,19 @@ static void execlists_submission_tasklet(unsigned long data)
>  		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
>  }
>  
> -static void insert_request(struct intel_engine_cs *engine,
> -			   struct i915_priotree *pt,
> -			   int prio)
> +static void queue_request(struct intel_engine_cs *engine,
> +			  struct i915_priotree *pt,
> +			  int prio)
>  {
> -	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
> +	list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
> +}
>  
> -	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
> -	if (ptr_unmask_bits(p, 1))
> +static void submit_queue(struct intel_engine_cs *engine, int prio)
> +{
> +	if (prio > engine->execlists.queue_priority) {
> +		engine->execlists.queue_priority = prio;
>  		tasklet_hi_schedule(&engine->execlists.tasklet);
> +	}
>  }
>  
>  static void execlists_submit_request(struct i915_request *request)
> @@ -931,7 +948,8 @@ static void execlists_submit_request(struct i915_request *request)
>  	/* Will be called from irq-context when using foreign fences. */
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
>  
> -	insert_request(engine, &request->priotree, request->priotree.priority);
> +	queue_request(engine, &request->priotree, rq_prio(request));
> +	submit_queue(engine, rq_prio(request));
>  
>  	GEM_BUG_ON(!engine->execlists.first);
>  	GEM_BUG_ON(list_empty(&request->priotree.link));
> @@ -987,7 +1005,7 @@ static void execlists_schedule(struct i915_request *request, int prio)
>  	 * static void update_priorities(struct i915_priotree *pt, prio) {
>  	 *	list_for_each_entry(dep, &pt->signalers_list, signal_link)
>  	 *		update_priorities(dep->signal, prio)
> -	 *	insert_request(pt);
> +	 *	queue_request(pt);
>  	 * }
>  	 * but that may have unlimited recursion depth and so runs a very
>  	 * real risk of overrunning the kernel stack. Instead, we build
> @@ -1050,8 +1068,9 @@ static void execlists_schedule(struct i915_request *request, int prio)
>  		pt->priority = prio;
>  		if (!list_empty(&pt->link)) {
>  			__list_del_entry(&pt->link);
> -			insert_request(engine, pt, prio);
> +			queue_request(engine, pt, prio);
>  		}
> +		submit_queue(engine, prio);
>  	}
>  
>  	spin_unlock_irq(&engine->timeline->lock);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index a9b83bf7e837..c4e9022b34e3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -257,6 +257,11 @@ struct intel_engine_execlists {
>  	 */
>  	unsigned int port_mask;
>  
> +	/**
> +	 * @queue_priority: Highest pending priority.
> +	 */
> +	int queue_priority;
> +
>  	/**
>  	 * @queue: queue of requests, in priority lists
>  	 */
> -- 
> 2.16.1

* Re: [PATCH] drm/i915/preemption: Allow preemption between submission ports
  2018-02-23 15:18   ` Mika Kuoppala
@ 2018-02-23 16:41     ` Chris Wilson
  0 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2018-02-23 16:41 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-02-23 15:18:17)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Sometimes we need to boost the priority of an in-flight request, which
> > may lead to the situation where the second submission port then contains
> > a higher priority context than the first and so we need to inject a
> > preemption event. To do so we must always check inside
> > execlists_dequeue() whether there is a priority inversion between the
> > ports themselves as well as the head of the priority sorted queue, and we
> > cannot just skip dequeuing if the queue is empty.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > Cc: Michel Thierry <michel.thierry@intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

Added detail to @queue_priority and pushed. Thanks for the review!
-Chris

end of thread, other threads:[~2018-02-23 16:42 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-22 14:11 [PATCH] drm/i915/preemption: Allow preemption between submission ports Chris Wilson
2018-02-22 14:22 ` Chris Wilson
2018-02-22 14:36   ` Chris Wilson
2018-02-23 14:06   ` Mika Kuoppala
2018-02-23 14:23     ` Chris Wilson
2018-02-23 15:18   ` Mika Kuoppala
2018-02-23 16:41     ` Chris Wilson
2018-02-22 14:59 ` ✓ Fi.CI.BAT: success for drm/i915/preemption: Allow preemption between submission ports (rev2) Patchwork
2018-02-22 19:48 ` ✓ Fi.CI.IGT: " Patchwork
2018-02-23 13:35 ` [PATCH] drm/i915/preemption: Allow preemption between submission ports Chris Wilson
