From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Subject: [PATCH 089/190] drm/i915: Tidy execlists submission and tracking
Date: Mon, 11 Jan 2016 10:44:33 +0000
Message-ID: <1452509174-16671-3-git-send-email-chris@chris-wilson.co.uk>
In-Reply-To: <1452509174-16671-1-git-send-email-chris@chris-wilson.co.uk>

Besides dramatically simplifying the submission code (requests ftw), we
can reduce the execlist spinlock duration and, importantly, avoid having
to hold it across the context-switch register reads. The requests
occupying the two ELSP ports are now tracked directly in
engine->execlist_port[], and completed requests are moved onto a
separate execlist_completed list so that retirement happens outside the
spinlock.
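
As a rough illustration of where this lands us, a minimal userspace
sketch of the port tracking, with simplified stand-in types (the
execlist_port[] pair, the submission queue and the same-context pairing
rule mirror the patch; everything else is illustrative):

struct request {
	int ctx_id;		/* owning context */
	unsigned int seqno;	/* breadcrumb written on completion */
	struct request *next;	/* stand-in for execlist_link */
};

struct engine {
	struct request *queue;		/* submitted, not yet completed */
	struct request *port[2];	/* the two ELSP submission ports */
};

/* Fill both ports from the queue. A later request from the same
 * context overwrites the earlier one in its port, as its tail
 * subsumes the earlier workload.
 */
static int unqueue(struct engine *e)
{
	struct request *rq;
	int port = 0, submit = 0;

	if (e->port[0])
		return 0;	/* resubmission (wa_tail) handling omitted */

	for (rq = e->queue; rq; rq = rq->next) {
		if (e->port[port] && rq->ctx_id != e->port[port]->ctx_id) {
			if (++port == 2)
				break;
		}
		e->port[port] = rq;
		submit = 1;
	}

	return submit;	/* caller then writes both ELSP descriptors */
}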

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |  20 +-
 drivers/gpu/drm/i915/i915_gem.c            |   8 +-
 drivers/gpu/drm/i915/i915_gem_request.h    |  21 +-
 drivers/gpu/drm/i915/i915_guc_submission.c |  31 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 505 +++++++++++------------------
 drivers/gpu/drm/i915/intel_lrc.h           |   3 -
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   8 +-
 7 files changed, 209 insertions(+), 387 deletions(-)
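
Sketching the completion side in the same simplified model as above:
the context-switch kthread pulls the last completed seqno out of the
status buffer, then everything from the head of the queue up to and
including the request in port[0] moves onto a completed list for later
retirement, and the ports shift down. The real
execlists_complete_requests() below splices list_heads under
execlist_lock; this is only a rough model of it.

static int seqno_passed(unsigned int a, unsigned int b)
{
	return (int)(a - b) >= 0;	/* wrap-safe, cf. i915_seqno_passed() */
}

static void complete_requests(struct engine *e, unsigned int seqno,
			      struct request **completed)
{
	while (e->port[0] && seqno_passed(seqno, e->port[0]->seqno)) {
		struct request *last = e->port[0];

		/* detach queue head..last, push onto the completed list
		 * (retirement order is not preserved in this sketch)
		 */
		while (e->queue != last->next) {
			struct request *rq = e->queue;

			e->queue = rq->next;
			rq->next = *completed;
			*completed = rq;
		}

		/* the hardware has finished this port and moves on */
		e->port[0] = e->port[1];
		e->port[1] = NULL;
	}
}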

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 15a6fddfb79b..a5ea90944bbb 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2005,8 +2005,7 @@ static void i915_dump_lrc_obj(struct seq_file *m,
 		return;
 	}
 
-	seq_printf(m, "CONTEXT: %s %u\n", ring->name,
-		   intel_execlists_ctx_id(ctx_obj));
+	seq_printf(m, "CONTEXT: %s\n", ring->name);
 
 	if (!i915_gem_obj_ggtt_bound(ctx_obj))
 		seq_puts(m, "\tNot bound in GGTT\n");
@@ -2092,7 +2091,6 @@ static int i915_execlists(struct seq_file *m, void *data)
 	intel_runtime_pm_get(dev_priv);
 
 	for_each_ring(ring, dev_priv, ring_id) {
-		struct drm_i915_gem_request *head_req = NULL;
 		int count = 0;
 
 		seq_printf(m, "%s\n", ring->name);
@@ -2105,8 +2103,8 @@ static int i915_execlists(struct seq_file *m, void *data)
 		status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
 		seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
 
-		read_pointer = ring->next_context_status_buffer;
-		write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
+		read_pointer = (status_pointer >> 8) & GEN8_CSB_PTR_MASK;
+		write_pointer = status_pointer & GEN8_CSB_PTR_MASK;
 		if (read_pointer > write_pointer)
 			write_pointer += GEN8_CSB_ENTRIES;
 		seq_printf(m, "\tRead pointer: 0x%08X, write pointer 0x%08X\n",
@@ -2123,21 +2121,9 @@ static int i915_execlists(struct seq_file *m, void *data)
 		spin_lock(&ring->execlist_lock);
 		list_for_each(cursor, &ring->execlist_queue)
 			count++;
-		head_req = list_first_entry_or_null(&ring->execlist_queue,
-				struct drm_i915_gem_request, execlist_link);
 		spin_unlock(&ring->execlist_lock);
 
 		seq_printf(m, "\t%d requests in queue\n", count);
-		if (head_req) {
-			struct drm_i915_gem_object *ctx_obj;
-
-			ctx_obj = head_req->ctx->engine[ring_id].state;
-			seq_printf(m, "\tHead request id: %u\n",
-				   intel_execlists_ctx_id(ctx_obj));
-			seq_printf(m, "\tHead request tail: %u\n",
-				   head_req->tail);
-		}
-
 		seq_putc(m, '\n');
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index eb875ecd7907..054e11cff00f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2193,12 +2193,12 @@ static void i915_gem_reset_ring_cleanup(struct intel_engine_cs *engine)
 
 	if (i915.enable_execlists) {
 		spin_lock(&engine->execlist_lock);
-
-		/* list_splice_tail_init checks for empty lists */
 		list_splice_tail_init(&engine->execlist_queue,
-				      &engine->execlist_retired_req_list);
-
+				      &engine->execlist_completed);
+		memset(&engine->execlist_port, 0,
+		       sizeof(engine->execlist_port));
 		spin_unlock(&engine->execlist_lock);
+
 		intel_execlists_retire_requests(engine);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 59957d5edfdb..c2e83584f8a2 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -63,10 +63,11 @@ struct drm_i915_gem_request {
 	 * This is required to calculate the maximum available ringbuffer
 	 * space without overwriting the postfix.
 	 */
-	 u32 postfix;
+	u32 postfix;
 
 	/** Position in the ringbuffer of the end of the whole request */
 	u32 tail;
+	u32 wa_tail;
 
 	/**
 	 * Context and ring buffer related to this request
@@ -99,24 +100,8 @@ struct drm_i915_gem_request {
 	/** process identifier submitting this request */
 	struct pid *pid;
 
-	/**
-	 * The ELSP only accepts two elements at a time, so we queue
-	 * context/tail pairs on a given queue (ring->execlist_queue) until the
-	 * hardware is available. The queue serves a double purpose: we also use
-	 * it to keep track of the up to 2 contexts currently in the hardware
-	 * (usually one in execution and the other queued up by the GPU): We
-	 * only remove elements from the head of the queue when the hardware
-	 * informs us that an element has been completed.
-	 *
-	 * All accesses to the queue are mediated by a spinlock
-	 * (ring->execlist_lock).
-	 */
-
 	/** Execlist link in the submission queue.*/
-	struct list_head execlist_link;
-
-	/** Execlists no. of times this request has been sent to the ELSP */
-	int elsp_submitted;
+	struct list_head execlist_link; /* guarded by engine->execlist_lock */
 };
 
 struct drm_i915_gem_request *
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 5a6251926367..f4e09952d52c 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -393,7 +393,6 @@ static void guc_init_ctx_desc(struct intel_guc *guc,
 		struct intel_ring *ring = ctx->engine[i].ring;
 		struct intel_engine_cs *engine;
 		struct drm_i915_gem_object *obj;
-		uint64_t ctx_desc;
 
 		/* TODO: We have a design issue to be solved here. Only when we
 		 * receive the first batch, we know which engine is used by the
@@ -407,8 +406,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc,
 			break;	/* XXX: continue? */
 
 		engine = ring->engine;
-		ctx_desc = intel_lr_context_descriptor(ctx, engine);
-		lrc->context_desc = (u32)ctx_desc;
+		lrc->context_desc = engine->execlist_context_descriptor;
 
 		/* The state page is after PPHWSP */
 		lrc->ring_lcra = i915_gem_obj_ggtt_offset(obj) +
@@ -548,7 +546,7 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc,
 			WQ_NO_WCFLUSH_WAIT;
 
 	/* The GuC wants only the low-order word of the context descriptor */
-	wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, rq->engine);
+	wqi->context_desc = rq->engine->execlist_context_descriptor;
 
 	/* The GuC firmware wants the tail index in QWords, not bytes */
 	tail = rq->ring->tail >> 3;
@@ -562,27 +560,6 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc,
 
 #define CTX_RING_BUFFER_START		0x08
 
-/* Update the ringbuffer pointer in a saved context image */
-static void lr_context_update(struct drm_i915_gem_request *rq)
-{
-	enum intel_engine_id ring_id = rq->engine->id;
-	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring_id].state;
-	struct drm_i915_gem_object *rb_obj = rq->ring->obj;
-	struct page *page;
-	uint32_t *reg_state;
-
-	BUG_ON(!ctx_obj);
-	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
-	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
-
-	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
-	reg_state = kmap_atomic(page);
-
-	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
-
-	kunmap_atomic(reg_state);
-}
-
 /**
  * i915_guc_submit() - Submit commands through GuC
  * @client:	the guc client where commands will go through
@@ -597,10 +574,6 @@ int i915_guc_submit(struct i915_guc_client *client,
 	enum intel_engine_id ring_id = rq->engine->id;
 	int q_ret, b_ret;
 
-	/* Need this because of the deferred pin ctx and ring */
-	/* Shall we move this right after ring is pinned? */
-	lr_context_update(rq);
-
 	q_ret = guc_add_workqueue_item(client, rq);
 	if (q_ret == 0)
 		b_ret = guc_ring_doorbell(client);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index de5889e95d6d..80b346a3fd8a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -265,233 +265,133 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
 	return 0;
 }
 
-/**
- * intel_execlists_ctx_id() - get the Execlists Context ID
- * @ctx_obj: Logical Ring Context backing object.
- *
- * Do not confuse with ctx->id! Unfortunately we have a name overload
- * here: the old context ID we pass to userspace as a handler so that
- * they can refer to a context, and the new context ID we pass to the
- * ELSP so that the GPU can inform us of the context status via
- * interrupts.
- *
- * Return: 20-bits globally unique context ID.
- */
-u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
-{
-	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
-			LRC_PPHWSP_PN * PAGE_SIZE;
-
-	/* LRCA is required to be 4K aligned so the more significant 20 bits
-	 * are globally unique */
-	return lrca >> 12;
-}
-
-static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
-{
-	return (IS_SKL_REVID(ring->dev, 0, SKL_REVID_B0) ||
-		IS_BXT_REVID(ring->dev, 0, BXT_REVID_A1)) &&
-		(ring->id == VCS || ring->id == VCS2);
-}
-
-uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
-				     struct intel_engine_cs *ring)
+static u32 execlists_request_write_tail(struct drm_i915_gem_request *req)
 {
-	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
-	uint64_t desc;
-	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
-			LRC_PPHWSP_PN * PAGE_SIZE;
-
-	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
-
-	desc = GEN8_CTX_VALID;
-	desc |= GEN8_CTX_ADDRESSING_MODE(ring->i915) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
-	if (IS_GEN8(ring->i915))
-		desc |= GEN8_CTX_L3LLC_COHERENT;
-	desc |= GEN8_CTX_PRIVILEGE;
-	desc |= lrca;
-	desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
-
-	/* TODO: WaDisableLiteRestore when we start using semaphore
-	 * signalling between Command Streamers */
-	/* desc |= GEN8_CTX_FORCE_RESTORE; */
+	struct intel_ring *ring = req->ring;
+	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
 
-	/* WaEnableForceRestoreInCtxtDescForVCS:skl */
-	/* WaEnableForceRestoreInCtxtDescForVCS:bxt */
-	if (disable_lite_restore_wa(ring))
-		desc |= GEN8_CTX_FORCE_RESTORE;
+	if (ppgtt && !USES_FULL_48BIT_PPGTT(req->i915)) {
+		/* True 32b PPGTT with dynamic page allocation: update PDP
+		 * registers and point the unallocated PDPs to scratch page.
+		 * PML4 is allocated during ppgtt init, so this is not needed
+		 * in 48-bit mode.
+		 */
+		if (ppgtt->pd_dirty_rings & intel_engine_flag(req->engine)) {
+			ASSIGN_CTX_PDP(ppgtt, ring->registers, 3);
+			ASSIGN_CTX_PDP(ppgtt, ring->registers, 2);
+			ASSIGN_CTX_PDP(ppgtt, ring->registers, 1);
+			ASSIGN_CTX_PDP(ppgtt, ring->registers, 0);
+			ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
+		}
+	}
 
-	return desc;
+	ring->registers[CTX_RING_TAIL+1] = req->tail;
+	return ring->context_descriptor;
 }
 
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
-				 struct drm_i915_gem_request *rq1)
+static void execlists_submit_pair(struct intel_engine_cs *ring)
 {
+	struct drm_i915_private *dev_priv = ring->i915;
+	uint32_t desc[4];
 
-	struct intel_engine_cs *engine = rq0->engine;
-	struct drm_i915_private *dev_priv = rq0->i915;
-	uint64_t desc[2];
-
-	if (rq1) {
-		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
-		rq1->elsp_submitted++;
-	} else {
-		desc[1] = 0;
-	}
+	if (ring->execlist_port[1]) {
+		desc[0] = execlists_request_write_tail(ring->execlist_port[1]);
+		desc[1] = ring->execlist_port[1]->fence.seqno;
+	} else
+		desc[1] = desc[0] = 0;
 
-	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
-	rq0->elsp_submitted++;
+	desc[2] = execlists_request_write_tail(ring->execlist_port[0]);
+	desc[3] = ring->execlist_port[0]->fence.seqno;
 
-	/* You must always write both descriptors in the order below. */
-	spin_lock_irq(&dev_priv->uncore.lock);
-	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
+	/* Note: You must always write both descriptors in the order below. */
+	I915_WRITE_FW(RING_ELSP(ring), desc[1]);
+	I915_WRITE_FW(RING_ELSP(ring), desc[0]);
+	I915_WRITE_FW(RING_ELSP(ring), desc[3]);
 
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
 	/* The context is automatically loaded after the following */
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
-
-	/* ELSP is a wo register, use another nearby reg for posting */
-	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
-	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
-	spin_unlock_irq(&dev_priv->uncore.lock);
+	I915_WRITE_FW(RING_ELSP(ring), desc[2]);
 }
 
-static int execlists_update_context(struct drm_i915_gem_request *rq)
+static void execlists_context_unqueue(struct intel_engine_cs *engine)
 {
-	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
-	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[rq->engine->id].state;
-	struct drm_i915_gem_object *rb_obj = rq->ring->obj;
-	struct page *page;
-	uint32_t *reg_state;
-
-	BUG_ON(!ctx_obj);
-	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
-	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
-
-	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
-	reg_state = kmap_atomic(page);
+	struct drm_i915_gem_request *cursor;
+	bool submit = false;
+	int port = 0;
 
-	reg_state[CTX_RING_TAIL+1] = rq->tail;
-	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
+	assert_spin_locked(&engine->execlist_lock);
 
-	if (ppgtt && !USES_FULL_48BIT_PPGTT(rq->i915)) {
-		/* True 32b PPGTT with dynamic page allocation: update PDP
-		 * registers and point the unallocated PDPs to scratch page.
-		 * PML4 is allocated during ppgtt init, so this is not needed
-		 * in 48-bit mode.
+	/* Try to read in pairs and fill both submission ports */
+	cursor = engine->execlist_port[port];
+	if (cursor != NULL) {
+		/* WaIdleLiteRestore:bdw,skl
+		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
+		 * as we resubmit the request. See gen8_add_request()
+		 * for where we prepare the padding after the end of the
+		 * request.
 		 */
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
-	}
-
-	kunmap_atomic(reg_state);
-
-	return 0;
-}
+		cursor->tail = cursor->wa_tail;
+		cursor = list_next_entry(cursor, execlist_link);
+	} else
+		cursor = list_first_entry(&engine->execlist_queue,
+					  typeof(*cursor),
+					  execlist_link);
+	while (&cursor->execlist_link != &engine->execlist_queue) {
+		/* Same ctx: ignore earlier request, as the
+		 * second request extends the first.
+		 */
+		if (engine->execlist_port[port] &&
+		    cursor->ctx != engine->execlist_port[port]->ctx) {
+			if (++port == ARRAY_SIZE(engine->execlist_port))
+				break;
+		}
 
-static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
-				      struct drm_i915_gem_request *rq1)
-{
-	execlists_update_context(rq0);
+		engine->execlist_port[port] = cursor;
+		submit = true;
 
-	if (rq1)
-		execlists_update_context(rq1);
+		cursor = list_next_entry(cursor, execlist_link);
+	}
 
-	execlists_elsp_write(rq0, rq1);
+	if (submit)
+		execlists_submit_pair(engine);
 }
 
-static void execlists_context_unqueue(struct intel_engine_cs *engine)
+static bool execlists_complete_requests(struct intel_engine_cs *engine,
+					u32 seqno)
 {
-	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
-	struct drm_i915_gem_request *cursor = NULL, *tmp = NULL;
-
 	assert_spin_locked(&engine->execlist_lock);
 
-	/*
-	 * If irqs are not active generate a warning as batches that finish
-	 * without the irqs may get lost and a GPU Hang may occur.
-	 */
-	WARN_ON(!intel_irqs_enabled(engine->dev->dev_private));
+	do {
+		struct drm_i915_gem_request *req;
 
-	if (list_empty(&engine->execlist_queue))
-		return;
+		req = engine->execlist_port[0];
+		if (req == NULL)
+			break;
 
-	/* Try to read in pairs */
-	list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
-				 execlist_link) {
-		if (!req0) {
-			req0 = cursor;
-		} else if (req0->ctx == cursor->ctx) {
-			/* Same ctx: ignore first request, as second request
-			 * will update tail past first request's workload */
-			cursor->elsp_submitted = req0->elsp_submitted;
-			list_del(&req0->execlist_link);
-			list_add_tail(&req0->execlist_link,
-				&engine->execlist_retired_req_list);
-			req0 = cursor;
-		} else {
-			req1 = cursor;
+		if (!i915_seqno_passed(seqno, req->fence.seqno))
 			break;
-		}
-	}
 
-	if (IS_GEN8(engine->dev) || IS_GEN9(engine->dev)) {
-		/*
-		 * WaIdleLiteRestore: make sure we never cause a lite
-		 * restore with HEAD==TAIL
+		/* Move the completed set of requests from the start of the
+		 * execlist_queue over to the tail of the execlist_completed.
 		 */
-		if (req0->elsp_submitted) {
-			/*
-			 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
-			 * as we resubmit the request. See gen8_add_request()
-			 * for where we prepare the padding after the end of the
-			 * request.
-			 */
-			struct intel_ring *ring;
-
-			ring = req0->ctx->engine[engine->id].ring;
-			req0->tail += 8;
-			req0->tail &= ring->size - 1;
-		}
-	}
-
-	WARN_ON(req1 && req1->elsp_submitted);
+		engine->execlist_completed.prev->next = engine->execlist_queue.next;
+		engine->execlist_completed.prev = &req->execlist_link;
 
-	execlists_submit_requests(req0, req1);
-}
-
-static bool execlists_check_remove_request(struct intel_engine_cs *ring,
-					   u32 request_id)
-{
-	struct drm_i915_gem_request *head_req;
+		engine->execlist_queue.next = req->execlist_link.next;
+		req->execlist_link.next->prev = &engine->execlist_queue;
 
-	assert_spin_locked(&ring->execlist_lock);
+		req->execlist_link.next = &engine->execlist_completed;
 
-	head_req = list_first_entry_or_null(&ring->execlist_queue,
-					    struct drm_i915_gem_request,
-					    execlist_link);
-
-	if (head_req != NULL) {
-		struct drm_i915_gem_object *ctx_obj =
-				head_req->ctx->engine[ring->id].state;
-		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
-			WARN(head_req->elsp_submitted == 0,
-			     "Never submitted head request\n");
-
-			if (--head_req->elsp_submitted <= 0) {
-				list_del(&head_req->execlist_link);
-				list_add_tail(&head_req->execlist_link,
-					&ring->execlist_retired_req_list);
-				return true;
-			}
-		}
-	}
+		/* The hardware has completed the request on this port; it
+		 * will switch to the next.
+		 */
+		engine->execlist_port[0] = engine->execlist_port[1];
+		engine->execlist_port[1] = NULL;
+	} while (1);
 
-	return false;
+	if (engine->execlist_context_descriptor & GEN8_CTX_FORCE_RESTORE)
+		return engine->execlist_port[0] == NULL;
+	else
+		return engine->execlist_port[1] == NULL;
 }
 
 static void set_rtpriority(void)
@@ -504,23 +404,29 @@ static int intel_execlists_submit(void *arg)
 {
 	struct intel_engine_cs *ring = arg;
 	struct drm_i915_private *dev_priv = ring->i915;
+	const i915_reg_t ptrs = RING_CONTEXT_STATUS_PTR(ring);
 
 	set_rtpriority();
 
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 	do {
-		u32 status;
-		u32 status_id;
-		u32 submit_contexts;
 		u8 head, tail;
+		u32 seqno;
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		head = ring->next_context_status_buffer;
-		tail = I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
+		head = tail = 0;
+		if (READ_ONCE(ring->execlist_port[0])) {
+			u32 x = I915_READ_FW(ptrs);
+			head = x >> 8;
+			tail = x;
+		}
 		if (head == tail) {
+			intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 			if (kthread_should_stop())
 				return 0;
 
 			schedule();
+			intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 			continue;
 		}
 		__set_current_state(TASK_RUNNING);
@@ -528,86 +434,46 @@ static int intel_execlists_submit(void *arg)
 		if (head > tail)
 			tail += GEN8_CSB_ENTRIES;
 
-		status = 0;
-		submit_contexts = 0;
-
-		spin_lock(&ring->execlist_lock);
-
+		seqno = 0;
 		while (head++ < tail) {
-			status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
-			status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));
-
-			if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
-				continue;
-
-			if (status & GEN8_CTX_STATUS_PREEMPTED) {
-				if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
-					if (execlists_check_remove_request(ring, status_id))
-						WARN(1, "Lite Restored request removed from queue\n");
-				} else
-					WARN(1, "Preemption without Lite Restore\n");
-			}
-
-			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
-			    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
-				if (execlists_check_remove_request(ring, status_id))
-					submit_contexts++;
+			u32 status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(ring,
+									     head % GEN8_CSB_ENTRIES));
+			if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED && 0)) {
+				DRM_ERROR("Pre-empted request %x %s Lite Restore\n",
+					  I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES)),
+					  status & GEN8_CTX_STATUS_LITE_RESTORE ? "with" : "without");
 			}
+			if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+				      GEN8_CTX_STATUS_ELEMENT_SWITCH))
+				seqno = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring,
+										head % GEN8_CSB_ENTRIES));
 		}
 
-		if (disable_lite_restore_wa(ring)) {
-			/* Prevent a ctx to preempt itself */
-			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
-					(submit_contexts != 0))
+		I915_WRITE_FW(ptrs,
+			      _MASKED_FIELD(GEN8_CSB_PTR_MASK<<8,
+					    (tail % GEN8_CSB_ENTRIES) << 8));
+
+		if (seqno) {
+			spin_lock(&ring->execlist_lock);
+			if (execlists_complete_requests(ring, seqno))
 				execlists_context_unqueue(ring);
-		} else if (submit_contexts != 0) {
-			execlists_context_unqueue(ring);
+			spin_unlock(&ring->execlist_lock);
 		}
-
-		spin_unlock(&ring->execlist_lock);
-
-		WARN(submit_contexts > 2, "More than two context complete events?\n");
-		ring->next_context_status_buffer = tail % GEN8_CSB_ENTRIES;
-		I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
-			   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
-					 ring->next_context_status_buffer<<8));
 	} while (1);
 }
 
 static int execlists_context_queue(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
-	struct drm_i915_gem_request *cursor;
-	int num_elements = 0;
 
 	i915_gem_request_get(request);
 
 	spin_lock(&engine->execlist_lock);
-
-	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
-		if (++num_elements > 2)
-			break;
-
-	if (num_elements > 2) {
-		struct drm_i915_gem_request *tail_req;
-
-		tail_req = list_last_entry(&engine->execlist_queue,
-					   struct drm_i915_gem_request,
-					   execlist_link);
-
-		if (request->ctx == tail_req->ctx) {
-			WARN(tail_req->elsp_submitted != 0,
-				"More than 2 already-submitted reqs queued\n");
-			list_del(&tail_req->execlist_link);
-			list_add_tail(&tail_req->execlist_link,
-				&engine->execlist_retired_req_list);
-		}
-	}
-
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
-	if (num_elements == 0)
-		execlists_context_unqueue(engine);
-
+	if (engine->execlist_port[0] == NULL) {
+		engine->execlist_port[0] = request;
+		execlists_submit_pair(engine);
+	}
 	spin_unlock(&engine->execlist_lock);
 
 	return 0;
@@ -641,56 +507,32 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 	return 0;
 }
 
-/*
- * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload
- * @request: Request to advance the logical ringbuffer of.
- *
- * The tail is updated in our logical ringbuffer struct, not in the actual context. What
- * really happens during submission is that the context and current tail will be placed
- * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
- * point, the tail *inside* the context is updated and the ELSP written to.
- */
-static void
-intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
-{
-	struct drm_i915_private *dev_priv = request->i915;
-
-	intel_ring_advance(request->ring);
-	request->tail = request->ring->tail;
-
-	if (dev_priv->guc.execbuf_client)
-		i915_guc_submit(dev_priv->guc.execbuf_client, request);
-	else
-		execlists_context_queue(request);
-}
-
 bool intel_execlists_retire_requests(struct intel_engine_cs *ring)
 {
 	struct drm_i915_gem_request *req, *tmp;
-	struct list_head retired_list;
+	struct list_head list;
 
-	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
-	if (list_empty(&ring->execlist_retired_req_list))
+	lockdep_assert_held(&ring->dev->struct_mutex);
+	if (list_empty(&ring->execlist_completed))
 		goto out;
 
-	INIT_LIST_HEAD(&retired_list);
 	spin_lock(&ring->execlist_lock);
-	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
+	list_replace_init(&ring->execlist_completed, &list);
 	spin_unlock(&ring->execlist_lock);
 
-	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
+	list_for_each_entry_safe(req, tmp, &list, execlist_link) {
 		struct intel_context *ctx = req->ctx;
 		struct drm_i915_gem_object *ctx_obj =
 				ctx->engine[ring->id].state;
 
 		if (ctx_obj && (ctx != ring->default_context))
 			intel_lr_context_unpin(req);
-		list_del(&req->execlist_link);
+
 		i915_gem_request_put(req);
 	}
 
 out:
-	return list_empty(&ring->execlist_queue);
+	return READ_ONCE(ring->execlist_port[0]) == NULL;
 }
 
 void intel_logical_ring_stop(struct intel_engine_cs *ring)
@@ -720,6 +562,7 @@ static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
 		struct intel_ring *ringbuf)
 {
 	struct drm_i915_private *dev_priv = ring->i915;
+	u32 ggtt_offset;
 	int ret = 0;
 
 	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
@@ -734,6 +577,16 @@ static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
 
 	ctx_obj->dirty = true;
 
+	ggtt_offset =
+		i915_gem_obj_ggtt_offset(ctx_obj) + LRC_PPHWSP_PN * PAGE_SIZE;
+	ringbuf->context_descriptor =
+		ggtt_offset | ring->execlist_context_descriptor;
+
+	ringbuf->registers =
+		kmap(i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN));
+	ringbuf->registers[CTX_RING_BUFFER_START+1] =
+		i915_gem_obj_ggtt_offset(ringbuf->obj);
+
 	/* Invalidate GuC TLB. */
 	if (i915.enable_guc_submission)
 		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
@@ -768,6 +621,7 @@ static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 
 void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
 {
+	struct drm_i915_gem_object *ctx_obj;
 	int engine = rq->engine->id;
 
 	WARN_ON(!mutex_is_locked(&rq->i915->dev->struct_mutex));
@@ -775,7 +629,10 @@ void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
 		return;
 
 	intel_ring_unmap(rq->ring);
-	i915_gem_object_ggtt_unpin(rq->ctx->engine[engine].state);
+
+	ctx_obj = rq->ctx->engine[engine].state;
+	kunmap(i915_gem_object_get_page(ctx_obj, LRC_STATE_PN));
+	i915_gem_object_ggtt_unpin(ctx_obj);
 	i915_gem_context_unreference(rq->ctx);
 }
 
@@ -1168,12 +1025,39 @@ out:
 	return ret;
 }
 
+static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
+{
+	return (IS_SKL_REVID(ring->i915, 0, SKL_REVID_B0) ||
+		IS_BXT_REVID(ring->i915, 0, BXT_REVID_A1)) &&
+		(ring->id == VCS || ring->id == VCS2);
+}
+
+static uint64_t lr_context_descriptor(struct intel_engine_cs *ring)
+{
+	uint64_t desc;
+
+	desc = GEN8_CTX_VALID;
+	desc |= GEN8_CTX_ADDRESSING_MODE(ring->i915) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
+	if (IS_GEN8(ring->i915))
+		desc |= GEN8_CTX_L3LLC_COHERENT;
+	desc |= GEN8_CTX_PRIVILEGE;
+
+	/* TODO: WaDisableLiteRestore when we start using semaphore
+	 * signalling between Command Streamers */
+	/* desc |= GEN8_CTX_FORCE_RESTORE; */
+
+	/* WaEnableForceRestoreInCtxtDescForVCS:skl */
+	/* WaEnableForceRestoreInCtxtDescForVCS:bxt */
+	if (disable_lite_restore_wa(ring))
+		desc |= GEN8_CTX_FORCE_RESTORE;
+
+	return desc;
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u8 next_context_status_buffer_hw;
-
 	lrc_setup_hardware_status_page(ring,
 				ring->default_context->engine[ring->id].state);
 
@@ -1197,18 +1081,6 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
 	 * SKL  |         ?                |         ?            |
 	 * BXT  |         ?                |         ?            |
 	 */
-	next_context_status_buffer_hw =
-		GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(ring)));
-
-	/*
-	 * When the CSB registers are reset (also after power-up / gpu reset),
-	 * CSB write pointer is set to all 1's, which is not valid, use '5' in
-	 * this special case, so the first element read is CSB[0].
-	 */
-	if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
-		next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
-
-	ring->next_context_status_buffer = next_context_status_buffer_hw;
 	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name);
 
 	memset(&ring->hangcheck, 0, sizeof(ring->hangcheck));
@@ -1482,7 +1354,8 @@ static int gen8_add_request(struct drm_i915_gem_request *request)
 	intel_ring_emit(ring, request->fence.seqno);
 	intel_ring_emit(ring, MI_USER_INTERRUPT);
 	intel_ring_emit(ring, MI_NOOP);
-	intel_logical_ring_advance_and_submit(request);
+	intel_ring_advance(ring);
+	request->tail = ring->tail;
 
 	/*
 	 * Here we add two extra NOOPs as padding to avoid
@@ -1491,6 +1364,12 @@ static int gen8_add_request(struct drm_i915_gem_request *request)
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_advance(ring);
+	request->wa_tail = ring->tail;
+
+	if (request->i915->guc.execbuf_client)
+		i915_guc_submit(request->i915->guc.execbuf_client, request);
+	else
+		execlists_context_queue(request);
 
 	return 0;
 }
@@ -1569,9 +1448,11 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 
 	INIT_LIST_HEAD(&ring->buffers);
 	INIT_LIST_HEAD(&ring->execlist_queue);
-	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
+	INIT_LIST_HEAD(&ring->execlist_completed);
 	spin_lock_init(&ring->execlist_lock);
 
+	ring->execlist_context_descriptor = lr_context_descriptor(ring);
+
 	ret = i915_cmd_parser_init_ring(ring);
 	if (ret)
 		goto error;
@@ -1592,8 +1473,6 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 		goto error;
 	}
 
-	ring->next_context_status_buffer =
-			I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
 	task = kthread_run(intel_execlists_submit, ring,
 			   "irq/i915:%de", ring->id);
 	if (IS_ERR(task))
@@ -1904,9 +1783,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 					  CTX_CTRL_RS_CTX_ENABLE));
 	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(ring->mmio_base), 0);
 	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(ring->mmio_base), 0);
-	/* Ring buffer start address is not known until the buffer is pinned.
-	 * It is written to the context image in execlists_update_context()
-	 */
+	/* Ring buffer start address is not known until the buffer is pinned. */
 	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, RING_START(ring->mmio_base), 0);
 	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, RING_CTL(ring->mmio_base),
 		       ((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 33f82a84065a..37601a35d5fc 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -74,12 +74,9 @@ int intel_lr_context_deferred_alloc(struct intel_context *ctx,
 void intel_lr_context_unpin(struct drm_i915_gem_request *req);
 void intel_lr_context_reset(struct drm_device *dev,
 			struct intel_context *ctx);
-uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
-				     struct intel_engine_cs *ring);
 
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);
-u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
 
 bool intel_execlists_retire_requests(struct intel_engine_cs *ring);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index edaf07b2292e..3d4d5711aea9 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -122,6 +122,9 @@ struct intel_ring {
 	 * we can detect new retirements.
 	 */
 	u32 last_retired_head;
+
+	u32 context_descriptor;
+	u32 *registers;
 };
 
 struct	intel_context;
@@ -293,9 +296,10 @@ struct intel_engine_cs {
 	/* Execlists */
 	struct task_struct *execlists_submit;
 	spinlock_t execlist_lock;
+	struct drm_i915_gem_request *execlist_port[2];
 	struct list_head execlist_queue;
-	struct list_head execlist_retired_req_list;
-	u8 next_context_status_buffer;
+	struct list_head execlist_completed;
+	u32 execlist_context_descriptor;
 	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
 
 	/**
-- 
2.7.0.rc3
