[PATCH 2/5] drm/i915/guc: Submit GuC workitems containing coalesced requests

From: "Michał Winiarski" <michal.winiarski@intel.com>
To: intel-gfx@lists.freedesktop.org
Subject: [PATCH 2/5] drm/i915/guc: Submit GuC workitems containing coalesced requests
Date: Tue, 12 Sep 2017 19:40:53 +0200	[thread overview]
Message-ID: <20170912174056.20139-2-michal.winiarski@intel.com> (raw)
In-Reply-To: <20170912174056.20139-1-michal.winiarski@intel.com>

To create an upper bound on number of GuC workitems, we need to change
the way that requests are being submitted. Rather than submitting each
request as an individual workitem, we can do coalescing in a similar way
we're handlig execlist submission ports. We also need to stop pretending
that we're doing "lite-restore" in GuC submission (we would create a
workitem each time we hit this condition). This allows us to completely
remove the reservation, replacing it with a compile time check.

v2: Also coalesce when replaying on reset (Daniele)
v3: Consistent wq_resv - per-request (Daniele)
v4: Squash removing wq_resv

References: https://bugs.freedesktop.org/show_bug.cgi?id=101873
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Jeff McGee <jeff.mcgee@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Oscar Mateo <oscar.mateo@intel.com>
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |   2 -
 drivers/gpu/drm/i915/i915_guc_submission.c | 179 ++++++++++-------------------
 drivers/gpu/drm/i915/intel_lrc.c           |  25 +---
 drivers/gpu/drm/i915/intel_uc.h            |  11 --
 4 files changed, 62 insertions(+), 155 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6338018f655d..f5fd00cfb3b0 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2450,8 +2450,6 @@ static void i915_guc_client_info(struct seq_file *m,
 	seq_printf(m, "\tWQ size %d, offset: 0x%x, tail %d\n",
 		client->wq_size, client->wq_offset, client->wq_tail);
 
-	seq_printf(m, "\tWork queue full: %u\n", client->no_wq_space);
-
 	for_each_engine(engine, dev_priv, id) {
 		u64 submissions = client->submissions[id];
 		tot += submissions;
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 8a550785b257..6f0adcd2a058 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -406,63 +406,6 @@ static void guc_stage_desc_fini(struct intel_guc *guc,
 	memset(desc, 0, sizeof(*desc));
 }
 
-/**
- * i915_guc_wq_reserve() - reserve space in the GuC's workqueue
- * @request:	request associated with the commands
- *
- * Return:	0 if space is available
- *		-EAGAIN if space is not currently available
- *
- * This function must be called (and must return 0) before a request
- * is submitted to the GuC via i915_guc_submit() below. Once a result
- * of 0 has been returned, it must be balanced by a corresponding
- * call to submit().
- *
- * Reservation allows the caller to determine in advance that space
- * will be available for the next submission before committing resources
- * to it, and helps avoid late failures with complicated recovery paths.
- */
-int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
-{
-	const size_t wqi_size = sizeof(struct guc_wq_item);
-	struct i915_guc_client *client = request->i915->guc.execbuf_client;
-	struct guc_process_desc *desc = __get_process_desc(client);
-	u32 freespace;
-	int ret;
-
-	spin_lock_irq(&client->wq_lock);
-	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
-	freespace -= client->wq_rsvd;
-	if (likely(freespace >= wqi_size)) {
-		client->wq_rsvd += wqi_size;
-		ret = 0;
-	} else {
-		client->no_wq_space++;
-		ret = -EAGAIN;
-	}
-	spin_unlock_irq(&client->wq_lock);
-
-	return ret;
-}
-
-static void guc_client_update_wq_rsvd(struct i915_guc_client *client, int size)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&client->wq_lock, flags);
-	client->wq_rsvd += size;
-	spin_unlock_irqrestore(&client->wq_lock, flags);
-}
-
-void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
-{
-	const int wqi_size = sizeof(struct guc_wq_item);
-	struct i915_guc_client *client = request->i915->guc.execbuf_client;
-
-	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
-	guc_client_update_wq_rsvd(client, -wqi_size);
-}
-
 /* Construct a Work Item and append it to the GuC's Work Queue */
 static void guc_wq_item_append(struct i915_guc_client *client,
 			       struct drm_i915_gem_request *rq)
@@ -475,7 +418,7 @@ static void guc_wq_item_append(struct i915_guc_client *client,
 	struct guc_wq_item *wqi;
 	u32 freespace, tail, wq_off;
 
-	/* Free space is guaranteed, see i915_guc_wq_reserve() above */
+	/* Free space is guaranteed */
 	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
 	GEM_BUG_ON(freespace < wqi_size);
 
@@ -491,14 +434,12 @@ static void guc_wq_item_append(struct i915_guc_client *client,
 	 * workqueue buffer dw by dw.
 	 */
 	BUILD_BUG_ON(wqi_size != 16);
-	GEM_BUG_ON(client->wq_rsvd < wqi_size);
 
 	/* postincrement WQ tail for next time */
 	wq_off = client->wq_tail;
 	GEM_BUG_ON(wq_off & (wqi_size - 1));
 	client->wq_tail += wqi_size;
 	client->wq_tail &= client->wq_size - 1;
-	client->wq_rsvd -= wqi_size;
 
 	/* WQ starts from the page after doorbell / process_desc */
 	wqi = client->vaddr + wq_off + GUC_DB_SIZE;
@@ -580,47 +521,43 @@ static int guc_ring_doorbell(struct i915_guc_client *client)
 }
 
 /**
- * __i915_guc_submit() - Submit commands through GuC
+ * i915_guc_submit() - Submit commands through GuC
  * @rq:		request associated with the commands
  *
- * The caller must have already called i915_guc_wq_reserve() above with
- * a result of 0 (success), guaranteeing that there is space in the work
- * queue for the new request, so enqueuing the item cannot fail.
- *
- * Bad Things Will Happen if the caller violates this protocol e.g. calls
- * submit() when _reserve() says there's no space, or calls _submit()
- * a different number of times from (successful) calls to _reserve().
- *
  * The only error here arises if the doorbell hardware isn't functioning
  * as expected, which really shouln't happen.
  */
-static void __i915_guc_submit(struct drm_i915_gem_request *rq)
+static void i915_guc_submit(struct intel_engine_cs *engine)
 {
-	struct drm_i915_private *dev_priv = rq->i915;
-	struct intel_engine_cs *engine = rq->engine;
-	unsigned int engine_id = engine->id;
-	struct intel_guc *guc = &rq->i915->guc;
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct intel_guc *guc = &dev_priv->guc;
 	struct i915_guc_client *client = guc->execbuf_client;
+	struct execlist_port *port = engine->execlist_port;
+	unsigned int engine_id = engine->id;
+	unsigned int n;
 	unsigned long flags;
 
-	/* WA to flush out the pending GMADR writes to ring buffer. */
-	if (i915_vma_is_map_and_fenceable(rq->ring->vma))
-		POSTING_READ_FW(GUC_STATUS);
+	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
+		struct drm_i915_gem_request *rq;
+		unsigned int count;
 
-	spin_lock_irqsave(&client->wq_lock, flags);
+		rq = port_unpack(&port[n], &count);
+		if (rq && count == 0) {
+			port_set(&port[n], port_pack(rq, ++count));
 
-	guc_wq_item_append(client, rq);
-	WARN_ON(guc_ring_doorbell(client));
+			if (i915_vma_is_map_and_fenceable(rq->ring->vma))
+				POSTING_READ_FW(GUC_STATUS);
 
-	client->submissions[engine_id] += 1;
+			spin_lock_irqsave(&client->wq_lock, flags);
 
-	spin_unlock_irqrestore(&client->wq_lock, flags);
-}
+			guc_wq_item_append(client, rq);
+			WARN_ON(guc_ring_doorbell(client));
 
-static void i915_guc_submit(struct drm_i915_gem_request *rq)
-{
-	__i915_gem_request_submit(rq);
-	__i915_guc_submit(rq);
+			client->submissions[engine_id] += 1;
+
+			spin_unlock_irqrestore(&client->wq_lock, flags);
+		}
+	}
 }
 
 static void nested_enable_signaling(struct drm_i915_gem_request *rq)
@@ -654,16 +591,19 @@ static void port_assign(struct execlist_port *port,
 	if (port_isset(port))
 		i915_gem_request_put(port_request(port));
 
-	port_set(port, i915_gem_request_get(rq));
+	port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
 	nested_enable_signaling(rq);
 }
 
-static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+static void i915_guc_dequeue(struct intel_engine_cs *engine)
 {
 	struct execlist_port *port = engine->execlist_port;
-	struct drm_i915_gem_request *last = port_request(port);
-	struct rb_node *rb;
+	struct drm_i915_gem_request *last = NULL;
 	bool submit = false;
+	struct rb_node *rb;
+
+	if (port_isset(port))
+		port++;
 
 	spin_lock_irq(&engine->timeline->lock);
 	rb = engine->execlist_first;
@@ -688,7 +628,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
 			INIT_LIST_HEAD(&rq->priotree.link);
 			rq->priotree.priority = INT_MAX;
 
-			i915_guc_submit(rq);
+			__i915_gem_request_submit(rq);
 			trace_i915_gem_request_in(rq, port_index(port, engine));
 			last = rq;
 			submit = true;
@@ -702,11 +642,11 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
 	}
 done:
 	engine->execlist_first = rb;
-	if (submit)
+	if (submit) {
 		port_assign(port, last);
+		i915_guc_submit(engine);
+	}
 	spin_unlock_irq(&engine->timeline->lock);
-
-	return submit;
 }
 
 static void i915_guc_irq_handler(unsigned long data)
@@ -714,24 +654,20 @@ static void i915_guc_irq_handler(unsigned long data)
 	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
 	struct execlist_port *port = engine->execlist_port;
 	struct drm_i915_gem_request *rq;
-	bool submit;
 
-	do {
-		rq = port_request(&port[0]);
-		while (rq && i915_gem_request_completed(rq)) {
-			trace_i915_gem_request_out(rq);
-			i915_gem_request_put(rq);
+	rq = port_request(&port[0]);
+	while (rq && i915_gem_request_completed(rq)) {
+		trace_i915_gem_request_out(rq);
+		i915_gem_request_put(rq);
 
-			port[0] = port[1];
-			memset(&port[1], 0, sizeof(port[1]));
+		port[0] = port[1];
+		memset(&port[1], 0, sizeof(port[1]));
 
-			rq = port_request(&port[0]);
-		}
+		rq = port_request(&port[0]);
+	}
 
-		submit = false;
-		if (!port_count(&port[1]))
-			submit = i915_guc_dequeue(engine);
-	} while (submit);
+	if (!port_isset(&port[1]))
+		i915_guc_dequeue(engine);
 }
 
 /*
@@ -1220,6 +1156,19 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	enum intel_engine_id id;
 	int err;
 
+	/*
+	 * We're using GuC work items for submitting work through GuC. Since
+	 * we're coalescing multiple requests from a single context into a
+	 * single work item prior to assigning it to execlist_port, we can
+	 * never have more work items than the total number of ports (for all
+	 * engines). The GuC firmware is controlling the HEAD of work queue,
+	 * and it is guaranteed that it will remove the work item from the
+	 * queue before our request is completed.
+	 */
+	BUILD_BUG_ON(ARRAY_SIZE(engine->execlist_port) *
+		     sizeof(struct guc_wq_item) *
+		     I915_NUM_ENGINES > GUC_WQ_SIZE);
+
 	if (!client) {
 		client = guc_client_alloc(dev_priv,
 					  INTEL_INFO(dev_priv)->ring_mask,
@@ -1247,9 +1196,6 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	guc_interrupts_capture(dev_priv);
 
 	for_each_engine(engine, dev_priv, id) {
-		const int wqi_size = sizeof(struct guc_wq_item);
-		struct drm_i915_gem_request *rq;
-
 		/* The tasklet was initialised by execlists, and may be in
 		 * a state of flux (across a reset) and so we just want to
 		 * take over the callback without changing any other state
@@ -1257,14 +1203,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 		 */
 		engine->irq_tasklet.func = i915_guc_irq_handler;
 		clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-
-		/* Replay the current set of previously submitted requests */
-		spin_lock_irq(&engine->timeline->lock);
-		list_for_each_entry(rq, &engine->timeline->requests, link) {
-			guc_client_update_wq_rsvd(client, wqi_size);
-			__i915_guc_submit(rq);
-		}
-		spin_unlock_irq(&engine->timeline->lock);
+		i915_guc_submit(engine);
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d89e1b8e1cc5..5837b33f9705 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -914,27 +914,14 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request)
 	 */
 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
 
-	if (i915.enable_guc_submission) {
-		/*
-		 * Check that the GuC has space for the request before
-		 * going any further, as the i915_add_request() call
-		 * later on mustn't fail ...
-		 */
-		ret = i915_guc_wq_reserve(request);
-		if (ret)
-			goto err;
-	}
-
 	cs = intel_ring_begin(request, 0);
-	if (IS_ERR(cs)) {
-		ret = PTR_ERR(cs);
-		goto err_unreserve;
-	}
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
 
 	if (!ce->initialised) {
 		ret = engine->init_context(request);
 		if (ret)
-			goto err_unreserve;
+			return ret;
 
 		ce->initialised = true;
 	}
@@ -948,12 +935,6 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request)
 
 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
 	return 0;
-
-err_unreserve:
-	if (i915.enable_guc_submission)
-		i915_guc_wq_unreserve(request);
-err:
-	return ret;
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/intel_uc.h b/drivers/gpu/drm/i915/intel_uc.h
index 69daf4c01cd0..d41051688221 100644
--- a/drivers/gpu/drm/i915/intel_uc.h
+++ b/drivers/gpu/drm/i915/intel_uc.h
@@ -52,13 +52,6 @@ struct drm_i915_gem_request;
  * GuC). The subsequent  pages of the client object constitute the work
  * queue (a circular array of work items), again described in the process
  * descriptor. Work queue pages are mapped momentarily as required.
- *
- * We also keep a few statistics on failures. Ideally, these should all
- * be zero!
- *   no_wq_space: times that the submission pre-check found no space was
- *                available in the work queue (note, the queue is shared,
- *                not per-engine). It is OK for this to be nonzero, but
- *                it should not be huge!
  */
 struct i915_guc_client {
 	struct i915_vma *vma;
@@ -79,8 +72,6 @@ struct i915_guc_client {
 	uint32_t wq_offset;
 	uint32_t wq_size;
 	uint32_t wq_tail;
-	uint32_t wq_rsvd;
-	uint32_t no_wq_space;
 
 	/* Per-engine counts of GuC submissions */
 	uint64_t submissions[I915_NUM_ENGINES];
@@ -246,8 +237,6 @@ u32 intel_guc_wopcm_size(struct drm_i915_private *dev_priv);
 /* i915_guc_submission.c */
 int i915_guc_submission_init(struct drm_i915_private *dev_priv);
 int i915_guc_submission_enable(struct drm_i915_private *dev_priv);
-int i915_guc_wq_reserve(struct drm_i915_gem_request *rq);
-void i915_guc_wq_unreserve(struct drm_i915_gem_request *request);
 void i915_guc_submission_disable(struct drm_i915_private *dev_priv);
 void i915_guc_submission_fini(struct drm_i915_private *dev_priv);
 struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size);
-- 
2.13.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx