* [PATCH 01/18] drm/i915: Add a sw fence for collecting up dma fences
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
@ 2016-08-30 8:17 ` Chris Wilson
2016-08-30 8:17 ` [PATCH 02/18] drm/i915: Only queue requests during execlists submission Chris Wilson
` (17 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:17 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
This is really a core kernel struct in disguise until we can finally
place it in kernel/. There is an immediate need for a fence collection
mechanism that is more flexible than fence-array, in particular being
able to easily drive request submission via events (and not just
interrupt driven). The same mechanism would be useful for handling
nonblocking and asynchronous atomic modesets, parallel execution and
more, but for the time being just create a local sw fence for execbuf.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
drivers/gpu/drm/i915/Makefile | 1 +
drivers/gpu/drm/i915/i915_sw_fence.c | 346 +++++++++++++++++++++++++++++++++++
drivers/gpu/drm/i915/i915_sw_fence.h | 62 +++++++
3 files changed, 409 insertions(+)
create mode 100644 drivers/gpu/drm/i915/i915_sw_fence.c
create mode 100644 drivers/gpu/drm/i915/i915_sw_fence.h
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a7da24640e88..a998c2bce70a 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -16,6 +16,7 @@ i915-y := i915_drv.o \
i915_params.o \
i915_pci.o \
i915_suspend.o \
+ i915_sw_fence.o \
i915_sysfs.o \
intel_csr.o \
intel_device_info.o \
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
new file mode 100644
index 000000000000..63a7372032e7
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -0,0 +1,346 @@
+/*
+ * (C) Copyright 2016 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/slab.h>
+#include <linux/fence.h>
+#include <linux/reservation.h>
+
+#include "i915_sw_fence.h"
+
+static DEFINE_SPINLOCK(i915_sw_fence_lock);
+
+static int __i915_sw_fence_notify(struct i915_sw_fence *fence,
+ enum i915_sw_fence_notify state)
+{
+ i915_sw_fence_notify_t fn;
+
+ fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK);
+ return fn(fence, state);
+}
+
+static void i915_sw_fence_free(struct kref *kref)
+{
+ struct i915_sw_fence *fence = container_of(kref, typeof(*fence), kref);
+
+ WARN_ON(atomic_read(&fence->pending) > 0);
+
+ if (fence->flags & I915_SW_FENCE_MASK)
+ __i915_sw_fence_notify(fence, FENCE_FREE);
+ else
+ kfree(fence);
+}
+
+static void i915_sw_fence_put(struct i915_sw_fence *fence)
+{
+ kref_put(&fence->kref, i915_sw_fence_free);
+}
+
+static struct i915_sw_fence *i915_sw_fence_get(struct i915_sw_fence *fence)
+{
+ kref_get(&fence->kref);
+ return fence;
+}
+
+static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence,
+ struct list_head *continuation)
+{
+ wait_queue_head_t *x = &fence->wait;
+ unsigned long flags;
+
+ atomic_set(&fence->pending, -1); /* 0 -> -1 [done] */
+
+ /*
+ * To prevent unbounded recursion as we traverse the graph of
+ * i915_sw_fences, we move the task_list from this, the next ready
+ * fence, to the tail of the original fence's task_list
+ * (and so added to the list to be woken).
+ */
+
+ smp_mb__before_spinlock();
+ spin_lock_irqsave_nested(&x->lock, flags, 1 + !!continuation);
+ if (continuation) {
+ list_splice_tail_init(&x->task_list, continuation);
+ } else {
+ LIST_HEAD(extra);
+
+ do {
+ __wake_up_locked_key(x, TASK_NORMAL, &extra);
+
+ if (list_empty(&extra))
+ break;
+
+ list_splice_tail_init(&extra, &x->task_list);
+ } while (1);
+ }
+ spin_unlock_irqrestore(&x->lock, flags);
+}
+
+static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
+ struct list_head *continuation)
+{
+ if (!atomic_dec_and_test(&fence->pending))
+ return;
+
+ if (fence->flags & I915_SW_FENCE_MASK &&
+ __i915_sw_fence_notify(fence, FENCE_COMPLETE) != NOTIFY_DONE)
+ return;
+
+ __i915_sw_fence_wake_up_all(fence, continuation);
+}
+
+static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+{
+ if (WARN_ON(i915_sw_fence_done(fence)))
+ return;
+
+ __i915_sw_fence_complete(fence, NULL);
+}
+
+static void i915_sw_fence_await(struct i915_sw_fence *fence)
+{
+ WARN_ON(atomic_inc_return(&fence->pending) <= 1);
+}
+
+static void i915_sw_fence_wait(struct i915_sw_fence *fence)
+{
+ wait_event(fence->wait, i915_sw_fence_done(fence));
+}
+
+void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn)
+{
+ BUG_ON((unsigned long)fn & ~I915_SW_FENCE_MASK);
+
+ init_waitqueue_head(&fence->wait);
+ kref_init(&fence->kref);
+ atomic_set(&fence->pending, 1);
+ fence->flags = (unsigned long)fn;
+}
+
+void i915_sw_fence_commit(struct i915_sw_fence *fence)
+{
+ i915_sw_fence_complete(fence);
+ i915_sw_fence_put(fence);
+}
+
+static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key)
+{
+ list_del(&wq->task_list);
+ __i915_sw_fence_complete(wq->private, key);
+ i915_sw_fence_put(wq->private);
+ kfree(wq);
+ return 0;
+}
+
+static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
+ const struct i915_sw_fence * const signaler)
+{
+ wait_queue_t *wq;
+
+ if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
+ return false;
+
+ if (fence == signaler)
+ return true;
+
+ list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+ if (wq->func != i915_sw_fence_wake)
+ continue;
+
+ if (__i915_sw_fence_check_if_after(wq->private, signaler))
+ return true;
+ }
+
+ return false;
+}
+
+static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence)
+{
+ wait_queue_t *wq;
+
+ if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
+ return;
+
+ list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+ if (wq->func != i915_sw_fence_wake)
+ continue;
+
+ __i915_sw_fence_clear_checked_bit(wq->private);
+ }
+}
+
+static bool i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
+ const struct i915_sw_fence * const signaler)
+{
+ unsigned long flags;
+ bool err;
+
+ if (!IS_ENABLED(CONFIG_I915_SW_FENCE_CHECK_DAG))
+ return false;
+
+ spin_lock_irqsave(&i915_sw_fence_lock, flags);
+ err = __i915_sw_fence_check_if_after(fence, signaler);
+ __i915_sw_fence_clear_checked_bit(fence);
+ spin_unlock_irqrestore(&i915_sw_fence_lock, flags);
+
+ return err;
+}
+
+static wait_queue_t *__i915_sw_fence_create_wq(struct i915_sw_fence *fence, gfp_t gfp)
+{
+ wait_queue_t *wq;
+
+ wq = kmalloc(sizeof(*wq), gfp);
+ if (unlikely(!wq))
+ return NULL;
+
+ INIT_LIST_HEAD(&wq->task_list);
+ wq->flags = 0;
+ wq->func = i915_sw_fence_wake;
+ wq->private = i915_sw_fence_get(fence);
+
+ i915_sw_fence_await(fence);
+
+ return wq;
+}
+
+int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
+ struct i915_sw_fence *signaler,
+ gfp_t gfp)
+{
+ wait_queue_t *wq;
+ unsigned long flags;
+ int pending;
+
+ if (i915_sw_fence_done(signaler))
+ return 0;
+
+ /* The dependency graph must be acyclic. */
+ if (unlikely(i915_sw_fence_check_if_after(fence, signaler)))
+ return -EINVAL;
+
+ wq = __i915_sw_fence_create_wq(fence, gfp);
+ if (unlikely(!wq)) {
+ if (!gfpflags_allow_blocking(gfp))
+ return -ENOMEM;
+
+ i915_sw_fence_wait(signaler);
+ return 0;
+ }
+
+ spin_lock_irqsave(&signaler->wait.lock, flags);
+ if (likely(!i915_sw_fence_done(signaler))) {
+ __add_wait_queue_tail(&signaler->wait, wq);
+ pending = 1;
+ } else {
+ i915_sw_fence_wake(wq, 0, 0, NULL);
+ pending = 0;
+ }
+ spin_unlock_irqrestore(&signaler->wait.lock, flags);
+
+ return pending;
+}
+
+struct dma_fence_cb {
+ struct fence_cb base;
+ struct i915_sw_fence *fence;
+};
+
+static void dma_i915_sw_fence_wake(struct fence *dma, struct fence_cb *data)
+{
+ struct dma_fence_cb *cb = container_of(data, typeof(*cb), base);
+
+ i915_sw_fence_commit(cb->fence);
+ kfree(cb);
+}
+
+int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+ struct fence *dma, gfp_t gfp)
+{
+ struct dma_fence_cb *cb;
+ int ret;
+
+ if (fence_is_signaled(dma))
+ return 0;
+
+ cb = kmalloc(sizeof(*cb), gfp);
+ if (!cb) {
+ if (!gfpflags_allow_blocking(gfp))
+ return -ENOMEM;
+
+ return fence_wait(dma, false);
+ }
+
+ cb->fence = i915_sw_fence_get(fence);
+ i915_sw_fence_await(fence);
+
+ ret = fence_add_callback(dma, &cb->base, dma_i915_sw_fence_wake);
+ if (ret == 0) {
+ ret = 1;
+ } else {
+ dma_i915_sw_fence_wake(dma, &cb->base);
+ if (ret == -ENOENT) /* fence already signaled */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
+ struct reservation_object *resv,
+ const struct fence_ops *exclude,
+ bool write,
+ gfp_t gfp)
+{
+ struct fence *excl, **shared;
+ unsigned int count, i;
+ int ret;
+
+ ret = reservation_object_get_fences_rcu(resv, &excl, &count, &shared);
+ if (ret)
+ return ret;
+
+ if (write) {
+ for (i = 0; i < count; i++) {
+ int pending;
+
+ if (shared[i]->ops == exclude)
+ continue;
+
+ pending = i915_sw_fence_await_dma_fence(fence,
+ shared[i],
+ gfp);
+ if (pending < 0) {
+ ret = pending;
+ goto out;
+ }
+
+ ret |= pending;
+ }
+ }
+
+ if (excl && excl->ops != exclude) {
+ int pending;
+
+ pending = i915_sw_fence_await_dma_fence(fence, excl, gfp);
+ if (pending < 0) {
+ ret = pending;
+ goto out;
+ }
+
+ ret |= pending;
+ }
+
+out:
+ fence_put(excl);
+ for (i = 0; i < count; i++)
+ fence_put(shared[i]);
+ kfree(shared);
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
new file mode 100644
index 000000000000..07a7fefd3e59
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -0,0 +1,62 @@
+/*
+ * i915_sw_fence.h - library routines for N:M synchronisation points
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#ifndef _I915_SW_FENCE_H_
+#define _I915_SW_FENCE_H_
+
+#include <linux/gfp.h>
+#include <linux/kref.h>
+#include <linux/notifier.h> /* for NOTIFY_DONE */
+#include <linux/wait.h>
+
+struct completion;
+struct fence;
+struct fence_ops;
+struct reservation_object;
+
+struct i915_sw_fence {
+ wait_queue_head_t wait;
+ unsigned long flags;
+ struct kref kref;
+ atomic_t pending;
+};
+
+#define I915_SW_FENCE_CHECKED_BIT 0 /* used internally for DAG checking */
+#define I915_SW_FENCE_PRIVATE_BIT 1 /* available for use by owner */
+#define I915_SW_FENCE_MASK (~3)
+
+enum i915_sw_fence_notify {
+ FENCE_COMPLETE,
+ FENCE_FREE
+};
+
+typedef int (*i915_sw_fence_notify_t)(struct i915_sw_fence *,
+ enum i915_sw_fence_notify state);
+#define __i915_sw_fence_call __aligned(4)
+
+void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
+void i915_sw_fence_commit(struct i915_sw_fence *fence);
+
+int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
+ struct i915_sw_fence *after,
+ gfp_t gfp);
+int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+ struct fence *dma, gfp_t gfp);
+int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
+ struct reservation_object *resv,
+ const struct fence_ops *exclude,
+ bool write,
+ gfp_t gfp);
+
+static inline bool i915_sw_fence_done(const struct i915_sw_fence *fence)
+{
+ return atomic_read(&fence->pending) < 0;
+}
+
+#endif /* _I915_SW_FENCE_H_ */
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 02/18] drm/i915: Only queue requests during execlists submission
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
2016-08-30 8:17 ` [PATCH 01/18] drm/i915: Add a sw fence for collecting up dma fences Chris Wilson
@ 2016-08-30 8:17 ` Chris Wilson
2016-08-30 8:17 ` [PATCH 03/18] drm/i915: Record the position of the workarounds in the tail of the request Chris Wilson
` (16 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:17 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Leave the more complicated request dequeueing to the tasklet and instead
just kick start the tasklet if we detect we are adding the first
request.
v2: Play around with list operators until we agree upon something
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 28 ++++------------------------
1 file changed, 4 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 6b49df4316f4..48248289d88a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -609,35 +609,15 @@ static void intel_lrc_irq_handler(unsigned long data)
static void execlists_submit_request(struct drm_i915_gem_request *request)
{
struct intel_engine_cs *engine = request->engine;
- struct drm_i915_gem_request *cursor;
- int num_elements = 0;
spin_lock_bh(&engine->execlist_lock);
- list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
- if (++num_elements > 2)
- break;
-
- if (num_elements > 2) {
- struct drm_i915_gem_request *tail_req;
-
- tail_req = list_last_entry(&engine->execlist_queue,
- struct drm_i915_gem_request,
- execlist_link);
-
- if (request->ctx == tail_req->ctx) {
- WARN(tail_req->elsp_submitted != 0,
- "More than 2 already-submitted reqs queued\n");
- list_del(&tail_req->execlist_link);
- i915_gem_request_put(tail_req);
- }
- }
-
i915_gem_request_get(request);
- list_add_tail(&request->execlist_link, &engine->execlist_queue);
request->ctx_hw_id = request->ctx->hw_id;
- if (num_elements == 0)
- execlists_unqueue(engine);
+
+ if (list_empty(&engine->execlist_queue))
+ tasklet_hi_schedule(&engine->irq_tasklet);
+ list_add_tail(&request->execlist_link, &engine->execlist_queue);
spin_unlock_bh(&engine->execlist_lock);
}
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 03/18] drm/i915: Record the position of the workarounds in the tail of the request
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
2016-08-30 8:17 ` [PATCH 01/18] drm/i915: Add a sw fence for collecting up dma fences Chris Wilson
2016-08-30 8:17 ` [PATCH 02/18] drm/i915: Only queue requests during execlists submission Chris Wilson
@ 2016-08-30 8:17 ` Chris Wilson
2016-08-30 8:17 ` [PATCH 04/18] drm/i915: Compute the ELSP register location once Chris Wilson
` (15 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:17 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Rather than blindly assuming we need to advance the tail for
resubmitting the request via the ELSP, record the position.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/i915_gem_request.h | 15 +++++++++------
drivers/gpu/drm/i915/intel_lrc.c | 4 ++--
2 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 91014de8bfbc..2faa3bb4c39b 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -88,20 +88,23 @@ struct drm_i915_gem_request {
*/
u32 previous_seqno;
- /** Position in the ringbuffer of the start of the request */
+ /** Position in the ring of the start of the request */
u32 head;
/**
- * Position in the ringbuffer of the start of the postfix.
- * This is required to calculate the maximum available ringbuffer
- * space without overwriting the postfix.
+ * Position in the ring of the start of the postfix.
+ * This is required to calculate the maximum available ring space
+ * without overwriting the postfix.
*/
u32 postfix;
- /** Position in the ringbuffer of the end of the whole request */
+ /** Position in the ring of the end of the whole request */
u32 tail;
- /** Preallocate space in the ringbuffer for the emitting the request */
+ /** Position in the ring of the end of any workarounds after the tail */
+ u32 wa_tail;
+
+ /** Preallocate space in the ring for the emitting the request */
u32 reserved_space;
/**
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 48248289d88a..45d417d22610 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -482,8 +482,7 @@ static void execlists_unqueue(struct intel_engine_cs *engine)
* resubmit the request. See gen8_emit_request() for where we
* prepare the padding after the end of the request.
*/
- req0->tail += 8;
- req0->tail &= req0->ring->size - 1;
+ req0->tail = req0->wa_tail;
}
execlists_elsp_submit_contexts(req0, req1);
@@ -711,6 +710,7 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
intel_ring_emit(ring, MI_NOOP);
intel_ring_emit(ring, MI_NOOP);
intel_ring_advance(ring);
+ request->wa_tail = ring->tail;
/* We keep the previous context alive until we retire the following
* request. This ensures that any the context object is still pinned
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 04/18] drm/i915: Compute the ELSP register location once
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (2 preceding siblings ...)
2016-08-30 8:17 ` [PATCH 03/18] drm/i915: Record the position of the workarounds in the tail of the request Chris Wilson
@ 2016-08-30 8:17 ` Chris Wilson
2016-08-30 8:17 ` [PATCH 05/18] drm/i915: Reorder submitting the requests to ELSP Chris Wilson
` (14 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:17 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Similar to the issue with reading from the context status buffer,
see commit 26720ab97fea ("drm/i915: Move CSB MMIO reads out of the
execlists lock"), we frequently write to the ELSP register (4 writes per
interrupt) and know we hold the required spinlock and forcewake throughout.
We can further reduce the cost of writing these registers beyond the
I915_WRITE_FW() by precomputing the address of the ELSP register. We also
note that the subsequent read serves no purpose here, and are happy to
see it go.
v2: Address I915_WRITE mistakes in changelog
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 45d417d22610..831a5aa769b2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -331,10 +331,11 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
struct drm_i915_gem_request *rq1)
{
-
struct intel_engine_cs *engine = rq0->engine;
struct drm_i915_private *dev_priv = rq0->i915;
- uint64_t desc[2];
+ u32 __iomem *elsp =
+ dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+ u64 desc[2];
if (rq1) {
desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
@@ -347,15 +348,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
rq0->elsp_submitted++;
/* You must always write both descriptors in the order below. */
- I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
- I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
+ writel(upper_32_bits(desc[1]), elsp);
+ writel(lower_32_bits(desc[1]), elsp);
- I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
+ writel(upper_32_bits(desc[0]), elsp);
/* The context is automatically loaded after the following */
- I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
-
- /* ELSP is a wo register, use another nearby reg for posting */
- POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
+ writel(lower_32_bits(desc[0]), elsp);
}
static void
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 05/18] drm/i915: Reorder submitting the requests to ELSP
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (3 preceding siblings ...)
2016-08-30 8:17 ` [PATCH 04/18] drm/i915: Compute the ELSP register location once Chris Wilson
@ 2016-08-30 8:17 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 06/18] drm/i915: Simplify ELSP queue request tracking Chris Wilson
` (13 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:17 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Just rearrange the code to reduce churn in the next patch.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 76 ++++++++++++++++++++--------------------
1 file changed, 38 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 831a5aa769b2..e9cb4a906009 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -328,32 +328,18 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
return ctx->engine[engine->id].lrc_desc;
}
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
- struct drm_i915_gem_request *rq1)
+static inline void
+execlists_context_status_change(struct drm_i915_gem_request *rq,
+ unsigned long status)
{
- struct intel_engine_cs *engine = rq0->engine;
- struct drm_i915_private *dev_priv = rq0->i915;
- u32 __iomem *elsp =
- dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
- u64 desc[2];
-
- if (rq1) {
- desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
- rq1->elsp_submitted++;
- } else {
- desc[1] = 0;
- }
-
- desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
- rq0->elsp_submitted++;
-
- /* You must always write both descriptors in the order below. */
- writel(upper_32_bits(desc[1]), elsp);
- writel(lower_32_bits(desc[1]), elsp);
+ /*
+ * Only used when GVT-g is enabled now. When GVT-g is disabled,
+ * The compiler should eliminate this function as dead-code.
+ */
+ if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
+ return;
- writel(upper_32_bits(desc[0]), elsp);
- /* The context is automatically loaded after the following */
- writel(lower_32_bits(desc[0]), elsp);
+ atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
}
static void
@@ -382,6 +368,34 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
execlists_update_context_pdps(ppgtt, reg_state);
}
+static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
+ struct drm_i915_gem_request *rq1)
+{
+ struct intel_engine_cs *engine = rq0->engine;
+ struct drm_i915_private *dev_priv = rq0->i915;
+ u32 __iomem *elsp =
+ dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+ u64 desc[2];
+
+ if (rq1) {
+ desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
+ rq1->elsp_submitted++;
+ } else {
+ desc[1] = 0;
+ }
+
+ desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
+ rq0->elsp_submitted++;
+
+ /* You must always write both descriptors in the order below. */
+ writel(upper_32_bits(desc[1]), elsp);
+ writel(lower_32_bits(desc[1]), elsp);
+
+ writel(upper_32_bits(desc[0]), elsp);
+ /* The context is automatically loaded after the following */
+ writel(lower_32_bits(desc[0]), elsp);
+}
+
static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
struct drm_i915_gem_request *rq1)
{
@@ -402,20 +416,6 @@ static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
spin_unlock_irq(&dev_priv->uncore.lock);
}
-static inline void execlists_context_status_change(
- struct drm_i915_gem_request *rq,
- unsigned long status)
-{
- /*
- * Only used when GVT-g is enabled now. When GVT-g is disabled,
- * The compiler should eliminate this function as dead-code.
- */
- if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
- return;
-
- atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
-}
-
static void execlists_unqueue(struct intel_engine_cs *engine)
{
struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 06/18] drm/i915: Simplify ELSP queue request tracking
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (4 preceding siblings ...)
2016-08-30 8:17 ` [PATCH 05/18] drm/i915: Reorder submitting the requests to ELSP Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 07/18] drm/i915: Separate out reset flags from the reset counter Chris Wilson
` (12 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Emulate HW to track and manage the ELSP queue. A set of SW ports is defined,
and requests are assigned to these ports before being submitted to HW. This
makes it easier to clean up incomplete requests during reset recovery,
especially after an engine reset, by decoupling elsp queue management. This
will become clearer in the next patch.
In the engine reset case we want to resume where we left-off after skipping
the incomplete batch which requires checking the elsp queue, removing
element and fixing elsp_submitted counts in some cases. Instead of directly
manipulating the elsp queue from reset path we can examine these ports, fix
up ringbuffer pointers using the incomplete request and restart submissions
again after reset.
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-3-git-send-email-arun.siluvery@linux.intel.com
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 2 +-
drivers/gpu/drm/i915/i915_gem.c | 13 +-
drivers/gpu/drm/i915/i915_gem_request.c | 1 -
drivers/gpu/drm/i915/i915_gem_request.h | 21 +-
drivers/gpu/drm/i915/intel_lrc.c | 405 ++++++++++++++------------------
drivers/gpu/drm/i915/intel_lrc.h | 2 -
drivers/gpu/drm/i915/intel_ringbuffer.h | 7 +-
7 files changed, 188 insertions(+), 263 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d89359a50742..5f932ebc6ff1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2074,7 +2074,7 @@ static int i915_execlists(struct seq_file *m, void *data)
status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
- read_pointer = engine->next_context_status_buffer;
+ read_pointer = GEN8_CSB_READ_PTR(status_pointer);
write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
if (read_pointer > write_pointer)
write_pointer += GEN8_CSB_ENTRIES;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d37b44126942..838a275e7fac 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2575,6 +2575,9 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
struct drm_i915_gem_request *request;
struct intel_ring *ring;
+ /* Ensure irq handler finishes, and not run again. */
+ tasklet_kill(&engine->irq_tasklet);
+
/* Mark all pending requests as complete so that any concurrent
* (lockless) lookup doesn't try and wait upon the request as we
* reset it.
@@ -2588,10 +2591,12 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
*/
if (i915.enable_execlists) {
- /* Ensure irq handler finishes or is cancelled. */
- tasklet_kill(&engine->irq_tasklet);
-
- intel_execlists_cancel_requests(engine);
+ spin_lock(&engine->execlist_lock);
+ INIT_LIST_HEAD(&engine->execlist_queue);
+ i915_gem_request_put(engine->execlist_port[0].request);
+ i915_gem_request_put(engine->execlist_port[1].request);
+ memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+ spin_unlock(&engine->execlist_lock);
}
/*
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 9cc08a1e43c6..ec613fd5e01c 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -402,7 +402,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
req->previous_context = NULL;
req->file_priv = NULL;
req->batch = NULL;
- req->elsp_submitted = 0;
/*
* Reserve space in the ring buffer for all the commands required to
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 2faa3bb4c39b..a231bd318ef0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -137,27 +137,8 @@ struct drm_i915_gem_request {
/** file_priv list entry for this request */
struct list_head client_list;
- /**
- * The ELSP only accepts two elements at a time, so we queue
- * context/tail pairs on a given queue (ring->execlist_queue) until the
- * hardware is available. The queue serves a double purpose: we also use
- * it to keep track of the up to 2 contexts currently in the hardware
- * (usually one in execution and the other queued up by the GPU): We
- * only remove elements from the head of the queue when the hardware
- * informs us that an element has been completed.
- *
- * All accesses to the queue are mediated by a spinlock
- * (ring->execlist_lock).
- */
-
- /** Execlist link in the submission queue.*/
+ /** Link in the execlist submission queue, guarded by execlist_lock. */
struct list_head execlist_link;
-
- /** Execlists no. of times this request has been sent to the ELSP */
- int elsp_submitted;
-
- /** Execlists context hardware id. */
- unsigned int ctx_hw_id;
};
extern const struct fence_ops i915_fence_ops;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e9cb4a906009..864b5248279a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -156,6 +156,11 @@
#define GEN8_CTX_STATUS_COMPLETE (1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
+#define GEN8_CTX_STATUS_COMPLETED_MASK \
+ (GEN8_CTX_STATUS_ACTIVE_IDLE | \
+ GEN8_CTX_STATUS_PREEMPTED | \
+ GEN8_CTX_STATUS_ELEMENT_SWITCH)
+
#define CTX_LRI_HEADER_0 0x01
#define CTX_CONTEXT_CONTROL 0x02
#define CTX_RING_HEAD 0x04
@@ -263,12 +268,10 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
- if (IS_GEN8(dev_priv) || IS_GEN9(dev_priv))
- engine->idle_lite_restore_wa = ~0;
-
- engine->disable_lite_restore_wa = (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
- IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
- (engine->id == VCS || engine->id == VCS2);
+ engine->disable_lite_restore_wa =
+ (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
+ IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
+ (engine->id == VCS || engine->id == VCS2);
engine->ctx_desc_template = GEN8_CTX_VALID;
if (IS_GEN8(dev_priv))
@@ -351,11 +354,11 @@ execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}
-static void execlists_update_context(struct drm_i915_gem_request *rq)
+static u64 execlists_update_context(struct drm_i915_gem_request *rq)
{
- struct intel_engine_cs *engine = rq->engine;
+ struct intel_context *ce = &rq->ctx->engine[rq->engine->id];
struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
- uint32_t *reg_state = rq->ctx->engine[engine->id].lrc_reg_state;
+ u32 *reg_state = ce->lrc_reg_state;
reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
@@ -366,26 +369,34 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
*/
if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
execlists_update_context_pdps(ppgtt, reg_state);
+
+ return ce->lrc_desc;
}
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
- struct drm_i915_gem_request *rq1)
+static void execlists_submit_ports(struct intel_engine_cs *engine)
{
- struct intel_engine_cs *engine = rq0->engine;
- struct drm_i915_private *dev_priv = rq0->i915;
+ struct drm_i915_private *dev_priv = engine->i915;
+ struct execlist_port *port = engine->execlist_port;
u32 __iomem *elsp =
dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
u64 desc[2];
- if (rq1) {
- desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
- rq1->elsp_submitted++;
+ if (!port[0].count)
+ execlists_context_status_change(port[0].request,
+ INTEL_CONTEXT_SCHEDULE_IN);
+ desc[0] = execlists_update_context(port[0].request);
+ engine->preempt_wa = port[0].count++; /* bdw only? fixed on skl? */
+
+ if (port[1].request) {
+ GEM_BUG_ON(port[1].count);
+ execlists_context_status_change(port[1].request,
+ INTEL_CONTEXT_SCHEDULE_IN);
+ desc[1] = execlists_update_context(port[1].request);
+ port[1].count = 1;
} else {
desc[1] = 0;
}
-
- desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
- rq0->elsp_submitted++;
+ GEM_BUG_ON(desc[0] == desc[1]);
/* You must always write both descriptors in the order below. */
writel(upper_32_bits(desc[1]), elsp);
@@ -396,141 +407,125 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
writel(lower_32_bits(desc[0]), elsp);
}
-static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
- struct drm_i915_gem_request *rq1)
+static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
{
- struct drm_i915_private *dev_priv = rq0->i915;
- unsigned int fw_domains = rq0->engine->fw_domains;
-
- execlists_update_context(rq0);
-
- if (rq1)
- execlists_update_context(rq1);
+ return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
+ ctx->execlists_force_single_submission);
+}
- spin_lock_irq(&dev_priv->uncore.lock);
- intel_uncore_forcewake_get__locked(dev_priv, fw_domains);
+static bool can_merge_ctx(const struct i915_gem_context *prev,
+ const struct i915_gem_context *next)
+{
+ if (prev != next)
+ return false;
- execlists_elsp_write(rq0, rq1);
+ if (ctx_single_port_submission(prev))
+ return false;
- intel_uncore_forcewake_put__locked(dev_priv, fw_domains);
- spin_unlock_irq(&dev_priv->uncore.lock);
+ return true;
}
-static void execlists_unqueue(struct intel_engine_cs *engine)
+static void execlists_dequeue(struct intel_engine_cs *engine)
{
- struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
- struct drm_i915_gem_request *cursor, *tmp;
+ struct drm_i915_gem_request *cursor, *last;
+ struct execlist_port *port = engine->execlist_port;
+ bool submit = false;
+
+ last = port->request;
+ if (last)
+ /* WaIdleLiteRestore:bdw,skl
+ * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
+ * as we resubmit the request. See gen8_emit_request()
+ * for where we prepare the padding after the end of the
+ * request.
+ */
+ last->tail = last->wa_tail;
- assert_spin_locked(&engine->execlist_lock);
+ GEM_BUG_ON(port[1].request);
- /*
- * If irqs are not active generate a warning as batches that finish
- * without the irqs may get lost and a GPU Hang may occur.
+ /* Hardware submission is through 2 ports. Conceptually each port
+ * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
+ * static for a context, and unique to each, so we only execute
+ * requests belonging to a single context from each ring. RING_HEAD
+ * is maintained by the CS in the context image, it marks the place
+ * where it got up to last time, and through RING_TAIL we tell the CS
+ * where we want to execute up to this time.
+ *
+ * In this list the requests are in order of execution. Consecutive
+ * requests from the same context are adjacent in the ringbuffer. We
+ * can combine these requests into a single RING_TAIL update:
+ *
+ * RING_HEAD...req1...req2
+ * ^- RING_TAIL
+ * since to execute req2 the CS must first execute req1.
+ *
+ * Our goal then is to point each port to the end of a consecutive
+ * sequence of requests as being the most optimal (fewest wake ups
+ * and context switches) submission.
*/
- WARN_ON(!intel_irqs_enabled(engine->i915));
-
- /* Try to read in pairs */
- list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
- execlist_link) {
- if (!req0) {
- req0 = cursor;
- } else if (req0->ctx == cursor->ctx) {
- /* Same ctx: ignore first request, as second request
- * will update tail past first request's workload */
- cursor->elsp_submitted = req0->elsp_submitted;
- list_del(&req0->execlist_link);
- i915_gem_request_put(req0);
- req0 = cursor;
- } else {
- if (IS_ENABLED(CONFIG_DRM_I915_GVT)) {
- /*
- * req0 (after merged) ctx requires single
- * submission, stop picking
- */
- if (req0->ctx->execlists_force_single_submission)
- break;
- /*
- * req0 ctx doesn't require single submission,
- * but next req ctx requires, stop picking
- */
- if (cursor->ctx->execlists_force_single_submission)
- break;
- }
- req1 = cursor;
- WARN_ON(req1->elsp_submitted);
- break;
- }
- }
- if (unlikely(!req0))
- return;
-
- execlists_context_status_change(req0, INTEL_CONTEXT_SCHEDULE_IN);
-
- if (req1)
- execlists_context_status_change(req1,
- INTEL_CONTEXT_SCHEDULE_IN);
-
- if (req0->elsp_submitted & engine->idle_lite_restore_wa) {
- /*
- * WaIdleLiteRestore: make sure we never cause a lite restore
- * with HEAD==TAIL.
+ spin_lock(&engine->execlist_lock);
+ list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
+ /* Can we combine this request with the current port? It has to
+ * be the same context/ringbuffer and not have any exceptions
+ * (e.g. GVT saying never to combine contexts).
*
- * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
- * resubmit the request. See gen8_emit_request() for where we
- * prepare the padding after the end of the request.
+ * If we can combine the requests, we can execute both by
+ * updating the RING_TAIL to point to the end of the second
+ * request, and so we never need to tell the hardware about
+ * the first.
*/
- req0->tail = req0->wa_tail;
+ if (last && !can_merge_ctx(cursor->ctx, last->ctx)) {
+ /* If we are on the second port and cannot combine
+ * this request with the last, then we are done.
+ */
+ if (port != engine->execlist_port)
+ break;
+
+ /* If GVT overrides us we only ever submit port[0],
+ * leaving port[1] empty. Note that we also have
+ * to be careful that we don't queue the same
+ * context (even though a different request) to
+ * the second port.
+ */
+ if (ctx_single_port_submission(cursor->ctx))
+ break;
+
+ GEM_BUG_ON(last->ctx == cursor->ctx);
+
+ i915_gem_request_assign(&port->request, last);
+ port++;
+ }
+ last = cursor;
+ submit = true;
+ }
+ if (submit) {
+ /* Decouple all the requests submitted from the queue */
+ engine->execlist_queue.next = &cursor->execlist_link;
+ cursor->execlist_link.prev = &engine->execlist_queue;
+
+ i915_gem_request_assign(&port->request, last);
}
+ spin_unlock(&engine->execlist_lock);
- execlists_elsp_submit_contexts(req0, req1);
+ if (submit)
+ execlists_submit_ports(engine);
}
-static unsigned int
-execlists_check_remove_request(struct intel_engine_cs *engine, u32 ctx_id)
+static bool execlists_elsp_idle(struct intel_engine_cs *engine)
{
- struct drm_i915_gem_request *head_req;
-
- assert_spin_locked(&engine->execlist_lock);
-
- head_req = list_first_entry_or_null(&engine->execlist_queue,
- struct drm_i915_gem_request,
- execlist_link);
-
- if (WARN_ON(!head_req || (head_req->ctx_hw_id != ctx_id)))
- return 0;
-
- WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
-
- if (--head_req->elsp_submitted > 0)
- return 0;
-
- execlists_context_status_change(head_req, INTEL_CONTEXT_SCHEDULE_OUT);
-
- list_del(&head_req->execlist_link);
- i915_gem_request_put(head_req);
-
- return 1;
+ return !engine->execlist_port[0].request;
}
-static u32
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
- u32 *context_id)
+static bool execlists_elsp_ready(struct intel_engine_cs *engine)
{
- struct drm_i915_private *dev_priv = engine->i915;
- u32 status;
-
- read_pointer %= GEN8_CSB_ENTRIES;
-
- status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
-
- if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
- return 0;
+ int port;
- *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
- read_pointer));
+ port = 1; /* wait for a free slot */
+ if (engine->disable_lite_restore_wa || engine->preempt_wa)
+ port = 0; /* wait for GPU to be idle before continuing */
- return status;
+ return !engine->execlist_port[port].request;
}
/*
@@ -540,67 +535,56 @@ get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
static void intel_lrc_irq_handler(unsigned long data)
{
struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+ struct execlist_port *port = engine->execlist_port;
struct drm_i915_private *dev_priv = engine->i915;
- u32 status_pointer;
- unsigned int read_pointer, write_pointer;
- u32 csb[GEN8_CSB_ENTRIES][2];
- unsigned int csb_read = 0, i;
- unsigned int submit_contexts = 0;
intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
- status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(engine));
-
- read_pointer = engine->next_context_status_buffer;
- write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
- if (read_pointer > write_pointer)
- write_pointer += GEN8_CSB_ENTRIES;
-
- while (read_pointer < write_pointer) {
- if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
- break;
- csb[csb_read][0] = get_context_status(engine, ++read_pointer,
- &csb[csb_read][1]);
- csb_read++;
- }
-
- engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
-
- /* Update the read pointer to the old write pointer. Manual ringbuffer
- * management ftw </sarcasm> */
- I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
- _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
- engine->next_context_status_buffer << 8));
-
- intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
-
- spin_lock(&engine->execlist_lock);
+ if (!execlists_elsp_idle(engine)) {
+ u32 __iomem *csb_mmio =
+ dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
+ u32 __iomem *buf =
+ dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
+ unsigned int csb, head, tail;
+
+ csb = readl(csb_mmio);
+ head = GEN8_CSB_READ_PTR(csb);
+ tail = GEN8_CSB_WRITE_PTR(csb);
+ if (tail < head)
+ tail += GEN8_CSB_ENTRIES;
+ while (head < tail) {
+ unsigned int idx = ++head % GEN8_CSB_ENTRIES;
+ unsigned int status = readl(buf + 2 * idx);
+
+ if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
+ continue;
+
+ GEM_BUG_ON(port[0].count == 0);
+ if (--port[0].count == 0) {
+ GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
+ execlists_context_status_change(port[0].request,
+ INTEL_CONTEXT_SCHEDULE_OUT);
+
+ i915_gem_request_put(port[0].request);
+ port[0] = port[1];
+ memset(&port[1], 0, sizeof(port[1]));
+
+ engine->preempt_wa = false;
+ }
- for (i = 0; i < csb_read; i++) {
- if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
- if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
- if (execlists_check_remove_request(engine, csb[i][1]))
- WARN(1, "Lite Restored request removed from queue\n");
- } else
- WARN(1, "Preemption without Lite Restore\n");
+ GEM_BUG_ON(port[0].count == 0 &&
+ !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
}
- if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
- GEN8_CTX_STATUS_ELEMENT_SWITCH))
- submit_contexts +=
- execlists_check_remove_request(engine, csb[i][1]);
+ writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+ GEN8_CSB_WRITE_PTR(csb) << 8),
+ csb_mmio);
}
- if (submit_contexts) {
- if (!engine->disable_lite_restore_wa ||
- (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
- execlists_unqueue(engine);
- }
+ if (execlists_elsp_ready(engine))
+ execlists_dequeue(engine);
- spin_unlock(&engine->execlist_lock);
-
- if (unlikely(submit_contexts > 2))
- DRM_ERROR("More than two context complete events?\n");
+ intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
}
static void execlists_submit_request(struct drm_i915_gem_request *request)
@@ -609,12 +593,9 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
spin_lock_bh(&engine->execlist_lock);
- i915_gem_request_get(request);
- request->ctx_hw_id = request->ctx->hw_id;
-
- if (list_empty(&engine->execlist_queue))
- tasklet_hi_schedule(&engine->irq_tasklet);
list_add_tail(&request->execlist_link, &engine->execlist_queue);
+ if (execlists_elsp_idle(engine))
+ tasklet_hi_schedule(&engine->irq_tasklet);
spin_unlock_bh(&engine->execlist_lock);
}
@@ -721,23 +702,6 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
return 0;
}
-void intel_execlists_cancel_requests(struct intel_engine_cs *engine)
-{
- struct drm_i915_gem_request *req, *tmp;
- LIST_HEAD(cancel_list);
-
- WARN_ON(!mutex_is_locked(&engine->i915->drm.struct_mutex));
-
- spin_lock_bh(&engine->execlist_lock);
- list_replace_init(&engine->execlist_queue, &cancel_list);
- spin_unlock_bh(&engine->execlist_lock);
-
- list_for_each_entry_safe(req, tmp, &cancel_list, execlist_link) {
- list_del(&req->execlist_link);
- i915_gem_request_put(req);
- }
-}
-
static int intel_lr_context_pin(struct i915_gem_context *ctx,
struct intel_engine_cs *engine)
{
@@ -1258,7 +1222,6 @@ static void lrc_init_hws(struct intel_engine_cs *engine)
static int gen8_init_common_ring(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
- unsigned int next_context_status_buffer_hw;
lrc_init_hws(engine);
@@ -1269,32 +1232,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
I915_WRITE(RING_MODE_GEN7(engine),
_MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
_MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
- POSTING_READ(RING_MODE_GEN7(engine));
- /*
- * Instead of resetting the Context Status Buffer (CSB) read pointer to
- * zero, we need to read the write pointer from hardware and use its
- * value because "this register is power context save restored".
- * Effectively, these states have been observed:
- *
- * | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) |
- * BDW | CSB regs not reset | CSB regs reset |
- * CHT | CSB regs not reset | CSB regs not reset |
- * SKL | ? | ? |
- * BXT | ? | ? |
- */
- next_context_status_buffer_hw =
- GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(engine)));
+ I915_WRITE(RING_CONTEXT_STATUS_PTR(engine),
+ _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK |
+ GEN8_CSB_WRITE_PTR_MASK,
+ 0));
- /*
- * When the CSB registers are reset (also after power-up / gpu reset),
- * CSB write pointer is set to all 1's, which is not valid, use '5' in
- * this special case, so the first element read is CSB[0].
- */
- if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
- next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
-
- engine->next_context_status_buffer = next_context_status_buffer_hw;
DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
intel_engine_init_hangcheck(engine);
@@ -1680,10 +1623,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
}
intel_lr_context_unpin(dev_priv->kernel_context, engine);
- engine->idle_lite_restore_wa = 0;
- engine->disable_lite_restore_wa = false;
- engine->ctx_desc_template = 0;
-
lrc_destroy_wa_ctx_obj(engine);
engine->i915 = NULL;
}
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index a52cf57dbd40..4d70346500c2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -97,6 +97,4 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
int enable_execlists);
void intel_execlists_enable_submission(struct drm_i915_private *dev_priv);
-void intel_execlists_cancel_requests(struct intel_engine_cs *engine);
-
#endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 84aea549de5d..2181d0a41a96 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -298,11 +298,14 @@ struct intel_engine_cs {
/* Execlists */
struct tasklet_struct irq_tasklet;
spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
+ struct execlist_port {
+ struct drm_i915_gem_request *request;
+ unsigned int count;
+ } execlist_port[2];
struct list_head execlist_queue;
unsigned int fw_domains;
- unsigned int next_context_status_buffer;
- unsigned int idle_lite_restore_wa;
bool disable_lite_restore_wa;
+ bool preempt_wa;
u32 ctx_desc_template;
/**
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 07/18] drm/i915: Separate out reset flags from the reset counter
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (5 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 06/18] drm/i915: Simplify ELSP queue request tracking Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 08/18] drm/i915: Drop local struct_mutex around intel_init_emon[ilk] Chris Wilson
` (11 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
In preparation for introducing a per-engine reset, we can first separate
the mixing of the reset state from the global reset counter.
The loss of atomicity in updating the reset state poses a small problem
for handling the waiters. For requests, this is solved by advancing the
seqno so that a waiter waking up after the reset knows the request is
complete. For pending flips, we still rely on the increment of the
global reset epoch (as well as the reset-in-progress flag) to signify
when the hardware was reset.
The advantage, now that we do not inspect the reset state during reset
itself i.e. we no longer emit requests during reset, is that we can use
the atomic updates of the state flags to ensure that only one reset
worker is active.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-6-git-send-email-arun.siluvery@linux.intel.com
---
drivers/gpu/drm/i915/i915_drv.c | 16 ++---
drivers/gpu/drm/i915/i915_drv.h | 46 +++++---------
drivers/gpu/drm/i915/i915_gem.c | 2 +-
drivers/gpu/drm/i915/i915_gem_request.c | 13 ++--
drivers/gpu/drm/i915/i915_irq.c | 104 +++++++++++++++-----------------
drivers/gpu/drm/i915/intel_display.c | 25 +++++---
drivers/gpu/drm/i915/intel_drv.h | 4 +-
7 files changed, 93 insertions(+), 117 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 47fe07283d88..01b518dcbd7a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1602,7 +1602,7 @@ static int i915_drm_resume(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
if (i915_gem_init_hw(dev)) {
DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
- atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
+ set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
}
mutex_unlock(&dev->struct_mutex);
@@ -1764,20 +1764,13 @@ int i915_reset(struct drm_i915_private *dev_priv)
{
struct drm_device *dev = &dev_priv->drm;
struct i915_gpu_error *error = &dev_priv->gpu_error;
- unsigned reset_counter;
int ret;
mutex_lock(&dev->struct_mutex);
/* Clear any previous failed attempts at recovery. Time to try again. */
- atomic_andnot(I915_WEDGED, &error->reset_counter);
-
- /* Clear the reset-in-progress flag and increment the reset epoch. */
- reset_counter = atomic_inc_return(&error->reset_counter);
- if (WARN_ON(__i915_reset_in_progress(reset_counter))) {
- ret = -EIO;
- goto error;
- }
+ __clear_bit(I915_WEDGED, &error->flags);
+ error->reset_count++;
pr_notice("drm/i915: Resetting chip after gpu hang\n");
@@ -1814,6 +1807,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
goto error;
}
+ clear_bit(I915_RESET_IN_PROGRESS, &error->flags);
mutex_unlock(&dev->struct_mutex);
/*
@@ -1828,7 +1822,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
return 0;
error:
- atomic_or(I915_WEDGED, &error->reset_counter);
+ set_bit(I915_WEDGED, &error->flags);
mutex_unlock(&dev->struct_mutex);
return ret;
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c413587895cf..e574eaa65c4d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1386,9 +1386,10 @@ struct i915_gpu_error {
* State variable controlling the reset flow and count
*
* This is a counter which gets incremented when reset is triggered,
- * and again when reset has been handled. So odd values (lowest bit set)
- * means that reset is in progress and even values that
- * (reset_counter >> 1):th reset was successfully completed.
+ *
+ * Before the reset commences, the I915_RESET_IN_PROGRESS bit is set
+ * meaning that any waiters holding onto the struct_mutex should
+ * relinquish the lock immediately in order for the reset to start.
*
 * If reset is not completed successfully, the I915_WEDGED bit is
* set meaning that hardware is terminally sour and there is no
@@ -1403,10 +1404,11 @@ struct i915_gpu_error {
* naturally enforces the correct ordering between the bail-out of the
* waiter and the gpu reset work code.
*/
- atomic_t reset_counter;
+ unsigned long reset_count;
-#define I915_RESET_IN_PROGRESS_FLAG 1
-#define I915_WEDGED (1 << 31)
+ unsigned long flags;
+#define I915_RESET_IN_PROGRESS 0
+#define I915_WEDGED (BITS_PER_LONG - 1)
/**
 * Waitqueue to signal when a hang is detected. Used for waiters
@@ -3234,44 +3236,24 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
-static inline u32 i915_reset_counter(struct i915_gpu_error *error)
-{
- return atomic_read(&error->reset_counter);
-}
-
-static inline bool __i915_reset_in_progress(u32 reset)
-{
- return unlikely(reset & I915_RESET_IN_PROGRESS_FLAG);
-}
-
-static inline bool __i915_reset_in_progress_or_wedged(u32 reset)
-{
- return unlikely(reset & (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
-}
-
-static inline bool __i915_terminally_wedged(u32 reset)
-{
- return unlikely(reset & I915_WEDGED);
-}
-
static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
{
- return __i915_reset_in_progress(i915_reset_counter(error));
+ return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
}
-static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
+static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
{
- return __i915_reset_in_progress_or_wedged(i915_reset_counter(error));
+ return unlikely(test_bit(I915_WEDGED, &error->flags));
}
-static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
{
- return __i915_terminally_wedged(i915_reset_counter(error));
+ return i915_reset_in_progress(error) | i915_terminally_wedged(error);
}
static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
- return ((i915_reset_counter(error) & ~I915_WEDGED) + 1) / 2;
+ return READ_ONCE(error->reset_count);
}
void i915_gem_reset(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 838a275e7fac..c06dacdae87f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4525,7 +4525,7 @@ int i915_gem_init(struct drm_device *dev)
* for all other failure, such as an allocation failure, bail.
*/
DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
- atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
+ set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
ret = 0;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index ec613fd5e01c..24eb4b1b7540 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -233,16 +233,18 @@ void i915_gem_request_retire_upto(struct drm_i915_gem_request *req)
} while (tmp != req);
}
-static int i915_gem_check_wedge(unsigned int reset_counter, bool interruptible)
+static int i915_gem_check_wedge(struct drm_i915_private *dev_priv)
{
- if (__i915_terminally_wedged(reset_counter))
+ struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+ if (i915_terminally_wedged(error))
return -EIO;
- if (__i915_reset_in_progress(reset_counter)) {
+ if (i915_reset_in_progress(error)) {
/* Non-interruptible callers can't handle -EAGAIN, hence return
* -EIO unconditionally for these.
*/
- if (!interruptible)
+ if (!dev_priv->mm.interruptible)
return -EIO;
return -EAGAIN;
@@ -331,7 +333,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
struct i915_gem_context *ctx)
{
struct drm_i915_private *dev_priv = engine->i915;
- unsigned int reset_counter = i915_reset_counter(&dev_priv->gpu_error);
struct drm_i915_gem_request *req;
u32 seqno;
int ret;
@@ -340,7 +341,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
* EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
* and restart.
*/
- ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible);
+ ret = i915_gem_check_wedge(dev_priv);
if (ret)
return ERR_PTR(ret);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 82358d4e0cc2..3fd9cbdf2adb 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2501,53 +2501,41 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
+ DRM_DEBUG_DRIVER("resetting chip\n");
+ kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
+
/*
- * Note that there's only one work item which does gpu resets, so we
- * need not worry about concurrent gpu resets potentially incrementing
- * error->reset_counter twice. We only need to take care of another
- * racing irq/hangcheck declaring the gpu dead for a second time. A
- * quick check for that is good enough: schedule_work ensures the
- * correct ordering between hang detection and this work item, and since
- * the reset in-progress bit is only ever set by code outside of this
- * work we don't need to worry about any other races.
+ * In most cases it's guaranteed that we get here with an RPM
+ * reference held, for example because there is a pending GPU
+ * request that won't finish until the reset is done. This
+ * isn't the case at least when we get here by doing a
+ * simulated reset via debugfs, so get an RPM reference.
*/
- if (i915_reset_in_progress(&dev_priv->gpu_error)) {
- DRM_DEBUG_DRIVER("resetting chip\n");
- kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
-
- /*
- * In most cases it's guaranteed that we get here with an RPM
- * reference held, for example because there is a pending GPU
- * request that won't finish until the reset is done. This
- * isn't the case at least when we get here by doing a
- * simulated reset via debugs, so get an RPM reference.
- */
- intel_runtime_pm_get(dev_priv);
+ intel_runtime_pm_get(dev_priv);
- intel_prepare_reset(dev_priv);
+ intel_prepare_reset(dev_priv);
- /*
- * All state reset _must_ be completed before we update the
- * reset counter, for otherwise waiters might miss the reset
- * pending state and not properly drop locks, resulting in
- * deadlocks with the reset work.
- */
- ret = i915_reset(dev_priv);
+ /*
+ * All state reset _must_ be completed before we update the
+ * reset counter, for otherwise waiters might miss the reset
+ * pending state and not properly drop locks, resulting in
+ * deadlocks with the reset work.
+ */
+ ret = i915_reset(dev_priv);
- intel_finish_reset(dev_priv);
+ intel_finish_reset(dev_priv);
- intel_runtime_pm_put(dev_priv);
+ intel_runtime_pm_put(dev_priv);
- if (ret == 0)
- kobject_uevent_env(kobj,
- KOBJ_CHANGE, reset_done_event);
+ if (ret == 0)
+ kobject_uevent_env(kobj,
+ KOBJ_CHANGE, reset_done_event);
- /*
- * Note: The wake_up also serves as a memory barrier so that
- * waiters see the update value of the reset counter atomic_t.
- */
- wake_up_all(&dev_priv->gpu_error.reset_queue);
- }
+ /*
+ * Note: The wake_up also serves as a memory barrier so that
+ * waiters see the updated value of the reset flags.
+ */
+ i915_error_wake_up(dev_priv);
}
static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv)
@@ -2666,25 +2654,27 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
i915_capture_error_state(dev_priv, engine_mask, error_msg);
i915_report_and_clear_eir(dev_priv);
- if (engine_mask) {
- atomic_or(I915_RESET_IN_PROGRESS_FLAG,
- &dev_priv->gpu_error.reset_counter);
+ if (!engine_mask)
+ return;
- /*
- * Wakeup waiting processes so that the reset function
- * i915_reset_and_wakeup doesn't deadlock trying to grab
- * various locks. By bumping the reset counter first, the woken
- * processes will see a reset in progress and back off,
- * releasing their locks and then wait for the reset completion.
- * We must do this for _all_ gpu waiters that might hold locks
- * that the reset work needs to acquire.
- *
- * Note: The wake_up serves as the required memory barrier to
- * ensure that the waiters see the updated value of the reset
- * counter atomic_t.
- */
- i915_error_wake_up(dev_priv);
- }
+ if (test_and_set_bit(I915_RESET_IN_PROGRESS,
+ &dev_priv->gpu_error.flags))
+ return;
+
+ /*
+ * Wakeup waiting processes so that the reset function
+ * i915_reset_and_wakeup doesn't deadlock trying to grab
+ * various locks. By setting the reset-in-progress flag first, the
+ * woken processes will see a reset in progress and back off,
+ * releasing their locks and then wait for the reset completion.
+ * We must do this for _all_ gpu waiters that might hold locks
+ * that the reset work needs to acquire.
+ *
+ * Note: The wake_up serves as the required memory barrier to
+ * ensure that the waiters see the updated value of the reset
+ * flags.
+ */
+ i915_error_wake_up(dev_priv);
i915_reset_and_wakeup(dev_priv);
}
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 19ffd024ddec..2e63e5cfa98d 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -3643,15 +3643,26 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
mutex_unlock(&dev->mode_config.mutex);
}
+static bool abort_flip_on_reset(struct intel_crtc *crtc)
+{
+ struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
+
+ if (i915_reset_in_progress(error))
+ return true;
+
+ if (crtc->reset_count != i915_reset_count(error))
+ return true;
+
+ return false;
+}
+
static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
{
struct drm_device *dev = crtc->dev;
struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
- unsigned reset_counter;
bool pending;
- reset_counter = i915_reset_counter(&to_i915(dev)->gpu_error);
- if (intel_crtc->reset_counter != reset_counter)
+ if (abort_flip_on_reset(intel_crtc))
return false;
spin_lock_irq(&dev->event_lock);
@@ -11549,10 +11560,8 @@ static bool __pageflip_finished_cs(struct intel_crtc *crtc,
{
struct drm_device *dev = crtc->base.dev;
struct drm_i915_private *dev_priv = to_i915(dev);
- unsigned reset_counter;
- reset_counter = i915_reset_counter(&dev_priv->gpu_error);
- if (crtc->reset_counter != reset_counter)
+ if (abort_flip_on_reset(crtc))
return true;
/*
@@ -12218,8 +12227,8 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
if (ret)
goto cleanup;
- intel_crtc->reset_counter = i915_reset_counter(&dev_priv->gpu_error);
- if (__i915_reset_in_progress_or_wedged(intel_crtc->reset_counter)) {
+ intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
+ if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
ret = -EIO;
goto cleanup;
}
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 570a7ca7983f..60e1cd915b85 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -712,8 +712,8 @@ struct intel_crtc {
struct intel_crtc_state *config;
- /* reset counter value when the last flip was submitted */
- unsigned int reset_counter;
+ /* global reset count when the last flip was submitted */
+ unsigned int reset_count;
/* Access to these should be protected by dev_priv->irq_lock. */
bool cpu_fifo_underrun_disabled;
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 08/18] drm/i915: Drop local struct_mutex around intel_init_emon[ilk]
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (6 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 07/18] drm/i915: Separate out reset flags from the reset counter Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 09/18] drm/i915: Expand bool interruptible to pass flags to i915_wait_request() Chris Wilson
` (10 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Access to intel_init_emon() is strictly ordered by gt_powersave, using
struct_mutex around it is overkill (and will conflict with the caller
holding struct_mutex themselves).
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/intel_pm.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index aba6fd036c4e..5e2a33d066c4 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -6773,9 +6773,7 @@ void intel_autoenable_gt_powersave(struct drm_i915_private *dev_priv)
if (IS_IRONLAKE_M(dev_priv)) {
ironlake_enable_drps(dev_priv);
- mutex_lock(&dev_priv->drm.struct_mutex);
intel_init_emon(dev_priv);
- mutex_unlock(&dev_priv->drm.struct_mutex);
} else if (INTEL_INFO(dev_priv)->gen >= 6) {
/*
* PCU communication is slow and this doesn't need to be
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 09/18] drm/i915: Expand bool interruptible to pass flags to i915_wait_request()
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (7 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 08/18] drm/i915: Drop local struct_mutex around intel_init_emon[ilk] Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 10/18] drm/i915: Perform a direct reset of the GPU from the waiter Chris Wilson
` (9 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
We need finer control over wakeup behaviour during i915_wait_request(),
so expand the current bool interruptible to a bitmask.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
drivers/gpu/drm/i915/i915_gem.c | 2 +-
drivers/gpu/drm/i915/i915_gem_request.c | 7 ++++---
drivers/gpu/drm/i915/i915_gem_request.h | 7 ++++---
drivers/gpu/drm/i915/intel_display.c | 8 ++++----
drivers/gpu/drm/i915/intel_ringbuffer.c | 3 ++-
5 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c06dacdae87f..51bb05f62686 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3746,7 +3746,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
if (target == NULL)
return 0;
- ret = i915_wait_request(target, true, NULL, NULL);
+ ret = i915_wait_request(target, I915_WAIT_INTERRUPTIBLE, NULL, NULL);
i915_gem_request_put(target);
return ret;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 24eb4b1b7540..3c600ff3e9d5 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -598,7 +598,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
/**
* i915_wait_request - wait until execution of request has finished
* @req: duh!
- * @interruptible: do an interruptible wait (normally yes)
+ * @flags: how to wait
* @timeout: in - how long to wait (NULL forever); out - how much time remaining
* @rps: client to charge for RPS boosting
*
@@ -613,11 +613,12 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
* errno with remaining time filled in timeout argument.
*/
int i915_wait_request(struct drm_i915_gem_request *req,
- bool interruptible,
+ unsigned int flags,
s64 *timeout,
struct intel_rps_client *rps)
{
- int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
+ const int state = flags & I915_WAIT_INTERRUPTIBLE ?
+ TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
DEFINE_WAIT(reset);
struct intel_wait wait;
unsigned long timeout_remain;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index a231bd318ef0..41de4d9aa3fe 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -218,10 +218,11 @@ struct intel_rps_client;
#define IS_RPS_USER(p) (!IS_ERR_OR_NULL(p))
int i915_wait_request(struct drm_i915_gem_request *req,
- bool interruptible,
+ unsigned int flags,
s64 *timeout,
struct intel_rps_client *rps)
__attribute__((nonnull(1)));
+#define I915_WAIT_INTERRUPTIBLE BIT(0)
static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine);
@@ -575,7 +576,7 @@ i915_gem_active_wait(const struct i915_gem_active *active, struct mutex *mutex)
if (!request)
return 0;
- return i915_wait_request(request, true, NULL, NULL);
+ return i915_wait_request(request, I915_WAIT_INTERRUPTIBLE, NULL, NULL);
}
/**
@@ -638,7 +639,7 @@ i915_gem_active_retire(struct i915_gem_active *active,
if (!request)
return 0;
- ret = i915_wait_request(request, true, NULL, NULL);
+ ret = i915_wait_request(request, I915_WAIT_INTERRUPTIBLE, NULL, NULL);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 2e63e5cfa98d..3e287b8f67f2 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -12038,8 +12038,7 @@ static void intel_mmio_flip_work_func(struct work_struct *w)
if (work->flip_queued_req)
WARN_ON(i915_wait_request(work->flip_queued_req,
- false, NULL,
- NO_WAITBOOST));
+ 0, NULL, NO_WAITBOOST));
/* For framebuffer backed by dmabuf, wait for fence */
resv = i915_gem_object_get_dmabuf_resv(obj);
@@ -14099,7 +14098,8 @@ static int intel_atomic_prepare_commit(struct drm_device *dev,
continue;
ret = i915_wait_request(intel_plane_state->wait_req,
- true, NULL, NULL);
+ I915_WAIT_INTERRUPTIBLE,
+ NULL, NULL);
if (ret) {
/* Any hang should be swallowed by the wait */
WARN_ON(ret == -EIO);
@@ -14317,7 +14317,7 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state)
continue;
ret = i915_wait_request(intel_plane_state->wait_req,
- true, NULL, NULL);
+ 0, NULL, NULL);
/* EIO should be eaten, and we can't get interrupted in the
* worker, and blocking commits have waited already. */
WARN_ON(ret);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index cc5bcd14b6df..a6e6f2c49299 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2223,7 +2223,8 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
if (WARN_ON(&target->ring_link == &ring->request_list))
return -ENOSPC;
- ret = i915_wait_request(target, true, NULL, NO_WAITBOOST);
+ ret = i915_wait_request(target, I915_WAIT_INTERRUPTIBLE,
+ NULL, NO_WAITBOOST);
if (ret)
return ret;
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 10/18] drm/i915: Perform a direct reset of the GPU from the waiter
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (8 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 09/18] drm/i915: Expand bool interruptible to pass flags to i915_wait_request() Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 11/18] drm/i915: Update reset path to fix incomplete requests Chris Wilson
` (8 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
If a waiter is holding the struct_mutex, then the reset worker cannot
reset the GPU until the waiter returns. We do not want to return -EAGAIN
from i915_wait_request as that breaks delicate operations like
i915_vma_unbind() which often cannot be restarted easily, and returning
-EIO is just as useless (and has in the past proven dangerous). The
remaining WARN_ON(i915_wait_request) serves as a valuable reminder that
handling errors from an indefinite wait is tricky.
We can keep the current semantic — that once a reset is complete, so is
the request — by performing the reset ourselves if we hold the
mutex.
uevent emission is still handled by the reset worker, so it may appear
slightly out of order with respect to the actual reset (and concurrent
use of the device).
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_drv.c | 11 ++++++-----
drivers/gpu/drm/i915/i915_drv.h | 15 +++------------
drivers/gpu/drm/i915/i915_gem.c | 3 ++-
drivers/gpu/drm/i915/i915_gem_request.c | 29 +++++++++++++++++++++++++----
drivers/gpu/drm/i915/i915_gem_request.h | 9 +++++++--
drivers/gpu/drm/i915/i915_irq.c | 2 ++
drivers/gpu/drm/i915/intel_ringbuffer.c | 6 ++----
7 files changed, 47 insertions(+), 28 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 01b518dcbd7a..ea7d3e87815c 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1752,6 +1752,8 @@ int i915_resume_switcheroo(struct drm_device *dev)
* Reset the chip. Useful if a hang is detected. Returns zero on successful
* reset or otherwise an error code.
*
+ * Caller must hold the struct_mutex.
+ *
* Procedure is fairly simple:
* - reset the chip using the reset reg
* - re-init context state
@@ -1766,7 +1768,10 @@ int i915_reset(struct drm_i915_private *dev_priv)
struct i915_gpu_error *error = &dev_priv->gpu_error;
int ret;
- mutex_lock(&dev->struct_mutex);
+ lockdep_assert_held(&dev->struct_mutex);
+
+ if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
+ return 0;
/* Clear any previous failed attempts at recovery. Time to try again. */
__clear_bit(I915_WEDGED, &error->flags);
@@ -1807,9 +1812,6 @@ int i915_reset(struct drm_i915_private *dev_priv)
goto error;
}
- clear_bit(I915_RESET_IN_PROGRESS, &error->flags);
- mutex_unlock(&dev->struct_mutex);
-
/*
* rps/rc6 re-init is necessary to restore state lost after the
* reset and the re-install of gt irqs. Skip for ironlake per
@@ -1823,7 +1825,6 @@ int i915_reset(struct drm_i915_private *dev_priv)
error:
set_bit(I915_WEDGED, &error->flags);
- mutex_unlock(&dev->struct_mutex);
return ret;
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e574eaa65c4d..c6dbc6b5798a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3849,7 +3849,9 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
schedule_timeout_uninterruptible(remaining_jiffies);
}
}
-static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
+
+static inline bool
+__i915_request_irq_complete(struct drm_i915_gem_request *req)
{
struct intel_engine_cs *engine = req->engine;
@@ -3911,17 +3913,6 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
return true;
}
- /* We need to check whether any gpu reset happened in between
- * the request being submitted and now. If a reset has occurred,
- * the seqno will have been advance past ours and our request
- * is complete. If we are in the process of handling a reset,
- * the request is effectively complete as the rendering will
- * be discarded, but we need to return in order to drop the
- * struct_mutex.
- */
- if (i915_reset_in_progress(&req->i915->gpu_error))
- return true;
-
return false;
}
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 51bb05f62686..cc565f785888 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2800,7 +2800,8 @@ __i915_gem_object_sync(struct drm_i915_gem_request *to,
if (!i915.semaphores) {
ret = i915_wait_request(from,
- from->i915->mm.interruptible,
+ from->i915->mm.interruptible |
+ I915_WAIT_LOCKED,
NULL,
NO_WAITBOOST);
if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 3c600ff3e9d5..10a11b7d1114 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -666,8 +666,8 @@ int i915_wait_request(struct drm_i915_gem_request *req,
if (i915_spin_request(req, state, 5))
goto complete;
- set_current_state(state);
- add_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
+ if (flags & I915_WAIT_LOCKED)
+ add_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
intel_wait_init(&wait, req->fence.seqno);
if (intel_engine_add_wait(req->engine, &wait))
@@ -692,9 +692,9 @@ int i915_wait_request(struct drm_i915_gem_request *req,
if (intel_wait_complete(&wait))
break;
+wakeup:
set_current_state(state);
-wakeup:
/* Carefully check if the request is complete, giving time
* for the seqno to be visible following the interrupt.
* We also have to check in case we are kicked by the GPU
@@ -703,11 +703,32 @@ wakeup:
if (__i915_request_irq_complete(req))
break;
+ /* If the GPU is hung, and we hold the lock, reset the GPU
+ * and then check for completion. On a full reset, the engine's
+ * HW seqno will be advanced passed us and we are complete.
+ * If we do a partial reset, we have to wait for the GPU to
+ * resume and update the breadcrumb.
+ *
+ * If we don't hold the mutex, we can just wait for the worker
+ * to come along and update the breadcrumb (either directly
+ * itself, or indirectly by recovering the GPU).
+ */
+ if (flags & I915_WAIT_LOCKED &&
+ i915_reset_in_progress(&req->i915->gpu_error)) {
+ __set_current_state(TASK_RUNNING);
+ if (!i915_reset(req->i915))
+ goto wakeup;
+
+ break;
+ }
+
/* Only spin if we know the GPU is processing this request */
if (i915_spin_request(req, state, 2))
break;
}
- remove_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
+
+ if (flags & I915_WAIT_LOCKED)
+ remove_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
intel_engine_remove_wait(req->engine, &wait);
__set_current_state(TASK_RUNNING);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 41de4d9aa3fe..49c3c006ec17 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -223,6 +223,7 @@ int i915_wait_request(struct drm_i915_gem_request *req,
struct intel_rps_client *rps)
__attribute__((nonnull(1)));
#define I915_WAIT_INTERRUPTIBLE BIT(0)
+#define I915_WAIT_LOCKED BIT(1) /* struct_mutex held, handle GPU reset */
static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine);
@@ -576,7 +577,9 @@ i915_gem_active_wait(const struct i915_gem_active *active, struct mutex *mutex)
if (!request)
return 0;
- return i915_wait_request(request, I915_WAIT_INTERRUPTIBLE, NULL, NULL);
+ return i915_wait_request(request,
+ I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
+ NULL, NULL);
}
/**
@@ -639,7 +642,9 @@ i915_gem_active_retire(struct i915_gem_active *active,
if (!request)
return 0;
- ret = i915_wait_request(request, I915_WAIT_INTERRUPTIBLE, NULL, NULL);
+ ret = i915_wait_request(request,
+ I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
+ NULL, NULL);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 3fd9cbdf2adb..0d1fb47f86c5 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2521,7 +2521,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
* pending state and not properly drop locks, resulting in
* deadlocks with the reset work.
*/
+ mutex_lock(&dev_priv->drm.struct_mutex);
ret = i915_reset(dev_priv);
+ mutex_unlock(&dev_priv->drm.struct_mutex);
intel_finish_reset(dev_priv);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index a6e6f2c49299..fa4f3731be29 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2223,14 +2223,12 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
if (WARN_ON(&target->ring_link == &ring->request_list))
return -ENOSPC;
- ret = i915_wait_request(target, I915_WAIT_INTERRUPTIBLE,
+ ret = i915_wait_request(target,
+ I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
NULL, NO_WAITBOOST);
if (ret)
return ret;
- if (i915_reset_in_progress(&target->i915->gpu_error))
- return -EAGAIN;
-
i915_gem_request_retire_upto(target);
intel_ring_update_space(ring);
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 11/18] drm/i915: Update reset path to fix incomplete requests
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (9 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 10/18] drm/i915: Perform a direct reset of the GPU from the waiter Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 12/18] drm/i915: Drive request submission through fence callbacks Chris Wilson
` (7 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Update reset path in preparation for engine reset which requires
identification of incomplete requests and associated context and fixing
their state so that engine can resume correctly after reset.
The request that caused the hang will be skipped and head is reset to the
start of the breadcrumb. This allows us to resume from where we left off.
Since this request didn't complete normally we also need to cleanup elsp
queue manually. This is vital if we employ nonblocking request
submission where we may have a web of dependencies upon the hung request
and so advancing the seqno manually is no longer trivial.
v2: Legacy ringbuffer submission required a reset following hibernation,
or else we restore stale values to the RING_HEAD and walked over
stolen garbage.
v3: GuC requires replaying the requests after a reset.
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_drv.c | 8 +-
drivers/gpu/drm/i915/i915_drv.h | 5 +-
drivers/gpu/drm/i915/i915_gem.c | 114 ++++++++++++++++-------------
drivers/gpu/drm/i915/i915_gem_context.c | 16 ----
drivers/gpu/drm/i915/i915_gem_request.c | 2 +-
drivers/gpu/drm/i915/i915_guc_submission.c | 8 +-
drivers/gpu/drm/i915/intel_engine_cs.c | 1 -
drivers/gpu/drm/i915/intel_lrc.c | 41 ++++++++++-
drivers/gpu/drm/i915/intel_lrc.h | 3 +-
drivers/gpu/drm/i915/intel_ringbuffer.c | 45 ++++++++----
drivers/gpu/drm/i915/intel_ringbuffer.h | 6 +-
11 files changed, 151 insertions(+), 98 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ea7d3e87815c..449fcfa40319 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -582,7 +582,6 @@ static void i915_gem_fini(struct drm_device *dev)
}
mutex_lock(&dev->struct_mutex);
- i915_gem_reset(dev);
i915_gem_cleanup_engines(dev);
i915_gem_context_fini(dev);
mutex_unlock(&dev->struct_mutex);
@@ -1602,7 +1601,7 @@ static int i915_drm_resume(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
if (i915_gem_init_hw(dev)) {
DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
- set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
+ i915_gem_set_wedged(dev_priv);
}
mutex_unlock(&dev->struct_mutex);
@@ -1779,8 +1778,6 @@ int i915_reset(struct drm_i915_private *dev_priv)
pr_notice("drm/i915: Resetting chip after gpu hang\n");
- i915_gem_reset(dev);
-
ret = intel_gpu_reset(dev_priv, ALL_ENGINES);
if (ret) {
if (ret != -ENODEV)
@@ -1790,6 +1787,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
goto error;
}
+ i915_gem_reset(dev_priv);
intel_overlay_reset(dev_priv);
/* Ok, now get things going again... */
@@ -1824,7 +1822,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
return 0;
error:
- set_bit(I915_WEDGED, &error->flags);
+ i915_gem_set_wedged(dev_priv);
return ret;
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c6dbc6b5798a..436eec945837 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2023,6 +2023,7 @@ struct drm_i915_private {
/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
struct {
+ void (*resume)(struct drm_i915_private *);
void (*cleanup_engine)(struct intel_engine_cs *engine);
/**
@@ -3256,7 +3257,8 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
return READ_ONCE(error->reset_count);
}
-void i915_gem_reset(struct drm_device *dev);
+void i915_gem_reset(struct drm_i915_private *dev_priv);
+void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
int __must_check i915_gem_init(struct drm_device *dev);
int __must_check i915_gem_init_hw(struct drm_device *dev);
@@ -3385,7 +3387,6 @@ void i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj);
int __must_check i915_gem_context_init(struct drm_device *dev);
void i915_gem_context_lost(struct drm_i915_private *dev_priv);
void i915_gem_context_fini(struct drm_device *dev);
-void i915_gem_context_reset(struct drm_device *dev);
int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
int i915_switch_context(struct drm_i915_gem_request *req);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cc565f785888..e5a5a7695cf2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2554,30 +2554,76 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
return NULL;
}
-static void i915_gem_reset_engine_status(struct intel_engine_cs *engine)
+static void reset_request(struct drm_i915_gem_request *request)
+{
+ void *vaddr = request->ring->vaddr;
+ u32 head;
+
+ /* As this request likely depends on state from the lost
+ * context, clear out all the user operations leaving the
+ * breadcrumb at the end (so we get the fence notifications).
+ */
+ head = request->head;
+ if (request->postfix < head) {
+ memset(vaddr + head, 0, request->ring->size - head);
+ head = 0;
+ }
+ memset(vaddr + head, 0, request->postfix - head);
+
+ i915_set_reset_status(request->ctx, false);
+}
+
+static void i915_gem_reset_engine(struct intel_engine_cs *engine)
{
struct drm_i915_gem_request *request;
+ struct i915_gem_context *incomplete_ctx;
bool ring_hung;
+ /* Ensure irq handler finishes, and not run again. */
+ tasklet_kill(&engine->irq_tasklet);
+
request = i915_gem_find_active_request(engine);
- if (request == NULL)
+ if (!request)
return;
ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
-
i915_set_reset_status(request->ctx, ring_hung);
+
+ /* Setup the CS to resume from the breadcrumb of the hung request */
+ engine->reset_hw(engine, request);
+
+ /* Users of the default context do not rely on logical state
+ * preserved between batches. They have to emit full state on
+ * every batch and so it is safe to execute queued requests following
+ * the hang.
+ *
+ * Other contexts preserve state, now corrupt. We want to skip all
+ * queued requests that reference the corrupt context.
+ */
+ incomplete_ctx = request->ctx;
+ if (i915_gem_context_is_default(incomplete_ctx))
+ return;
+
list_for_each_entry_continue(request, &engine->request_list, link)
- i915_set_reset_status(request->ctx, false);
+ if (request->ctx == incomplete_ctx)
+ reset_request(request);
}
-static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
+void i915_gem_reset(struct drm_i915_private *dev_priv)
{
- struct drm_i915_gem_request *request;
- struct intel_ring *ring;
+ struct intel_engine_cs *engine;
- /* Ensure irq handler finishes, and not run again. */
- tasklet_kill(&engine->irq_tasklet);
+ i915_gem_retire_requests(dev_priv);
+
+ for_each_engine(engine, dev_priv)
+ i915_gem_reset_engine(engine);
+
+ i915_gem_context_lost(dev_priv);
+ i915_gem_restore_fences(&dev_priv->drm);
+}
+static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
+{
/* Mark all pending requests as complete so that any concurrent
* (lockless) lookup doesn't try and wait upon the request as we
* reset it.
@@ -2599,54 +2645,19 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
spin_unlock(&engine->execlist_lock);
}
- /*
- * We must free the requests after all the corresponding objects have
- * been moved off active lists. Which is the same order as the normal
- * retire_requests function does. This is important if object hold
- * implicit references on things like e.g. ppgtt address spaces through
- * the request.
- */
- request = i915_gem_active_raw(&engine->last_request,
- &engine->i915->drm.struct_mutex);
- if (request)
- i915_gem_request_retire_upto(request);
- GEM_BUG_ON(intel_engine_is_active(engine));
-
- /* Having flushed all requests from all queues, we know that all
- * ringbuffers must now be empty. However, since we do not reclaim
- * all space when retiring the request (to prevent HEADs colliding
- * with rapid ringbuffer wraparound) the amount of available space
- * upon reset is less than when we start. Do one more pass over
- * all the ringbuffers to reset last_retired_head.
- */
- list_for_each_entry(ring, &engine->buffers, link) {
- ring->last_retired_head = ring->tail;
- intel_ring_update_space(ring);
- }
-
engine->i915->gt.active_engines &= ~intel_engine_flag(engine);
}
-void i915_gem_reset(struct drm_device *dev)
+void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
{
- struct drm_i915_private *dev_priv = to_i915(dev);
struct intel_engine_cs *engine;
- /*
- * Before we free the objects from the requests, we need to inspect
- * them for finding the guilty party. As the requests only borrow
- * their reference to the objects, the inspection must be done first.
- */
- for_each_engine(engine, dev_priv)
- i915_gem_reset_engine_status(engine);
+ lockdep_assert_held(&dev_priv->drm.struct_mutex);
+ set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
for_each_engine(engine, dev_priv)
- i915_gem_reset_engine_cleanup(engine);
+ i915_gem_cleanup_engine(engine);
mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
-
- i915_gem_context_reset(dev);
-
- i915_gem_restore_fences(dev);
}
static void
@@ -4339,8 +4350,7 @@ void i915_gem_resume(struct drm_device *dev)
* guarantee that the context image is complete. So let's just reset
* it and start again.
*/
- if (i915.enable_execlists)
- intel_lr_context_reset(dev_priv, dev_priv->kernel_context);
+ dev_priv->gt.resume(dev_priv);
mutex_unlock(&dev->struct_mutex);
}
@@ -4492,8 +4502,10 @@ int i915_gem_init(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
if (!i915.enable_execlists) {
+ dev_priv->gt.resume = intel_legacy_submission_resume;
dev_priv->gt.cleanup_engine = intel_engine_cleanup;
} else {
+ dev_priv->gt.resume = intel_lr_context_resume;
dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
}
@@ -4526,7 +4538,7 @@ int i915_gem_init(struct drm_device *dev)
* for all other failure, such as an allocation failure, bail.
*/
DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
- set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
+ i915_gem_set_wedged(dev_priv);
ret = 0;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 35950ee46a1d..df10f4e95736 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -420,22 +420,6 @@ static void i915_gem_context_unpin(struct i915_gem_context *ctx,
}
}
-void i915_gem_context_reset(struct drm_device *dev)
-{
- struct drm_i915_private *dev_priv = to_i915(dev);
-
- lockdep_assert_held(&dev->struct_mutex);
-
- if (i915.enable_execlists) {
- struct i915_gem_context *ctx;
-
- list_for_each_entry(ctx, &dev_priv->context_list, link)
- intel_lr_context_reset(dev_priv, ctx);
- }
-
- i915_gem_context_lost(dev_priv);
-}
-
int i915_gem_context_init(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = to_i915(dev);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 10a11b7d1114..f31246cf4f6e 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -716,7 +716,7 @@ wakeup:
if (flags & I915_WAIT_LOCKED &&
i915_reset_in_progress(&req->i915->gpu_error)) {
__set_current_state(TASK_RUNNING);
- if (!i915_reset(req->i915))
+ if (i915_reset(req->i915))
goto wakeup;
break;
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 32699a744aca..e51fce8c11ca 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -998,6 +998,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
struct intel_guc *guc = &dev_priv->guc;
struct i915_guc_client *client;
struct intel_engine_cs *engine;
+ struct drm_i915_gem_request *request;
/* client for execbuf submission */
client = guc_client_alloc(dev_priv,
@@ -1014,9 +1015,14 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
guc_init_doorbell_hw(guc);
/* Take over from manual control of ELSP (execlists) */
- for_each_engine(engine, dev_priv)
+ for_each_engine(engine, dev_priv) {
engine->submit_request = i915_guc_submit;
+ /* Replay the current set of previously submitted requests */
+ list_for_each_entry(request, &engine->request_list, link)
+ i915_guc_submit(request);
+ }
+
return 0;
}
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 2e96a86105c2..0dedc2fa497a 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -230,7 +230,6 @@ static void intel_engine_init_requests(struct intel_engine_cs *engine)
*/
void intel_engine_setup_common(struct intel_engine_cs *engine)
{
- INIT_LIST_HEAD(&engine->buffers);
INIT_LIST_HEAD(&engine->execlist_queue);
spin_lock_init(&engine->execlist_lock);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 864b5248279a..e2e70ecd077f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1223,6 +1223,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
+ intel_mocs_init_engine(engine);
lrc_init_hws(engine);
I915_WRITE_IMR(engine,
@@ -1242,7 +1243,10 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
intel_engine_init_hangcheck(engine);
- return intel_mocs_init_engine(engine);
+ if (!execlists_elsp_idle(engine))
+ execlists_submit_ports(engine);
+
+ return 0;
}
static int gen8_init_render_ring(struct intel_engine_cs *engine)
@@ -1278,6 +1282,36 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
return init_workarounds_ring(engine);
}
+static void reset_common_ring(struct intel_engine_cs *engine,
+ struct drm_i915_gem_request *request)
+{
+ struct drm_i915_private *dev_priv = engine->i915;
+ struct execlist_port *port = engine->execlist_port;
+ struct intel_context *ce = &request->ctx->engine[engine->id];
+
+ /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
+ ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
+ request->ring->head = request->postfix;
+ request->ring->last_retired_head = -1;
+ intel_ring_update_space(request->ring);
+
+ if (i915.enable_guc_submission)
+ return;
+
+ /* Catch up with any missed context-switch interrupts */
+ I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
+ if (request->ctx != port[0].request->ctx) {
+ i915_gem_request_put(port[0].request);
+ port[0] = port[1];
+ memset(&port[1], 0, sizeof(port[1]));
+ }
+
+ /* CS is stopped, and we will resubmit both ports on resume */
+ GEM_BUG_ON(request->ctx != port[0].request->ctx);
+ port[0].count = 0;
+ port[1].count = 0;
+}
+
static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
{
struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
@@ -1640,6 +1674,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
/* Default vfuncs which can be overriden by each engine. */
engine->init_hw = gen8_init_common_ring;
+ engine->reset_hw = reset_common_ring;
engine->emit_flush = gen8_emit_flush;
engine->emit_request = gen8_emit_request;
engine->submit_request = execlists_submit_request;
@@ -2092,9 +2127,9 @@ error_deref_obj:
return ret;
}
-void intel_lr_context_reset(struct drm_i915_private *dev_priv,
- struct i915_gem_context *ctx)
+void intel_lr_context_resume(struct drm_i915_private *dev_priv)
{
+ struct i915_gem_context *ctx = dev_priv->kernel_context;
struct intel_engine_cs *engine;
for_each_engine(engine, dev_priv) {
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 4d70346500c2..4fed8165f98a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -87,8 +87,7 @@ void intel_lr_context_unpin(struct i915_gem_context *ctx,
struct drm_i915_private;
-void intel_lr_context_reset(struct drm_i915_private *dev_priv,
- struct i915_gem_context *ctx);
+void intel_lr_context_resume(struct drm_i915_private *dev_priv);
uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index fa4f3731be29..870d6b025850 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -577,34 +577,33 @@ static int init_ring_common(struct intel_engine_cs *engine)
if (I915_READ_HEAD(engine))
DRM_DEBUG("%s initialization failed [head=%08x], fudging\n",
engine->name, I915_READ_HEAD(engine));
- I915_WRITE_HEAD(engine, 0);
- (void)I915_READ_HEAD(engine);
+
+ intel_ring_update_space(ring);
+ I915_WRITE_HEAD(engine, ring->head);
+ I915_WRITE_TAIL(engine, ring->tail);
+ (void)I915_READ_TAIL(engine);
I915_WRITE_CTL(engine,
((ring->size - PAGE_SIZE) & RING_NR_PAGES)
| RING_VALID);
/* If the head is still not zero, the ring is dead */
- if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 &&
- I915_READ_START(engine) == i915_ggtt_offset(ring->vma) &&
- (I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) {
+ if (intel_wait_for_register_fw(dev_priv, RING_CTL(engine->mmio_base),
+ RING_VALID, RING_VALID,
+ 50)) {
DRM_ERROR("%s initialization failed "
- "ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08x]\n",
+ "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
engine->name,
I915_READ_CTL(engine),
I915_READ_CTL(engine) & RING_VALID,
- I915_READ_HEAD(engine), I915_READ_TAIL(engine),
+ I915_READ_HEAD(engine), ring->head,
+ I915_READ_TAIL(engine), ring->tail,
I915_READ_START(engine),
i915_ggtt_offset(ring->vma));
ret = -EIO;
goto out;
}
- ring->last_retired_head = -1;
- ring->head = I915_READ_HEAD(engine);
- ring->tail = I915_READ_TAIL(engine) & TAIL_ADDR;
- intel_ring_update_space(ring);
-
intel_engine_init_hangcheck(engine);
out:
@@ -613,6 +612,15 @@ out:
return ret;
}
+static void reset_ring_common(struct intel_engine_cs *engine,
+ struct drm_i915_gem_request *request)
+{
+ struct intel_ring *ring = request->ring;
+
+ ring->head = request->postfix;
+ ring->last_retired_head = -1;
+}
+
static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
{
struct intel_ring *ring = req->ring;
@@ -2007,7 +2015,6 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size)
}
ring->vma = vma;
- list_add(&ring->link, &engine->buffers);
return ring;
}
@@ -2015,7 +2022,6 @@ void
intel_ring_free(struct intel_ring *ring)
{
i915_vma_put(ring->vma);
- list_del(&ring->link);
kfree(ring);
}
@@ -2169,6 +2175,16 @@ void intel_engine_cleanup(struct intel_engine_cs *engine)
engine->i915 = NULL;
}
+void intel_legacy_submission_resume(struct drm_i915_private *dev_priv)
+{
+ struct intel_engine_cs *engine;
+
+ for_each_engine(engine, dev_priv) {
+ engine->buffer->head = engine->buffer->tail;
+ engine->buffer->last_retired_head = -1;
+ }
+}
+
int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
{
int ret;
@@ -2654,6 +2670,7 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
intel_ring_init_semaphores(dev_priv, engine);
engine->init_hw = init_ring_common;
+ engine->reset_hw = reset_ring_common;
engine->emit_request = i9xx_emit_request;
if (i915.semaphores)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 2181d0a41a96..a08dbe73b0ee 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -87,7 +87,6 @@ struct intel_ring {
void *vaddr;
struct intel_engine_cs *engine;
- struct list_head link;
struct list_head request_list;
@@ -157,7 +156,6 @@ struct intel_engine_cs {
u32 mmio_base;
unsigned int irq_shift;
struct intel_ring *buffer;
- struct list_head buffers;
/* Rather than have every client wait upon all user interrupts,
* with the herd waking after every interrupt and each doing the
@@ -211,6 +209,8 @@ struct intel_engine_cs {
void (*irq_disable)(struct intel_engine_cs *engine);
int (*init_hw)(struct intel_engine_cs *engine);
+ void (*reset_hw)(struct intel_engine_cs *engine,
+ struct drm_i915_gem_request *req);
int (*init_context)(struct drm_i915_gem_request *req);
@@ -444,6 +444,8 @@ void intel_ring_free(struct intel_ring *ring);
void intel_engine_stop(struct intel_engine_cs *engine);
void intel_engine_cleanup(struct intel_engine_cs *engine);
+void intel_legacy_submission_resume(struct drm_i915_private *dev_priv);
+
int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request);
int __must_check intel_ring_begin(struct drm_i915_gem_request *req, int n);
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 12/18] drm/i915: Drive request submission through fence callbacks
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (10 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 11/18] drm/i915: Update reset path to fix incomplete requests Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-31 11:05 ` John Harrison
2016-08-30 8:18 ` [PATCH 13/18] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer Chris Wilson
` (6 subsequent siblings)
18 siblings, 1 reply; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Drive final request submission from a callback from the fence. This way
the request is queued until all dependencies are resolved, at which
point it is handed to the backend for queueing to hardware. At this
point, no dependencies are set on the request, so the callback is
immediate.
A side-effect of imposing a heavier, irq-safe spinlock for execlists
submission is that we lose the softirq enabling after scheduling the
execlists tasklet. To compensate, we manually kick-start the softirq by
disabling and re-enabling bottom halves (bh) around the fence signaling.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_gem.c | 3 +++
drivers/gpu/drm/i915/i915_gem_request.c | 26 +++++++++++++++++++++++++-
drivers/gpu/drm/i915/i915_gem_request.h | 3 +++
drivers/gpu/drm/i915/i915_guc_submission.c | 3 ++-
drivers/gpu/drm/i915/intel_breadcrumbs.c | 3 +++
drivers/gpu/drm/i915/intel_lrc.c | 5 +++--
6 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e5a5a7695cf2..4f5d12e93682 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2548,6 +2548,9 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
if (i915_gem_request_completed(request))
continue;
+ if (!i915_sw_fence_done(&request->submit))
+ break;
+
return request;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index f31246cf4f6e..89ed66275d95 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -316,6 +316,25 @@ static int i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno)
return 0;
}
+static int __i915_sw_fence_call
+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+ struct drm_i915_gem_request *request =
+ container_of(fence, typeof(*request), submit);
+
+ switch (state) {
+ case FENCE_COMPLETE:
+ request->engine->submit_request(request);
+ break;
+
+ case FENCE_FREE:
+ i915_gem_request_put(request);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
/**
* i915_gem_request_alloc - allocate a request structure
*
@@ -413,6 +432,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
*/
req->reserved_space = MIN_SPACE_FOR_ADD_REQUEST;
+ i915_sw_fence_init(&i915_gem_request_get(req)->submit, submit_notify);
+
if (i915.enable_execlists)
ret = intel_logical_ring_alloc_request_extras(req);
else
@@ -528,7 +549,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
reserved_tail, ret);
i915_gem_mark_busy(engine);
- engine->submit_request(request);
+
+ local_bh_disable();
+ i915_sw_fence_commit(&request->submit);
+ local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
}
static unsigned long local_clock_us(unsigned int *cpu)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 49c3c006ec17..50eac3174e63 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -28,6 +28,7 @@
#include <linux/fence.h>
#include "i915_gem.h"
+#include "i915_sw_fence.h"
struct intel_wait {
struct rb_node node;
@@ -82,6 +83,8 @@ struct drm_i915_gem_request {
struct intel_ring *ring;
struct intel_signal_node signaling;
+ struct i915_sw_fence submit;
+
/** GEM sequence number associated with the previous request,
* when the HWS breadcrumb is equal to this the GPU is processing
* this request.
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index e51fce8c11ca..2332f9c98bdd 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1020,7 +1020,8 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
/* Replay the current set of previously submitted requests */
list_for_each_entry(request, &engine->request_list, link)
- i915_guc_submit(request);
+ if (i915_sw_fence_done(&request->submit))
+ i915_guc_submit(request);
}
return 0;
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 2491e4c1eaf0..9bad14d22c95 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -462,7 +462,10 @@ static int intel_breadcrumbs_signaler(void *arg)
*/
intel_engine_remove_wait(engine,
&request->signaling.wait);
+
+ local_bh_disable();
fence_signal(&request->fence);
+ local_bh_enable(); /* kick start the tasklets */
/* Find the next oldest signal. Note that as we have
* not been holding the lock, another client may
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e2e70ecd077f..c23b151d662f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -590,14 +590,15 @@ static void intel_lrc_irq_handler(unsigned long data)
static void execlists_submit_request(struct drm_i915_gem_request *request)
{
struct intel_engine_cs *engine = request->engine;
+ unsigned long flags;
- spin_lock_bh(&engine->execlist_lock);
+ spin_lock_irqsave(&engine->execlist_lock, flags);
list_add_tail(&request->execlist_link, &engine->execlist_queue);
if (execlists_elsp_idle(engine))
tasklet_hi_schedule(&engine->irq_tasklet);
- spin_unlock_bh(&engine->execlist_lock);
+ spin_unlock_irqrestore(&engine->execlist_lock, flags);
}
int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 12/18] drm/i915: Drive request submission through fence callbacks
2016-08-30 8:18 ` [PATCH 12/18] drm/i915: Drive request submission through fence callbacks Chris Wilson
@ 2016-08-31 11:05 ` John Harrison
0 siblings, 0 replies; 27+ messages in thread
From: John Harrison @ 2016-08-31 11:05 UTC (permalink / raw)
To: intel-gfx
[-- Attachment #1.1: Type: text/plain, Size: 6305 bytes --]
On 30/08/2016 09:18, Chris Wilson wrote:
> Drive final request submission from a callback from the fence. This way
> the request is queued until all dependencies are resolved, at which
> point it is handed to the backend for queueing to hardware. At this
> point, no dependencies are set on the request, so the callback is
> immediate.
>
> A side-effect of imposing a heavier-irqsafe spinlock for execlist
> submission is that we lose the softirq enabling after scheduling the
> execlists tasklet. To compensate, we manually kickstart the softirq by
> disabling and enabling the bh around the fence signaling.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 3 +++
> drivers/gpu/drm/i915/i915_gem_request.c | 26 +++++++++++++++++++++++++-
> drivers/gpu/drm/i915/i915_gem_request.h | 3 +++
> drivers/gpu/drm/i915/i915_guc_submission.c | 3 ++-
> drivers/gpu/drm/i915/intel_breadcrumbs.c | 3 +++
> drivers/gpu/drm/i915/intel_lrc.c | 5 +++--
> 6 files changed, 39 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index e5a5a7695cf2..4f5d12e93682 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2548,6 +2548,9 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
> if (i915_gem_request_completed(request))
> continue;
>
> + if (!i915_sw_fence_done(&request->submit))
> + break;
> +
> return request;
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index f31246cf4f6e..89ed66275d95 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -316,6 +316,25 @@ static int i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno)
> return 0;
> }
>
> +static int __i915_sw_fence_call
> +submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +{
> + struct drm_i915_gem_request *request =
> + container_of(fence, typeof(*request), submit);
> +
> + switch (state) {
> + case FENCE_COMPLETE:
> + request->engine->submit_request(request);
As per comment on earlier version, I think this patch should also add a
comment to the declaration for submit_request to say that it must now be
IRQ context safe.
With that:
Reviewed-by: John Harrison <john.c.harrison@intel.com>
> + break;
> +
> + case FENCE_FREE:
> + i915_gem_request_put(request);
> + break;
> + }
> +
> + return NOTIFY_DONE;
> +}
> +
> /**
> * i915_gem_request_alloc - allocate a request structure
> *
> @@ -413,6 +432,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> */
> req->reserved_space = MIN_SPACE_FOR_ADD_REQUEST;
>
> + i915_sw_fence_init(&i915_gem_request_get(req)->submit, submit_notify);
> +
> if (i915.enable_execlists)
> ret = intel_logical_ring_alloc_request_extras(req);
> else
> @@ -528,7 +549,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
> reserved_tail, ret);
>
> i915_gem_mark_busy(engine);
> - engine->submit_request(request);
> +
> + local_bh_disable();
> + i915_sw_fence_commit(&request->submit);
> + local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
> }
>
> static unsigned long local_clock_us(unsigned int *cpu)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 49c3c006ec17..50eac3174e63 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -28,6 +28,7 @@
> #include <linux/fence.h>
>
> #include "i915_gem.h"
> +#include "i915_sw_fence.h"
>
> struct intel_wait {
> struct rb_node node;
> @@ -82,6 +83,8 @@ struct drm_i915_gem_request {
> struct intel_ring *ring;
> struct intel_signal_node signaling;
>
> + struct i915_sw_fence submit;
> +
> /** GEM sequence number associated with the previous request,
> * when the HWS breadcrumb is equal to this the GPU is processing
> * this request.
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index e51fce8c11ca..2332f9c98bdd 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -1020,7 +1020,8 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
> /* Replay the current set of previously submitted requests */
> list_for_each_entry(request, &engine->request_list, link)
> - i915_guc_submit(request);
> + if (i915_sw_fence_done(&request->submit))
> + i915_guc_submit(request);
> }
>
> return 0;
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 2491e4c1eaf0..9bad14d22c95 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -462,7 +462,10 @@ static int intel_breadcrumbs_signaler(void *arg)
> */
> intel_engine_remove_wait(engine,
> &request->signaling.wait);
> +
> + local_bh_disable();
> fence_signal(&request->fence);
> + local_bh_enable(); /* kick start the tasklets */
>
> /* Find the next oldest signal. Note that as we have
> * not been holding the lock, another client may
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e2e70ecd077f..c23b151d662f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -590,14 +590,15 @@ static void intel_lrc_irq_handler(unsigned long data)
> static void execlists_submit_request(struct drm_i915_gem_request *request)
> {
> struct intel_engine_cs *engine = request->engine;
> + unsigned long flags;
>
> - spin_lock_bh(&engine->execlist_lock);
> + spin_lock_irqsave(&engine->execlist_lock, flags);
>
> list_add_tail(&request->execlist_link, &engine->execlist_queue);
> if (execlists_elsp_idle(engine))
> tasklet_hi_schedule(&engine->irq_tasklet);
>
> - spin_unlock_bh(&engine->execlist_lock);
> + spin_unlock_irqrestore(&engine->execlist_lock, flags);
> }
>
> int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
[-- Attachment #1.2: Type: text/html, Size: 6751 bytes --]
[-- Attachment #2: Type: text/plain, Size: 160 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 13/18] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (11 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 12/18] drm/i915: Drive request submission through fence callbacks Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 14/18] drm/i915/guc: Prepare for nonblocking execbuf submission Chris Wilson
` (5 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
We are about to specialize object synchronisation to enable nonblocking
execbuf submission. First we make a copy of the current object
synchronisation for execbuffer. The general i915_gem_object_sync() will
be removed following the removal of CS flips in the near future.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: John Harrison <john.c.harrison@intel.com>
---
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 68 +++++++++++++++++++++++++++++-
1 file changed, 67 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 57d096aef548..1685f4aaa4c2 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1122,6 +1122,71 @@ static unsigned int eb_other_engines(struct drm_i915_gem_request *req)
}
static int
+eb_await_request(struct drm_i915_gem_request *to,
+ struct drm_i915_gem_request *from)
+{
+ int idx, ret;
+
+ if (to->engine == from->engine)
+ return 0;
+
+ idx = intel_engine_sync_index(from->engine, to->engine);
+ if (from->fence.seqno <= from->engine->semaphore.sync_seqno[idx])
+ return 0;
+
+ trace_i915_gem_ring_sync_to(to, from);
+ if (!i915.semaphores) {
+ ret = i915_wait_request(from,
+ I915_WAIT_INTERRUPTIBLE |
+ I915_WAIT_LOCKED,
+ NULL, NO_WAITBOOST);
+ if (ret)
+ return ret;
+ } else {
+ ret = to->engine->semaphore.sync_to(to, from);
+ if (ret)
+ return ret;
+ }
+
+ from->engine->semaphore.sync_seqno[idx] = from->fence.seqno;
+ return 0;
+}
+
+static int
+eb_await_bo(struct drm_i915_gem_request *to,
+ struct drm_i915_gem_object *obj,
+ bool write)
+{
+ struct i915_gem_active *active;
+ unsigned long active_mask;
+ int idx;
+
+ if (write) {
+ active_mask = i915_gem_object_get_active(obj);
+ active = obj->last_read;
+ } else {
+ active_mask = 1;
+ active = &obj->last_write;
+ }
+
+ for_each_active(active_mask, idx) {
+ struct drm_i915_gem_request *request;
+ int ret;
+
+ request = i915_gem_active_peek(&active[idx],
+ &obj->base.dev->struct_mutex);
+ if (!request)
+ continue;
+
+ ret = eb_await_request(to, request);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int
i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
struct list_head *vmas)
{
@@ -1133,7 +1198,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
struct drm_i915_gem_object *obj = vma->obj;
if (obj->flags & other_rings) {
- ret = i915_gem_object_sync(obj, req);
+ ret = eb_await_bo(req, obj,
+ obj->base.pending_write_domain);
if (ret)
return ret;
}
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 14/18] drm/i915/guc: Prepare for nonblocking execbuf submission
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (12 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 13/18] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-09-02 18:20 ` Dave Gordon
2016-08-30 8:18 ` [PATCH 15/18] drm/i915: Nonblocking request submission Chris Wilson
` (4 subsequent siblings)
18 siblings, 1 reply; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Currently the presumption is that the request construction and its
submission to the GuC are all under the same holding of struct_mutex. We
wish to relax this to separate the request construction and the later
submission to the GuC. This requires us to reserve some space in the
GuC command queue for the future submission. For flexibility in handling
out-of-order request submission, we do not preallocate the next slot in
the GuC command queue during request construction; we only ensure that
there will be enough space for it later.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_guc_submission.c | 55 ++++++++++++++----------------
drivers/gpu/drm/i915/intel_guc.h | 3 ++
2 files changed, 29 insertions(+), 29 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 2332f9c98bdd..a047e61adc2a 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -434,20 +434,23 @@ int i915_guc_wq_check_space(struct drm_i915_gem_request *request)
{
const size_t wqi_size = sizeof(struct guc_wq_item);
struct i915_guc_client *gc = request->i915->guc.execbuf_client;
- struct guc_process_desc *desc;
+ struct guc_process_desc *desc = gc->client_base + gc->proc_desc_offset;
u32 freespace;
+ int ret;
- GEM_BUG_ON(gc == NULL);
-
- desc = gc->client_base + gc->proc_desc_offset;
-
+ spin_lock(&gc->lock);
freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
- if (likely(freespace >= wqi_size))
- return 0;
-
- gc->no_wq_space += 1;
+ freespace -= gc->wq_rsvd;
+ if (likely(freespace >= wqi_size)) {
+ gc->wq_rsvd += wqi_size;
+ ret = 0;
+ } else {
+ gc->no_wq_space++;
+ ret = -EAGAIN;
+ }
+ spin_unlock(&gc->lock);
- return -EAGAIN;
+ return ret;
}
static void guc_add_workqueue_item(struct i915_guc_client *gc,
@@ -457,22 +460,9 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
const size_t wqi_size = sizeof(struct guc_wq_item);
const u32 wqi_len = wqi_size/sizeof(u32) - 1;
struct intel_engine_cs *engine = rq->engine;
- struct guc_process_desc *desc;
struct guc_wq_item *wqi;
void *base;
- u32 freespace, tail, wq_off, wq_page;
-
- desc = gc->client_base + gc->proc_desc_offset;
-
- /* Free space is guaranteed, see i915_guc_wq_check_space() above */
- freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
- GEM_BUG_ON(freespace < wqi_size);
-
- /* The GuC firmware wants the tail index in QWords, not bytes */
- tail = rq->tail;
- GEM_BUG_ON(tail & 7);
- tail >>= 3;
- GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);
+ u32 wq_off, wq_page;
/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
* should not have the case where structure wqi is across page, neither
@@ -482,18 +472,19 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
* workqueue buffer dw by dw.
*/
BUILD_BUG_ON(wqi_size != 16);
+ GEM_BUG_ON(gc->wq_rsvd < wqi_size);
/* postincrement WQ tail for next time */
wq_off = gc->wq_tail;
+ GEM_BUG_ON(wq_off & (wqi_size - 1));
gc->wq_tail += wqi_size;
gc->wq_tail &= gc->wq_size - 1;
- GEM_BUG_ON(wq_off & (wqi_size - 1));
+ gc->wq_rsvd -= wqi_size;
/* WQ starts from the page after doorbell / process_desc */
wq_page = (wq_off + GUC_DB_SIZE) >> PAGE_SHIFT;
- wq_off &= PAGE_SIZE - 1;
base = kmap_atomic(i915_gem_object_get_page(gc->vma->obj, wq_page));
- wqi = (struct guc_wq_item *)((char *)base + wq_off);
+ wqi = (struct guc_wq_item *)((char *)base + offset_in_page(wq_off));
/* Now fill in the 4-word work queue item */
wqi->header = WQ_TYPE_INORDER |
@@ -504,7 +495,7 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
/* The GuC wants only the low-order word of the context descriptor */
wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, engine);
- wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;
+ wqi->ring_tail = rq->tail >> 3 << WQ_RING_TAIL_SHIFT;
wqi->fence_id = rq->fence.seqno;
kunmap_atomic(base);
@@ -591,8 +582,10 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
struct i915_guc_client *client = guc->execbuf_client;
int b_ret;
+ spin_lock(&client->lock);
guc_add_workqueue_item(client, rq);
b_ret = guc_ring_doorbell(client);
+ spin_unlock(&client->lock);
client->submissions[engine_id] += 1;
client->retcode = b_ret;
@@ -770,6 +763,8 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
if (!client)
return NULL;
+ spin_lock_init(&client->lock);
+
client->owner = ctx;
client->guc = guc;
client->engines = engines;
@@ -1019,9 +1014,11 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
engine->submit_request = i915_guc_submit;
/* Replay the current set of previously submitted requests */
- list_for_each_entry(request, &engine->request_list, link)
+ list_for_each_entry(request, &engine->request_list, link) {
+ client->wq_rsvd += sizeof(struct guc_wq_item);
if (i915_sw_fence_done(&request->submit))
i915_guc_submit(request);
+ }
}
return 0;
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index c97326269588..27a622824b54 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -68,6 +68,8 @@ struct i915_guc_client {
struct i915_gem_context *owner;
struct intel_guc *guc;
+ spinlock_t lock;
+
uint32_t engines; /* bitmap of (host) engine ids */
uint32_t priority;
uint32_t ctx_index;
@@ -81,6 +83,7 @@ struct i915_guc_client {
uint32_t wq_offset;
uint32_t wq_size;
uint32_t wq_tail;
+ uint32_t wq_rsvd;
uint32_t no_wq_space;
uint32_t b_fail;
int retcode;
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 14/18] drm/i915/guc: Prepare for nonblocking execbuf submission
2016-08-30 8:18 ` [PATCH 14/18] drm/i915/guc: Prepare for nonblocking execbuf submission Chris Wilson
@ 2016-09-02 18:20 ` Dave Gordon
2016-09-05 8:08 ` Chris Wilson
0 siblings, 1 reply; 27+ messages in thread
From: Dave Gordon @ 2016-09-02 18:20 UTC (permalink / raw)
To: Chris Wilson, intel-gfx; +Cc: mika.kuoppala
On 30/08/16 09:18, Chris Wilson wrote:
> Currently the presumption is that the request construction and its
> submission to the GuC are all under the same holding of struct_mutex. We
> wish to relax this to separate the request construction and the later
> submission to the GuC. This requires us to reserve some space in the
> GuC command queue for the future submission. For flexibility to handle
> out-of-order request submission we do not preallocate the next slot in
> the GuC command queue during request construction, just ensuring that
> there is enough space later.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_guc_submission.c | 55 ++++++++++++++----------------
> drivers/gpu/drm/i915/intel_guc.h | 3 ++
> 2 files changed, 29 insertions(+), 29 deletions(-)
Hmm .. the functional changes look mostly OK, apart from some locking,
but there seems to be a great deal of unnecessary churn, such as
combining statements which had been kept separate for clarity :(
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 2332f9c98bdd..a047e61adc2a 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -434,20 +434,23 @@ int i915_guc_wq_check_space(struct drm_i915_gem_request *request)
> {
> const size_t wqi_size = sizeof(struct guc_wq_item);
> struct i915_guc_client *gc = request->i915->guc.execbuf_client;
> - struct guc_process_desc *desc;
> + struct guc_process_desc *desc = gc->client_base + gc->proc_desc_offset;
> u32 freespace;
> + int ret;
>
> - GEM_BUG_ON(gc == NULL);
> -
> - desc = gc->client_base + gc->proc_desc_offset;
> -
> + spin_lock(&gc->lock);
> freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
> - if (likely(freespace >= wqi_size))
> - return 0;
> -
> - gc->no_wq_space += 1;
> + freespace -= gc->wq_rsvd;
> + if (likely(freespace >= wqi_size)) {
> + gc->wq_rsvd += wqi_size;
> + ret = 0;
> + } else {
> + gc->no_wq_space++;
> + ret = -EAGAIN;
> + }
> + spin_unlock(&gc->lock);
>
> - return -EAGAIN;
> + return ret;
> }
>
> static void guc_add_workqueue_item(struct i915_guc_client *gc,
> @@ -457,22 +460,9 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
> const size_t wqi_size = sizeof(struct guc_wq_item);
> const u32 wqi_len = wqi_size/sizeof(u32) - 1;
> struct intel_engine_cs *engine = rq->engine;
> - struct guc_process_desc *desc;
> struct guc_wq_item *wqi;
> void *base;
> - u32 freespace, tail, wq_off, wq_page;
> -
> - desc = gc->client_base + gc->proc_desc_offset;
> -
> - /* Free space is guaranteed, see i915_guc_wq_check_space() above */
This comment seems to have been lost. It still applies (mutatis
mutandis), so it should be relocated to some part of the new version ...
> - freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
> - GEM_BUG_ON(freespace < wqi_size);
> -
> - /* The GuC firmware wants the tail index in QWords, not bytes */
> - tail = rq->tail;
> - GEM_BUG_ON(tail & 7);
> - tail >>= 3;
> - GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);
This *commented* sequence of statements seems clearer than the
replacement below ...
> + u32 wq_off, wq_page;
>
> /* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
> * should not have the case where structure wqi is across page, neither
> @@ -482,18 +472,19 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
> * workqueue buffer dw by dw.
> */
> BUILD_BUG_ON(wqi_size != 16);
This would be a good place to note that:
/* Reserved space is guaranteed, see i915_guc_wq_check_space() above */
> + GEM_BUG_ON(gc->wq_rsvd < wqi_size);
>
> /* postincrement WQ tail for next time */
> wq_off = gc->wq_tail;
> + GEM_BUG_ON(wq_off & (wqi_size - 1));
> gc->wq_tail += wqi_size;
> gc->wq_tail &= gc->wq_size - 1;
> - GEM_BUG_ON(wq_off & (wqi_size - 1));
> + gc->wq_rsvd -= wqi_size;
>
> /* WQ starts from the page after doorbell / process_desc */
> wq_page = (wq_off + GUC_DB_SIZE) >> PAGE_SHIFT;
> - wq_off &= PAGE_SIZE - 1;
> base = kmap_atomic(i915_gem_object_get_page(gc->vma->obj, wq_page));
> - wqi = (struct guc_wq_item *)((char *)base + wq_off);
> + wqi = (struct guc_wq_item *)((char *)base + offset_in_page(wq_off));
>
> /* Now fill in the 4-word work queue item */
> wqi->header = WQ_TYPE_INORDER |
> @@ -504,7 +495,7 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
> /* The GuC wants only the low-order word of the context descriptor */
> wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, engine);
>
> - wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;
> + wqi->ring_tail = rq->tail >> 3 << WQ_RING_TAIL_SHIFT;
This line is particularly ugly. I think it's the >> chevron << effect.
Parenthesisation would help, but it would be nicer as separate lines.
Also, there's no explanation of the "3" here, unlike the original
version above.
> wqi->fence_id = rq->fence.seqno;
>
> kunmap_atomic(base);
> @@ -591,8 +582,10 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
> struct i915_guc_client *client = guc->execbuf_client;
> int b_ret;
>
> + spin_lock(&client->lock);
> guc_add_workqueue_item(client, rq);
> b_ret = guc_ring_doorbell(client);
> + spin_unlock(&client->lock);
>
> client->submissions[engine_id] += 1;
Outside the spinlock? Do we still have the BKL during submit(), just not
during i915_guc_wq_check_space() ? If so, then guc_ring_doorbell()
doesn't strictly need to be inside the spinlock (or the lock could be
inside guc_add_workqueue_item()); but if not then the update of
submissions[] should be inside.
> client->retcode = b_ret;
> @@ -770,6 +763,8 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
> if (!client)
> return NULL;
>
> + spin_lock_init(&client->lock);
> +
> client->owner = ctx;
> client->guc = guc;
> client->engines = engines;
> @@ -1019,9 +1014,11 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
> engine->submit_request = i915_guc_submit;
>
> /* Replay the current set of previously submitted requests */
> - list_for_each_entry(request, &engine->request_list, link)
> + list_for_each_entry(request, &engine->request_list, link) {
> + client->wq_rsvd += sizeof(struct guc_wq_item);
Presumably this is being called in some context that ensures that we
don't need to hold the spinlock while updating wq_rsvd ? Maybe that
should be mentioned ...
> if (i915_sw_fence_done(&request->submit))
> i915_guc_submit(request);
> + }
> }
>
> return 0;
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index c97326269588..27a622824b54 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -68,6 +68,8 @@ struct i915_guc_client {
> struct i915_gem_context *owner;
> struct intel_guc *guc;
>
> + spinlock_t lock;
It would be helpful if this lock were annotated with a list of things it
protects. AFAICT it might be:
wq_tail
wq_rsvd
no_wq_space
cookie
b_fail and the submissions[] statistics seem to have been left
unprotected :( though of course they're not really critical
.Dave.
> +
> uint32_t engines; /* bitmap of (host) engine ids */
> uint32_t priority;
> uint32_t ctx_index;
> @@ -81,6 +83,7 @@ struct i915_guc_client {
> uint32_t wq_offset;
> uint32_t wq_size;
> uint32_t wq_tail;
> + uint32_t wq_rsvd;
> uint32_t no_wq_space;
> uint32_t b_fail;
> int retcode;
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 14/18] drm/i915/guc: Prepare for nonblocking execbuf submission
2016-09-02 18:20 ` Dave Gordon
@ 2016-09-05 8:08 ` Chris Wilson
0 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-09-05 8:08 UTC (permalink / raw)
To: Dave Gordon; +Cc: intel-gfx, mika.kuoppala
On Fri, Sep 02, 2016 at 07:20:52PM +0100, Dave Gordon wrote:
> On 30/08/16 09:18, Chris Wilson wrote:
> >Currently the presumption is that the request construction and its
> >submission to the GuC are all under the same holding of struct_mutex. We
> >wish to relax this to separate the request construction and the later
> >submission to the GuC. This requires us to reserve some space in the
> >GuC command queue for the future submission. For flexibility to handle
> >out-of-order request submission we do not preallocate the next slot in
> >the GuC command queue during request construction, just ensuring that
> >there is enough space later.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> > drivers/gpu/drm/i915/i915_guc_submission.c | 55 ++++++++++++++----------------
> > drivers/gpu/drm/i915/intel_guc.h | 3 ++
> > 2 files changed, 29 insertions(+), 29 deletions(-)
>
> Hmm .. the functional changes look mostly OK, apart from some
> locking, but there seems to be a great deal of unnecessary churn,
> such as combining statements which had been kept separate for
> clarity :(
>
> >diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> >index 2332f9c98bdd..a047e61adc2a 100644
> >--- a/drivers/gpu/drm/i915/i915_guc_submission.c
> >+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> >@@ -434,20 +434,23 @@ int i915_guc_wq_check_space(struct drm_i915_gem_request *request)
> > {
> > const size_t wqi_size = sizeof(struct guc_wq_item);
> > struct i915_guc_client *gc = request->i915->guc.execbuf_client;
> >- struct guc_process_desc *desc;
> >+ struct guc_process_desc *desc = gc->client_base + gc->proc_desc_offset;
> > u32 freespace;
> >+ int ret;
> >
> >- GEM_BUG_ON(gc == NULL);
> >-
> >- desc = gc->client_base + gc->proc_desc_offset;
> >-
> >+ spin_lock(&gc->lock);
> > freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
> >- if (likely(freespace >= wqi_size))
> >- return 0;
> >-
> >- gc->no_wq_space += 1;
> >+ freespace -= gc->wq_rsvd;
> >+ if (likely(freespace >= wqi_size)) {
> >+ gc->wq_rsvd += wqi_size;
> >+ ret = 0;
> >+ } else {
> >+ gc->no_wq_space++;
> >+ ret = -EAGAIN;
> >+ }
> >+ spin_unlock(&gc->lock);
> >
> >- return -EAGAIN;
> >+ return ret;
> > }
> >
> > static void guc_add_workqueue_item(struct i915_guc_client *gc,
> >@@ -457,22 +460,9 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
> > const size_t wqi_size = sizeof(struct guc_wq_item);
> > const u32 wqi_len = wqi_size/sizeof(u32) - 1;
> > struct intel_engine_cs *engine = rq->engine;
> >- struct guc_process_desc *desc;
> > struct guc_wq_item *wqi;
> > void *base;
> >- u32 freespace, tail, wq_off, wq_page;
> >-
> >- desc = gc->client_base + gc->proc_desc_offset;
> >-
> >- /* Free space is guaranteed, see i915_guc_wq_check_space() above */
>
> This comment seems to have been lost. It still applies (mutatis
> mutandis), so it should be relocated to some part of the new version
> ...
>
> >- freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
> >- GEM_BUG_ON(freespace < wqi_size);
> >-
> >- /* The GuC firmware wants the tail index in QWords, not bytes */
> >- tail = rq->tail;
> >- GEM_BUG_ON(tail & 7);
> >- tail >>= 3;
> >- GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);
>
> This *commented* sequence of statements seems clearer than the
> replacement below ...
>
> >+ u32 wq_off, wq_page;
> >
> > /* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
> > * should not have the case where structure wqi is across page, neither
> >@@ -482,18 +472,19 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
> > * workqueue buffer dw by dw.
> > */
> > BUILD_BUG_ON(wqi_size != 16);
>
> This would be a good place to note that:
>
> /* Reserved space is guaranteed, see i915_guc_wq_check_space() above */
>
> >+ GEM_BUG_ON(gc->wq_rsvd < wqi_size);
> >
> > /* postincrement WQ tail for next time */
> > wq_off = gc->wq_tail;
> >+ GEM_BUG_ON(wq_off & (wqi_size - 1));
> > gc->wq_tail += wqi_size;
> > gc->wq_tail &= gc->wq_size - 1;
> >- GEM_BUG_ON(wq_off & (wqi_size - 1));
> >+ gc->wq_rsvd -= wqi_size;
> >
> > /* WQ starts from the page after doorbell / process_desc */
> > wq_page = (wq_off + GUC_DB_SIZE) >> PAGE_SHIFT;
> >- wq_off &= PAGE_SIZE - 1;
> > base = kmap_atomic(i915_gem_object_get_page(gc->vma->obj, wq_page));
> >- wqi = (struct guc_wq_item *)((char *)base + wq_off);
> >+ wqi = (struct guc_wq_item *)((char *)base + offset_in_page(wq_off));
> >
> > /* Now fill in the 4-word work queue item */
> > wqi->header = WQ_TYPE_INORDER |
> >@@ -504,7 +495,7 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
> > /* The GuC wants only the low-order word of the context descriptor */
> > wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, engine);
> >
> >- wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;
> >+ wqi->ring_tail = rq->tail >> 3 << WQ_RING_TAIL_SHIFT;
>
> This line is particularly ugly. I think it's the >> chevron << effect.
> Parenthesisation would help, but it would be nicer as separate lines.
> Also, there's no explanation of the "3" here, unlike the original
> version above.
Correct. The code was and still is ugly.
> > wqi->fence_id = rq->fence.seqno;
> >
> > kunmap_atomic(base);
> >@@ -591,8 +582,10 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
> > struct i915_guc_client *client = guc->execbuf_client;
> > int b_ret;
> >
> >+ spin_lock(&client->lock);
> > guc_add_workqueue_item(client, rq);
> > b_ret = guc_ring_doorbell(client);
> >+ spin_unlock(&client->lock);
> >
> > client->submissions[engine_id] += 1;
>
> Outside the spinlock? Do we still have the BKL during submit(), just
> not during i915_guc_wq_check_space() ? If so, then
> guc_ring_doorbell() doesn't strictly need to be inside the spinlock
> (or the lock could be inside guc_add_workqueue_item()); but if not
> then the update of submissions[] should be inside.
Nope, this code is just garbage, so I didn't bother polishing it.
> > client->retcode = b_ret;
> >@@ -770,6 +763,8 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
> > if (!client)
> > return NULL;
> >
> >+ spin_lock_init(&client->lock);
> >+
> > client->owner = ctx;
> > client->guc = guc;
> > client->engines = engines;
> >@@ -1019,9 +1014,11 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
> > engine->submit_request = i915_guc_submit;
> >
> > /* Replay the current set of previously submitted requests */
> >- list_for_each_entry(request, &engine->request_list, link)
> >+ list_for_each_entry(request, &engine->request_list, link) {
> >+ client->wq_rsvd += sizeof(struct guc_wq_item);
>
> Presumably this is being called in some context that ensures that we
> don't need to hold the spinlock while updating wq_rsvd ? Maybe that
> should be mentioned ...
Obvious.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 15/18] drm/i915: Nonblocking request submission
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (13 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 14/18] drm/i915/guc: Prepare for nonblocking execbuf submission Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-09-02 15:49 ` John Harrison
2016-08-30 8:18 ` [PATCH 16/18] drm/i915: Serialise execbuf operation after a dma-buf reservation object Chris Wilson
` (3 subsequent siblings)
18 siblings, 1 reply; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Now that we have fences in place to drive request submission, we can
employ those to queue requests after their dependencies as opposed to
stalling in the middle of an execbuf ioctl. (However, we still choose to
spin before enabling the IRQ as that is faster - though contentious.)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++++++------
drivers/gpu/drm/i915/i915_gem_request.c | 14 +++++++++++++-
2 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 1685f4aaa4c2..0c8e447ffdbb 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1136,12 +1136,13 @@ eb_await_request(struct drm_i915_gem_request *to,
trace_i915_gem_ring_sync_to(to, from);
if (!i915.semaphores) {
- ret = i915_wait_request(from,
- I915_WAIT_INTERRUPTIBLE |
- I915_WAIT_LOCKED,
- NULL, NO_WAITBOOST);
- if (ret)
- return ret;
+ if (!i915_spin_request(from, TASK_INTERRUPTIBLE, 2)) {
+ ret = i915_sw_fence_await_dma_fence(&to->submit,
+ &from->fence,
+ GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+ }
} else {
ret = to->engine->semaphore.sync_to(to, from);
if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 89ed66275d95..5837660502cd 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -352,7 +352,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
struct i915_gem_context *ctx)
{
struct drm_i915_private *dev_priv = engine->i915;
- struct drm_i915_gem_request *req;
+ struct drm_i915_gem_request *req, *prev;
u32 seqno;
int ret;
@@ -448,6 +448,18 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
*/
req->head = req->ring->tail;
+ prev = i915_gem_active_peek(&engine->last_request,
+ &req->i915->drm.struct_mutex);
+ if (prev) {
+ ret = i915_sw_fence_await_sw_fence(&req->submit,
+ &prev->submit,
+ GFP_KERNEL);
+ if (ret < 0) {
+ i915_add_request(req);
+ return ERR_PTR(ret);
+ }
+ }
+
return req;
err_ctx:
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 15/18] drm/i915: Nonblocking request submission
2016-08-30 8:18 ` [PATCH 15/18] drm/i915: Nonblocking request submission Chris Wilson
@ 2016-09-02 15:49 ` John Harrison
0 siblings, 0 replies; 27+ messages in thread
From: John Harrison @ 2016-09-02 15:49 UTC (permalink / raw)
To: intel-gfx
On 30/08/2016 09:18, Chris Wilson wrote:
> Now that we have fences in place to drive request submission, we can
> employ those to queue requests after their dependencies as opposed to
> stalling in the middle of an execbuf ioctl. (However, we still choose to
> spin before enabling the IRQ as that is faster - though contentious.)
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++++++------
> drivers/gpu/drm/i915/i915_gem_request.c | 14 +++++++++++++-
> 2 files changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 1685f4aaa4c2..0c8e447ffdbb 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1136,12 +1136,13 @@ eb_await_request(struct drm_i915_gem_request *to,
>
> trace_i915_gem_ring_sync_to(to, from);
> if (!i915.semaphores) {
> - ret = i915_wait_request(from,
> - I915_WAIT_INTERRUPTIBLE |
> - I915_WAIT_LOCKED,
> - NULL, NO_WAITBOOST);
> - if (ret)
> - return ret;
> + if (!i915_spin_request(from, TASK_INTERRUPTIBLE, 2)) {
> + ret = i915_sw_fence_await_dma_fence(&to->submit,
> + &from->fence,
> + GFP_KERNEL);
To finish the discussion here:
> > Why not use 'i915_sw_fence_await_sw_fence(from->submit)' as below?
> > Or conversely, why not use '_await_dma_fence(prev->fence)' below?
>
> On the same engine the requests are in execution order and so once the
> first is ready to submit so are its dependents. This extends naturally
> to timelines. Between engines, we have to wait until the request is
> complete before we can submit the second (note this applies to the
> !semaphore branch).
Doh. Yes, req->submit is signalled when the request is submitted but
req->fence is signalled when the request completes. For some reason, I
was thinking the two were actually signalled together.
> + if (ret < 0)
> + return ret;
> + }
> } else {
> ret = to->engine->semaphore.sync_to(to, from);
> if (ret)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 89ed66275d95..5837660502cd 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -352,7 +352,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> struct i915_gem_context *ctx)
> {
> struct drm_i915_private *dev_priv = engine->i915;
> - struct drm_i915_gem_request *req;
> + struct drm_i915_gem_request *req, *prev;
> u32 seqno;
> int ret;
>
> @@ -448,6 +448,18 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> */
> req->head = req->ring->tail;
>
> + prev = i915_gem_active_peek(&engine->last_request,
> + &req->i915->drm.struct_mutex);
> + if (prev) {
> + ret = i915_sw_fence_await_sw_fence(&req->submit,
> + &prev->submit,
> + GFP_KERNEL);
> + if (ret < 0) {
> + i915_add_request(req);
Isn't this an old version of the patch? I thought you re-issued it with
the ordering changed to avoid this add_request() on sync failure?
> + return ERR_PTR(ret);
> + }
> + }
> +
> return req;
>
> err_ctx:
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 16/18] drm/i915: Serialise execbuf operation after a dma-buf reservation object
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (14 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 15/18] drm/i915: Nonblocking request submission Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 17/18] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
` (2 subsequent siblings)
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Now that we can wait upon fences before emitting the request, it becomes
trivial to wait upon any implicit fence provided by the dma-buf
reservation object.
Testcase: igt/prime_vgem/fence-wait
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: John Harrison <john.c.harrison@intel.com>
---
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0c8e447ffdbb..b63f9327fc40 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1197,6 +1197,7 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
list_for_each_entry(vma, vmas, exec_list) {
struct drm_i915_gem_object *obj = vma->obj;
+ struct reservation_object *resv;
if (obj->flags & other_rings) {
ret = eb_await_bo(req, obj,
@@ -1205,6 +1206,16 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
return ret;
}
+ resv = i915_gem_object_get_dmabuf_resv(obj);
+ if (resv) {
+ ret = i915_sw_fence_await_reservation
+ (&req->submit, resv, &i915_fence_ops,
+ obj->base.pending_write_domain,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (ret < 0)
+ return ret;
+ }
+
if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
i915_gem_clflush_object(obj, false);
}
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 17/18] drm/i915: Enable userspace to opt-out of implicit fencing
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (15 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 16/18] drm/i915: Serialise execbuf operation after a dma-buf reservation object Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-08-30 8:18 ` [PATCH 18/18] drm/i915: Support explicit fencing for execbuf Chris Wilson
2016-08-30 8:57 ` ✓ Fi.CI.BAT: success for series starting with [01/18] drm/i915: Add a sw fence for collecting up dma fences Patchwork
18 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Userspace is faced with a dilemma. The kernel requires implicit fencing
to manage resource usage (we always must wait for the GPU to finish
before releasing its PTE) and for third parties. However, userspace may
wish to avoid this serialisation if it is either using explicit fencing
between parties and wants more fine-grained access to buffers (e.g. it
may partition the buffer between uses and track fences on ranges rather
than the implicit fences tracking the whole object). To whit, userspace
needs a mechanism to avoid the kernel's serialisation on its implicit
fences before execbuf execution.
The next question is whether this is an object, execbuf or context flag.
Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
shared winsys buffers, but implicit fencing on internal surfaces)
require a per-object level flag. Given that this flag need to be only
set once for the lifetime of the object, this reduces the convenience of
having an execbuf or context level flag (and avoids having multiple
pieces of uABI controlling the same feature).
Incorrect use of this flag will result in rendering corruption and GPU
hangs - but will not result in use-after-free or similar resource
tracking issues.
Serious caveat: write ordering is not strictly correct after setting
this flag on a render target on multiple engines. This affects all
subsequent GEM operations (execbuf, set-domain, pread) and shared
dma-buf operations. A fix is possible - but costly (both in terms of
further ABI changes and runtime overhead).
Testcase: igt/gem_exec_async
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_drv.c | 3 +++
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 30 ++++++++++++++++--------------
include/uapi/drm/i915_drm.h | 24 +++++++++++++++++++++++-
3 files changed, 42 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 449fcfa40319..437ab0695373 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -362,6 +362,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
*/
value = i915_gem_mmap_gtt_version();
break;
+ case I915_PARAM_HAS_EXEC_ASYNC:
+ value = 1;
+ break;
default:
DRM_DEBUG("Unknown parameter %d\n", param->param);
return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index b63f9327fc40..ca7675c9c1b7 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1199,21 +1199,23 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
struct drm_i915_gem_object *obj = vma->obj;
struct reservation_object *resv;
- if (obj->flags & other_rings) {
- ret = eb_await_bo(req, obj,
- obj->base.pending_write_domain);
- if (ret)
- return ret;
- }
+ if ((vma->exec_entry->flags & EXEC_OBJECT_ASYNC) == 0) {
+ if (obj->flags & other_rings) {
+ ret = eb_await_bo(req, obj,
+ obj->base.pending_write_domain);
+ if (ret)
+ return ret;
+ }
- resv = i915_gem_object_get_dmabuf_resv(obj);
- if (resv) {
- ret = i915_sw_fence_await_reservation
- (&req->submit, resv, &i915_fence_ops,
- obj->base.pending_write_domain,
- GFP_KERNEL | __GFP_NOWARN);
- if (ret < 0)
- return ret;
+ resv = i915_gem_object_get_dmabuf_resv(obj);
+ if (resv) {
+ ret = i915_sw_fence_await_reservation
+ (&req->submit, resv, &i915_fence_ops,
+ obj->base.pending_write_domain,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (ret < 0)
+ return ret;
+ }
}
if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 03725fe89859..730cba23f982 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -388,6 +388,10 @@ typedef struct drm_i915_irq_wait {
#define I915_PARAM_HAS_POOLED_EU 38
#define I915_PARAM_MIN_EU_IN_POOL 39
#define I915_PARAM_MMAP_GTT_VERSION 40
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to opt-out of
+ * synchronisation with implicit fencing on individual objects.
+ */
+#define I915_PARAM_HAS_EXEC_ASYNC 41
typedef struct drm_i915_getparam {
__s32 param;
@@ -729,8 +733,26 @@ struct drm_i915_gem_exec_object2 {
#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
#define EXEC_OBJECT_PINNED (1<<4)
#define EXEC_OBJECT_PAD_TO_SIZE (1<<5)
+/* The kernel implicitly tracks GPU activity on all GEM objects, and
+ * synchronises operations with outstanding rendering. This includes
+ * rendering on other devices if exported via dma-buf. However, sometimes
+ * this tracking is too coarse and the user knows better. For example,
+ * if the object is split into non-overlapping ranges shared between different
+ * clients or engines (i.e. suballocating objects), the implicit tracking
+ * by kernel assumes that each operation affects the whole object rather
+ * than an individual range, causing needless synchronisation between clients.
+ *
+ * The kernel maintains the implicit tracking in order to manage resources
+ * used by the GPU - this flag only disables the synchronisation prior to
+ * rendering with this object in this execbuf.
+ *
+ * Opting out of implicit synchronisation requires the user to do its own
+ * explicit tracking to avoid rendering corruption. See, for example,
+ * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
+ */
+#define EXEC_OBJECT_ASYNC (1<<6)
/* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_PAD_TO_SIZE<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
__u64 flags;
union {
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH 18/18] drm/i915: Support explicit fencing for execbuf
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (16 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 17/18] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
@ 2016-08-30 8:18 ` Chris Wilson
2016-09-02 15:56 ` John Harrison
2016-08-30 8:57 ` ✓ Fi.CI.BAT: success for series starting with [01/18] drm/i915: Add a sw fence for collecting up dma fences Patchwork
18 siblings, 1 reply; 27+ messages in thread
From: Chris Wilson @ 2016-08-30 8:18 UTC (permalink / raw)
To: intel-gfx; +Cc: mika.kuoppala
Now that the user can opt-out of implicit fencing, we need to give them
back control over the fencing. We employ sync_file to wrap our
drm_i915_gem_request and provide an fd that userspace can merge with
other sync_file fds and pass back to the kernel to wait upon before
future execution.
Testcase: igt/gem_exec_fence
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/Kconfig | 1 +
drivers/gpu/drm/i915/i915_drv.c | 5 +-
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 82 ++++++++++++++++++++++++++++--
include/uapi/drm/i915_drm.h | 36 ++++++++++++-
4 files changed, 117 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 7769e469118f..319ca27ea719 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -18,6 +18,7 @@ config DRM_I915
select INPUT if ACPI
select ACPI_VIDEO if ACPI
select ACPI_BUTTON if ACPI
+ select SYNC_FILE
help
Choose this option if you have a system that has "Intel Graphics
Media Accelerator" or "HD Graphics" integrated graphics,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 437ab0695373..492c4d4e5ebc 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -365,6 +365,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
case I915_PARAM_HAS_EXEC_ASYNC:
value = 1;
break;
+ case I915_PARAM_HAS_EXEC_FENCE:
+ value = 1;
+ break;
default:
DRM_DEBUG("Unknown parameter %d\n", param->param);
return -EINVAL;
@@ -2552,7 +2555,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
DRM_IOCTL_DEF_DRV(I915_HWS_ADDR, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
DRM_IOCTL_DEF_DRV(I915_GEM_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
- DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
DRM_IOCTL_DEF_DRV(I915_GEM_UNPIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
DRM_IOCTL_DEF_DRV(I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index ca7675c9c1b7..98042fc26404 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -28,6 +28,7 @@
#include <linux/dma_remapping.h>
#include <linux/reservation.h>
+#include <linux/sync_file.h>
#include <linux/uaccess.h>
#include <drm/drmP.h>
@@ -1153,6 +1154,35 @@ eb_await_request(struct drm_i915_gem_request *to,
return 0;
}
+static int eb_await_fence(struct drm_i915_gem_request *req, struct fence *fence)
+{
+ struct fence_array *array;
+ int i;
+
+ if (fence_is_i915(fence))
+ return eb_await_request(req, to_request(fence));
+
+ if (!fence_is_array(fence))
+ return i915_sw_fence_await_dma_fence(&req->submit,
+ fence, GFP_KERNEL);
+
+ array = to_fence_array(fence);
+ for (i = 0; i < array->num_fences; i++) {
+ struct fence *child = array->fences[i];
+ int ret;
+
+ if (fence_is_i915(child))
+ ret = eb_await_request(req, to_request(child));
+ else
+ ret = i915_sw_fence_await_dma_fence(&req->submit,
+ child, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
static int
eb_await_bo(struct drm_i915_gem_request *to,
struct drm_i915_gem_object *obj,
@@ -1701,6 +1731,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
struct i915_execbuffer_params *params = ¶ms_master;
const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
u32 dispatch_flags;
+ struct fence *in_fence = NULL;
+ struct sync_file *out_fence = NULL;
+ int out_fence_fd = -1;
int ret;
bool need_relocs;
@@ -1744,6 +1777,23 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
dispatch_flags |= I915_DISPATCH_RS;
}
+ if (args->flags & I915_EXEC_FENCE_IN) {
+ in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
+ if (!in_fence) {
+ ret = -EINVAL;
+ goto pre_mutex_err;
+ }
+ }
+
+ if (args->flags & I915_EXEC_FENCE_OUT) {
+ out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
+ if (out_fence_fd < 0) {
+ ret = out_fence_fd;
+ out_fence_fd = -1;
+ goto pre_mutex_err;
+ }
+ }
+
/* Take a local wakeref for preparing to dispatch the execbuf as
* we expect to access the hardware fairly frequently in the
* process. Upon first dispatch, we acquire another prolonged
@@ -1888,6 +1938,20 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
goto err_batch_unpin;
}
+ if (in_fence) {
+ ret = eb_await_fence(params->request, in_fence);
+ if (ret < 0)
+ goto err_request;
+ }
+
+ if (out_fence_fd != -1) {
+ out_fence = sync_file_create(fence_get(¶ms->request->fence));
+ if (!out_fence) {
+ ret = -ENOMEM;
+ goto err_request;
+ }
+ }
+
/* Whilst this request exists, batch_obj will be on the
* active_list, and so will hold the active reference. Only when this
+	 * request is retired will the batch_obj be moved onto the
@@ -1915,6 +1979,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
ret = execbuf_submit(params, args, &eb->vmas);
err_request:
__i915_add_request(params->request, ret == 0);
+ if (out_fence) {
+ if (ret == 0) {
+ fd_install(out_fence_fd, out_fence->file);
+ args->rsvd2 &= ~((u64)0xffffffff << 32);
+ args->rsvd2 |= (u64)out_fence_fd << 32;
+ out_fence_fd = -1;
+ } else {
+ fput(out_fence->file);
+ }
+ }
err_batch_unpin:
/*
@@ -1936,6 +2010,9 @@ pre_mutex_err:
/* intel_gpu_busy should also get a ref, so it will free when the device
* is really idle. */
intel_runtime_pm_put(dev_priv);
+ if (out_fence_fd != -1)
+ put_unused_fd(out_fence_fd);
+ fence_put(in_fence);
return ret;
}
@@ -2043,11 +2120,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
return -EINVAL;
}
- if (args->rsvd2 != 0) {
- DRM_DEBUG("dirty rvsd2 field\n");
- return -EINVAL;
- }
-
exec2_list = drm_malloc_gfp(args->buffer_count,
sizeof(*exec2_list),
GFP_TEMPORARY);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 730cba23f982..735fd509fb31 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -246,6 +246,7 @@ typedef struct _drm_i915_sarea {
#define DRM_I915_OVERLAY_PUT_IMAGE 0x27
#define DRM_I915_OVERLAY_ATTRS 0x28
#define DRM_I915_GEM_EXECBUFFER2 0x29
+#define DRM_I915_GEM_EXECBUFFER2_WR DRM_I915_GEM_EXECBUFFER2
#define DRM_I915_GET_SPRITE_COLORKEY 0x2a
#define DRM_I915_SET_SPRITE_COLORKEY 0x2b
#define DRM_I915_GEM_WAIT 0x2c
@@ -279,6 +280,7 @@ typedef struct _drm_i915_sarea {
#define DRM_IOCTL_I915_GEM_INIT DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
#define DRM_IOCTL_I915_GEM_EXECBUFFER DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
#define DRM_IOCTL_I915_GEM_EXECBUFFER2 DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
#define DRM_IOCTL_I915_GEM_PIN DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
#define DRM_IOCTL_I915_GEM_UNPIN DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
#define DRM_IOCTL_I915_GEM_BUSY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -393,6 +395,13 @@ typedef struct drm_i915_irq_wait {
*/
#define I915_PARAM_HAS_EXEC_ASYNC 41
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports explicit fence support -
+ * both being able to pass in a sync_file fd to wait upon before executing,
+ * and being able to return a new sync_file fd that is signaled when the
+ * current request is complete.
+ */
+#define I915_PARAM_HAS_EXEC_FENCE 42
+
typedef struct drm_i915_getparam {
__s32 param;
/*
@@ -842,7 +851,32 @@ struct drm_i915_gem_execbuffer2 {
*/
#define I915_EXEC_RESOURCE_STREAMER (1<<15)
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
+/* Setting I915_EXEC_FENCE_IN implies that lower_32_bits(rsvd2) represents
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_IN (1<<16)
+
+/* Setting I915_EXEC_FENCE_OUT causes the ioctl to return a sync_file fd
+ * in the upper_32_bits(rsvd2) upon success. Ownership of the fd is given
+ * to the caller, and it should be closed after use. (The fd is a regular
+ * file descriptor and will be cleaned up on process termination. It holds
+ * a reference to the request, but nothing else.)
+ *
+ * The sync_file fd can be combined with other sync_file and passed either
+ * to execbuf using I915_EXEC_FENCE_IN, to atomic KMS ioctls (so that a flip
+ * will only occur after this request completes), or to other devices.
+ *
+ * Using I915_EXEC_FENCE_OUT requires use of
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR ioctl so that the result is written
+ * back to userspace. Failure to do so will cause the out-fence to always
+ * be reported as zero, and the real fence fd to be leaked.
+ */
+#define I915_EXEC_FENCE_OUT (1<<17)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
#define I915_EXEC_CONTEXT_ID_MASK (0xffffffff)
#define i915_execbuffer2_set_context_id(eb2, context) \
--
2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 18/18] drm/i915: Support explicit fencing for execbuf
2016-08-30 8:18 ` [PATCH 18/18] drm/i915: Support explicit fencing for execbuf Chris Wilson
@ 2016-09-02 15:56 ` John Harrison
0 siblings, 0 replies; 27+ messages in thread
From: John Harrison @ 2016-09-02 15:56 UTC (permalink / raw)
To: intel-gfx
On 30/08/2016 09:18, Chris Wilson wrote:
> Now that the user can opt-out of implicit fencing, we need to give them
> back control over the fencing. We employ sync_file to wrap our
> drm_i915_gem_request and provide an fd that userspace can merge with
> other sync_file fds and pass back to the kernel to wait upon before
> future execution.
> + args->rsvd2 &= ~((u64)0xffffffff << 32);
> + args->rsvd2 |= (u64)out_fence_fd << 32;
>
I didn't see a response to the suggestion of renaming rsvd2 to something
more useful or even replacing it with a pair of u32 values instead. At
the very least, I think it is worth adding a comment to the structure
declaration to say that the field is actually in use for something now.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 27+ messages in thread
* ✓ Fi.CI.BAT: success for series starting with [01/18] drm/i915: Add a sw fence for collecting up dma fences
2016-08-30 8:17 Non-blocking fences, now GuC compatible Chris Wilson
` (17 preceding siblings ...)
2016-08-30 8:18 ` [PATCH 18/18] drm/i915: Support explicit fencing for execbuf Chris Wilson
@ 2016-08-30 8:57 ` Patchwork
2016-08-30 9:04 ` Chris Wilson
18 siblings, 1 reply; 27+ messages in thread
From: Patchwork @ 2016-08-30 8:57 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: series starting with [01/18] drm/i915: Add a sw fence for collecting up dma fences
URL : https://patchwork.freedesktop.org/series/11768/
State : success
== Summary ==
Series 11768v1 Series without cover letter
http://patchwork.freedesktop.org/api/1.0/series/11768/revisions/1/mbox/
fi-bdw-5557u total:252 pass:236 dwarn:0 dfail:0 fail:1 skip:15
fi-bsw-n3050 total:252 pass:206 dwarn:0 dfail:0 fail:0 skip:46
fi-hsw-4770k total:252 pass:229 dwarn:0 dfail:0 fail:1 skip:22
fi-hsw-4770r total:252 pass:225 dwarn:0 dfail:0 fail:1 skip:26
fi-ivb-3520m total:252 pass:221 dwarn:0 dfail:0 fail:0 skip:31
fi-skl-6260u total:252 pass:237 dwarn:0 dfail:0 fail:1 skip:14
fi-skl-6700k total:252 pass:223 dwarn:0 dfail:0 fail:1 skip:28
fi-snb-2520m total:252 pass:208 dwarn:0 dfail:0 fail:1 skip:43
fi-snb-2600 total:252 pass:208 dwarn:0 dfail:0 fail:1 skip:43
Results at /archive/results/CI_IGT_test/Patchwork_2451/
57de27e40b9741c17c6749a366e891faf8b22fcb drm-intel-nightly: 2016y-08m-29d-15h-38m-26s UTC integration manifest
7b22d68 drm/i915: Support explicit fencing for execbuf
69db14e drm/i915: Enable userspace to opt-out of implicit fencing
b1f0ee6 drm/i915: Serialise execbuf operation after a dma-buf reservation object
6d93d84 drm/i915: Nonblocking request submission
754970b drm/i915/guc: Prepare for nonblocking execbuf submission
79f5293 drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer
c76a525 drm/i915: Drive request submission through fence callbacks
b1e83c2 drm/i915: Update reset path to fix incomplete requests
307a0d2 drm/i915: Perform a direct reset of the GPU from the waiter
6de04b7 drm/i915: Expand bool interruptible to pass flags to i915_wait_request()
24273c8 drm/i915: Drop local struct_mutex around intel_init_emon[ilk]
2415581 drm/i915: Separate out reset flags from the reset counter
2d5c2fb drm/i915: Simplify ELSP queue request tracking
c7e000c drm/i915: Reorder submitting the requests to ELSP
5b65ab2 drm/i915: Compute the ELSP register location once
9dbc988 drm/i915: Record the position of the workarounds in the tail of the request
e41ddb5 drm/i915: Only queue requests during execlists submission
0ca57da drm/i915: Add a sw fence for collecting up dma fences
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 27+ messages in thread