* Shortest path to EGL_ANDROID_native_sync
@ 2016-08-25  9:08 Chris Wilson
  2016-08-25  9:08 ` [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences Chris Wilson
                   ` (20 more replies)
  0 siblings, 21 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC
  To: intel-gfx

Features: PRIME fencing, explicit sync_file fencing, and nonblocking execbuf
dispatch, at a marginal cost (~10%) to throughput and latency.
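
For context, the userspace side of the explicit sync_file fencing looks
roughly like the sketch below (illustrative only; it uses the
EGL_ANDROID_native_fence_sync extension with the typedefs from eglext.h,
and omits all error handling):

	#include <EGL/egl.h>
	#include <EGL/eglext.h>
	#include <GLES2/gl2.h>

	int get_render_fence_fd(EGLDisplay dpy)
	{
		PFNEGLCREATESYNCKHRPROC create_sync =
			(PFNEGLCREATESYNCKHRPROC)
			eglGetProcAddress("eglCreateSyncKHR");
		PFNEGLDUPNATIVEFENCEFDANDROIDPROC dup_fd =
			(PFNEGLDUPNATIVEFENCEFDANDROIDPROC)
			eglGetProcAddress("eglDupNativeFenceFDANDROID");

		/* Insert a fence behind the commands queued so far... */
		EGLSyncKHR sync =
			create_sync(dpy, EGL_SYNC_NATIVE_FENCE_ANDROID, NULL);

		/* ...flush so the fence is submitted with the batch... */
		glFlush();

		/* ...and extract a sync_file fd others can poll/wait on. */
		return dup_fd(dpy, sync);
	}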

Missing: all the performance and general regression fixes.
-Chris


* [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25 10:42   ` John Harrison
  2016-08-25 11:49   ` Joonas Lahtinen
  2016-08-25  9:08 ` [PATCH 02/13] drm/i915: Only queue requests during execlists submission Chris Wilson
                   ` (19 subsequent siblings)
  20 siblings, 2 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC
  To: intel-gfx

This is really a core kernel struct in disguise until we can finally
place it in kernel/. There is an immediate need for a fence collection
mechanism that is more flexible than fence-array, in particular being
able to easily drive request submission via events (and not just
interrupt driven). The same mechanism would be useful for handling
nonblocking and asynchronous atomic modesets, parallel execution and
more, but for the time being just create a local sw fence for execbuf.
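
As a rough sketch of the intended usage (a hedged illustration; the
submit_notify() callback and its caller below are hypothetical, not part
of this patch): __i915_sw_fence_call aligns the callback so that its low
two bits stay clear and can double as the flag bits packed into
fence->flags.

	static int __i915_sw_fence_call
	submit_notify(struct i915_sw_fence *fence)
	{
		/* All awaited fences have signaled; kick submission here. */
		return NOTIFY_DONE;
	}

	static void example(struct i915_sw_fence *submit,
			    struct fence *in_fence)
	{
		i915_sw_fence_init(submit, submit_notify);

		/* Call any number of times to accumulate dependencies. */
		i915_sw_fence_await_dma_fence(submit, in_fence, GFP_KERNEL);

		/* Drop the initial pending count; submit_notify() runs as
		 * soon as it reaches zero (possibly inside this call).
		 */
		i915_sw_fence_commit(submit);
	}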

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile        |   1 +
 drivers/gpu/drm/i915/i915_sw_fence.c | 329 +++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_sw_fence.h |  56 ++++++
 3 files changed, 386 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/i915_sw_fence.c
 create mode 100644 drivers/gpu/drm/i915/i915_sw_fence.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a7da24640e88..a998c2bce70a 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -16,6 +16,7 @@ i915-y := i915_drv.o \
 	  i915_params.o \
 	  i915_pci.o \
 	  i915_suspend.o \
+	  i915_sw_fence.o \
 	  i915_sysfs.o \
 	  intel_csr.o \
 	  intel_device_info.o \
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
new file mode 100644
index 000000000000..44a02d836a94
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -0,0 +1,329 @@
+/*
+ * (C) Copyright 2016 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/slab.h>
+#include <linux/fence.h>
+#include <linux/reservation.h>
+
+#include "i915_sw_fence.h"
+
+static DEFINE_SPINLOCK(i915_sw_fence_lock);
+
+static int __i915_sw_fence_notify(struct i915_sw_fence *fence)
+{
+	i915_sw_fence_notify_t fn;
+
+	fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK);
+	return fn(fence);
+}
+
+static void i915_sw_fence_free(struct kref *kref)
+{
+	struct i915_sw_fence *fence = container_of(kref, typeof(*fence), kref);
+
+	WARN_ON(atomic_read(&fence->pending) > 0);
+
+	if (fence->flags & I915_SW_FENCE_MASK)
+		WARN_ON(__i915_sw_fence_notify(fence) != NOTIFY_DONE);
+	else
+		kfree(fence);
+}
+
+static void i915_sw_fence_put(struct i915_sw_fence *fence)
+{
+	kref_put(&fence->kref, i915_sw_fence_free);
+}
+
+static struct i915_sw_fence *i915_sw_fence_get(struct i915_sw_fence *fence)
+{
+	kref_get(&fence->kref);
+	return fence;
+}
+
+static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence,
+				 struct list_head *continuation)
+{
+	wait_queue_head_t *x = &fence->wait;
+	unsigned long flags;
+
+	atomic_dec(&fence->pending);
+
+	/*
+	 * To prevent unbounded recursion as we traverse the graph of i915_sw_fences,
+	 * we move the task_list from this, the next ready fence, to the tail
+	 * of the original fence's task_list (and so it is added to the list
+	 * to be woken).
+	 */
+	smp_mb__before_spinlock();
+	if (!list_empty_careful(&x->task_list)) {
+		spin_lock_irqsave_nested(&x->lock, flags, 1 + !!continuation);
+		if (continuation) {
+			list_splice_tail_init(&x->task_list, continuation);
+		} else {
+			LIST_HEAD(extra);
+
+			do {
+				__wake_up_locked_key(x, TASK_NORMAL, &extra);
+
+				if (list_empty(&extra))
+					break;
+
+				list_splice_tail_init(&extra, &x->task_list);
+			} while (1);
+		}
+		spin_unlock_irqrestore(&x->lock, flags);
+	}
+}
+
+static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
+				     struct list_head *continuation)
+{
+	if (!atomic_dec_and_test(&fence->pending))
+		return;
+
+	if (fence->flags & I915_SW_FENCE_MASK &&
+	    __i915_sw_fence_notify(fence) != NOTIFY_DONE)
+		return;
+
+	__i915_sw_fence_wake_up_all(fence, continuation);
+}
+
+static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+{
+	if (WARN_ON(i915_sw_fence_done(fence)))
+		return;
+
+	__i915_sw_fence_complete(fence, NULL);
+}
+
+static void i915_sw_fence_await(struct i915_sw_fence *fence)
+{
+	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
+}
+
+static void i915_sw_fence_wait(struct i915_sw_fence *fence)
+{
+	wait_event(fence->wait, i915_sw_fence_done(fence));
+}
+
+void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn)
+{
+	BUG_ON((unsigned long)fn & ~I915_SW_FENCE_MASK);
+
+	init_waitqueue_head(&fence->wait);
+	kref_init(&fence->kref);
+	atomic_set(&fence->pending, 1);
+	fence->flags = (unsigned long)fn;
+}
+
+void i915_sw_fence_commit(struct i915_sw_fence *fence)
+{
+	i915_sw_fence_complete(fence);
+	i915_sw_fence_put(fence);
+}
+
+static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key)
+{
+	list_del(&wq->task_list);
+	__i915_sw_fence_complete(wq->private, key);
+	i915_sw_fence_put(wq->private);
+	kfree(wq);
+	return 0;
+}
+
+static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
+				    const struct i915_sw_fence * const signaler)
+{
+	wait_queue_t *wq;
+
+	if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
+		return false;
+
+	if (fence == signaler)
+		return true;
+
+	list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+		if (wq->func != i915_sw_fence_wake)
+			continue;
+
+		if (__i915_sw_fence_check_if_after(wq->private, signaler))
+			return true;
+	}
+
+	return false;
+}
+
+static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence)
+{
+	wait_queue_t *wq;
+
+	if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
+		return;
+
+	list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+		if (wq->func != i915_sw_fence_wake)
+			continue;
+
+		__i915_sw_fence_clear_checked_bit(wq->private);
+	}
+}
+
+static bool i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
+				  const struct i915_sw_fence * const signaler)
+{
+	unsigned long flags;
+	bool err;
+
+	if (!IS_ENABLED(CONFIG_I915_SW_FENCE_CHECK_DAG))
+		return false;
+
+	spin_lock_irqsave(&i915_sw_fence_lock, flags);
+	err = __i915_sw_fence_check_if_after(fence, signaler);
+	__i915_sw_fence_clear_checked_bit(fence);
+	spin_unlock_irqrestore(&i915_sw_fence_lock, flags);
+
+	return err;
+}
+
+static wait_queue_t *__i915_sw_fence_create_wq(struct i915_sw_fence *fence, gfp_t gfp)
+{
+	wait_queue_t *wq;
+
+	wq = kmalloc(sizeof(*wq), gfp);
+	if (unlikely(!wq))
+		return NULL;
+
+	INIT_LIST_HEAD(&wq->task_list);
+	wq->flags = 0;
+	wq->func = i915_sw_fence_wake;
+	wq->private = i915_sw_fence_get(fence);
+
+	i915_sw_fence_await(fence);
+
+	return wq;
+}
+
+int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
+				 struct i915_sw_fence *signaler,
+				 gfp_t gfp)
+{
+	wait_queue_t *wq;
+	unsigned long flags;
+	int pending;
+
+	if (i915_sw_fence_done(signaler))
+		return 0;
+
+	/* The dependency graph must be acyclic. */
+	if (unlikely(i915_sw_fence_check_if_after(fence, signaler)))
+		return -EINVAL;
+
+	wq = __i915_sw_fence_create_wq(fence, gfp);
+	if (unlikely(!wq)) {
+		if (!gfpflags_allow_blocking(gfp))
+			return -ENOMEM;
+
+		i915_sw_fence_wait(signaler);
+		return 0;
+	}
+
+	spin_lock_irqsave(&signaler->wait.lock, flags);
+	if (likely(!i915_sw_fence_done(signaler))) {
+		__add_wait_queue_tail(&signaler->wait, wq);
+		pending = 1;
+	} else {
+		i915_sw_fence_wake(wq, 0, 0, NULL);
+		pending = 0;
+	}
+	spin_unlock_irqrestore(&signaler->wait.lock, flags);
+
+	return pending;
+}
+
+struct dma_fence_cb {
+	struct fence_cb base;
+	struct i915_sw_fence *fence;
+};
+
+static void dma_i915_sw_fence_wake(struct fence *dma, struct fence_cb *data)
+{
+	struct dma_fence_cb *cb = container_of(data, typeof(*cb), base);
+
+	i915_sw_fence_commit(cb->fence);
+	kfree(cb);
+}
+
+int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+				  struct fence *dma, gfp_t gfp)
+{
+	struct dma_fence_cb *cb;
+	int ret;
+
+	if (fence_is_signaled(dma))
+		return 0;
+
+	cb = kmalloc(sizeof(*cb), gfp);
+	if (!cb) {
+		if (!gfpflags_allow_blocking(gfp))
+			return -ENOMEM;
+
+		return fence_wait(dma, false);
+	}
+
+	cb->fence = i915_sw_fence_get(fence);
+	i915_sw_fence_await(fence);
+
+	ret = fence_add_callback(dma, &cb->base, dma_i915_sw_fence_wake);
+	if (ret == 0) {
+		ret = 1;
+	} else {
+		dma_i915_sw_fence_wake(dma, &cb->base);
+		if (ret == -ENOENT) /* fence already signaled */
+			ret = 0;
+	}
+
+	return ret;
+}
+
+int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
+				    struct reservation_object *resv,
+				    const struct fence_ops *exclude,
+				    bool write,
+				    gfp_t gfp)
+{
+	struct fence *excl, **shared;
+	unsigned int count, i;
+	int ret;
+
+	ret = reservation_object_get_fences_rcu(resv, &excl, &count, &shared);
+	if (ret)
+		return ret;
+
+	if (write) {
+		for (i = 0; i < count; i++) {
+			if (shared[i]->ops == exclude)
+				continue;
+
+			ret |= i915_sw_fence_await_dma_fence(fence, shared[i], gfp);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+	if (excl && excl->ops != exclude)
+		ret |= i915_sw_fence_await_dma_fence(fence, excl, gfp);
+
+out:
+	fence_put(excl);
+	for (i = 0; i < count; i++)
+		fence_put(shared[i]);
+	kfree(shared);
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
new file mode 100644
index 000000000000..ac99c7965436
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -0,0 +1,56 @@
+/*
+ * i915_sw_fence.h - library routines for N:M synchronisation points
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#ifndef _I915_SW_FENCE_H_
+#define _I915_SW_FENCE_H_
+
+#include <linux/gfp.h>
+#include <linux/kref.h>
+#include <linux/notifier.h> /* for NOTIFY_DONE */
+#include <linux/wait.h>
+
+struct completion;
+struct fence;
+struct fence_ops;
+struct reservation_object;
+
+struct i915_sw_fence {
+	wait_queue_head_t wait;
+	unsigned long flags;
+	struct kref kref;
+	atomic_t pending;
+};
+
+#define I915_SW_FENCE_CHECKED_BIT	0 /* used internally for DAG checking */
+#define I915_SW_FENCE_PRIVATE_BIT	1 /* available for use by owner */
+#define I915_SW_FENCE_MASK		(~3)
+
+typedef int (*i915_sw_fence_notify_t)(struct i915_sw_fence *);
+#define __i915_sw_fence_call __aligned(4)
+
+void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
+void i915_sw_fence_commit(struct i915_sw_fence *fence);
+
+int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
+				 struct i915_sw_fence *after,
+				 gfp_t gfp);
+int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+				  struct fence *dma, gfp_t gfp);
+int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
+			     struct reservation_object *resv,
+			     const struct fence_ops *exclude,
+			     bool write,
+			     gfp_t gfp);
+
+static inline bool i915_sw_fence_done(const struct i915_sw_fence *fence)
+{
+	return atomic_read(&fence->pending) < 0;
+}
+
+#endif /* _I915_SW_FENCE_H_ */
-- 
2.9.3


* [PATCH 02/13] drm/i915: Only queue requests during execlists submission
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
  2016-08-25  9:08 ` [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-26  9:41   ` Mika Kuoppala
  2016-08-25  9:08 ` [PATCH 03/13] drm/i915: Record the position of the workarounds in the tail of the request Chris Wilson
                   ` (18 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC
  To: intel-gfx

Leave the more complicated request dequeueing to the tasklet, and instead
just kick-start the tasklet if we detect that we are adding the first
request to an empty queue.
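
The empty-queue test relies on list_add_tail() leaving the new element's
->prev pointing back at the list head only when the list was previously
empty. In isolation (a sketch with hypothetical struct names):

	static void submit(struct engine *e, struct request *rq)
	{
		spin_lock_bh(&e->lock);
		list_add_tail(&rq->link, &e->queue);
		if (rq->link.prev == &e->queue) /* first and only element */
			tasklet_hi_schedule(&e->tasklet);
		spin_unlock_bh(&e->lock);
	}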

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 6b49df4316f4..ca52b8e63305 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -609,35 +609,15 @@ static void intel_lrc_irq_handler(unsigned long data)
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
-	struct drm_i915_gem_request *cursor;
-	int num_elements = 0;
 
 	spin_lock_bh(&engine->execlist_lock);
 
-	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
-		if (++num_elements > 2)
-			break;
-
-	if (num_elements > 2) {
-		struct drm_i915_gem_request *tail_req;
-
-		tail_req = list_last_entry(&engine->execlist_queue,
-					   struct drm_i915_gem_request,
-					   execlist_link);
-
-		if (request->ctx == tail_req->ctx) {
-			WARN(tail_req->elsp_submitted != 0,
-				"More than 2 already-submitted reqs queued\n");
-			list_del(&tail_req->execlist_link);
-			i915_gem_request_put(tail_req);
-		}
-	}
-
 	i915_gem_request_get(request);
-	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	request->ctx_hw_id = request->ctx->hw_id;
-	if (num_elements == 0)
-		execlists_unqueue(engine);
+
+	list_add_tail(&request->execlist_link, &engine->execlist_queue);
+	if (request->execlist_link.prev == &engine->execlist_queue)
+		tasklet_hi_schedule(&engine->irq_tasklet);
 
 	spin_unlock_bh(&engine->execlist_lock);
 }
-- 
2.9.3


* [PATCH 03/13] drm/i915: Record the position of the workarounds in the tail of the request
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
  2016-08-25  9:08 ` [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences Chris Wilson
  2016-08-25  9:08 ` [PATCH 02/13] drm/i915: Only queue requests during execlists submission Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:35   ` Mika Kuoppala
  2016-08-25  9:08 ` [PATCH 04/13] drm/i915: Compute the ELSP register location once Chris Wilson
                   ` (17 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC
  To: intel-gfx

Rather than blindly assuming we need to advance the tail by a fixed amount
when resubmitting the request via the ELSP, record the position of the tail
after the workaround padding has been emitted.
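
The resulting bookkeeping, sketched as an annotated fragment of the emit
path (the layout comment is illustrative; the code mirrors the diff below):

	/*
	 *   rq->head ..... start of the request's commands in the ring
	 *   rq->postfix .. start of the breadcrumb
	 *   rq->tail ..... end of the request proper; used for normal
	 *                  ELSP submission
	 *   rq->wa_tail .. end of the WaIdleLiteRestore MI_NOOP padding;
	 *                  used on resubmission so ring:HEAD != req:TAIL
	 */
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);
	request->wa_tail = ring->tail;	/* record rather than recompute */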

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.h | 15 +++++++++------
 drivers/gpu/drm/i915/intel_lrc.c        |  4 ++--
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 91014de8bfbc..2faa3bb4c39b 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -88,20 +88,23 @@ struct drm_i915_gem_request {
 	 */
 	u32 previous_seqno;
 
-	/** Position in the ringbuffer of the start of the request */
+	/** Position in the ring of the start of the request */
 	u32 head;
 
 	/**
-	 * Position in the ringbuffer of the start of the postfix.
-	 * This is required to calculate the maximum available ringbuffer
-	 * space without overwriting the postfix.
+	 * Position in the ring of the start of the postfix.
+	 * This is required to calculate the maximum available ring space
+	 * without overwriting the postfix.
 	 */
 	u32 postfix;
 
-	/** Position in the ringbuffer of the end of the whole request */
+	/** Position in the ring of the end of the whole request */
 	u32 tail;
 
-	/** Preallocate space in the ringbuffer for the emitting the request */
+	/** Position in the ring of the end of any workarounds after the tail */
+	u32 wa_tail;
+
+	/** Preallocate space in the ring for the emitting the request */
 	u32 reserved_space;
 
 	/**
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ca52b8e63305..25e0972e7166 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -482,8 +482,7 @@ static void execlists_unqueue(struct intel_engine_cs *engine)
 		 * resubmit the request. See gen8_emit_request() for where we
 		 * prepare the padding after the end of the request.
 		 */
-		req0->tail += 8;
-		req0->tail &= req0->ring->size - 1;
+		req0->tail = req0->wa_tail;
 	}
 
 	execlists_elsp_submit_contexts(req0, req1);
@@ -711,6 +710,7 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_advance(ring);
+	request->wa_tail = ring->tail;
 
 	/* We keep the previous context alive until we retire the following
 	 * request. This ensures that the context object is still pinned
-- 
2.9.3


* [PATCH 04/13] drm/i915: Compute the ELSP register location once
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (2 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 03/13] drm/i915: Record the position of the workarounds in the tail of the request Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:51   ` Mika Kuoppala
  2016-08-25  9:08 ` [PATCH 05/13] drm/i915: Reorder submitting the requests to ELSP Chris Wilson
                   ` (16 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC
  To: intel-gfx

Similar to the issue with reading from the context status buffer, we
frequently write to the ELSP register (4 writes per interrupt) and know
we hold the required spinlock and forcewake throughout. We can
short-circuit I915_WRITE() by precomputing the address of the ELSP
register and so avoid all of its per-access checks.
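
In isolation the pattern is plain pointer arithmetic on the ioremapped
register block (a sketch of what the diff below does; it is safe only
because the caller holds the uncore spinlock and forcewake for the whole
sequence):

	u32 __iomem *elsp =
		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));

	/* Raw writel() bypasses the per-access forcewake and debug
	 * checks that I915_WRITE()/I915_WRITE_FW() would repeat for
	 * each of the four writes per submission.
	 */
	writel(upper_32_bits(desc[1]), elsp);
	writel(lower_32_bits(desc[1]), elsp);
	writel(upper_32_bits(desc[0]), elsp);
	writel(lower_32_bits(desc[0]), elsp);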

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 25e0972e7166..15873e69e7fe 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -331,10 +331,11 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
 static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 				 struct drm_i915_gem_request *rq1)
 {
-
 	struct intel_engine_cs *engine = rq0->engine;
 	struct drm_i915_private *dev_priv = rq0->i915;
-	uint64_t desc[2];
+	u32 __iomem *elsp =
+		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u64 desc[2];
 
 	if (rq1) {
 		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
@@ -347,15 +348,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 	rq0->elsp_submitted++;
 
 	/* You must always write both descriptors in the order below. */
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
+	writel(upper_32_bits(desc[1]), elsp);
+	writel(lower_32_bits(desc[1]), elsp);
 
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
+	writel(upper_32_bits(desc[0]), elsp);
 	/* The context is automatically loaded after the following */
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
-
-	/* ELSP is a wo register, use another nearby reg for posting */
-	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
+	writel(lower_32_bits(desc[0]), elsp);
 }
 
 static void
-- 
2.9.3


* [PATCH 05/13] drm/i915: Reorder submitting the requests to ELSP
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (3 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 04/13] drm/i915: Compute the ELSP register location once Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25 13:02   ` Mika Kuoppala
  2016-08-25  9:08 ` [PATCH 06/13] drm/i915: Simplify ELSP queue request tracking Chris Wilson
                   ` (15 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC
  To: intel-gfx

Just rearrange the code to reduce churn in the next patch.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c | 76 ++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 15873e69e7fe..d0a1f14a4f14 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -328,32 +328,18 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
 	return ctx->engine[engine->id].lrc_desc;
 }
 
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
-				 struct drm_i915_gem_request *rq1)
+static inline void
+execlists_context_status_change(struct drm_i915_gem_request *rq,
+				unsigned long status)
 {
-	struct intel_engine_cs *engine = rq0->engine;
-	struct drm_i915_private *dev_priv = rq0->i915;
-	u32 __iomem *elsp =
-		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
-	u64 desc[2];
-
-	if (rq1) {
-		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
-		rq1->elsp_submitted++;
-	} else {
-		desc[1] = 0;
-	}
-
-	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
-	rq0->elsp_submitted++;
-
-	/* You must always write both descriptors in the order below. */
-	writel(upper_32_bits(desc[1]), elsp);
-	writel(lower_32_bits(desc[1]), elsp);
+	/*
+	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
+	 * the compiler should eliminate this function as dead code.
+	 */
+	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
+		return;
 
-	writel(upper_32_bits(desc[0]), elsp);
-	/* The context is automatically loaded after the following */
-	writel(lower_32_bits(desc[0]), elsp);
+	atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
 }
 
 static void
@@ -382,6 +368,34 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
 		execlists_update_context_pdps(ppgtt, reg_state);
 }
 
+static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
+				 struct drm_i915_gem_request *rq1)
+{
+	struct intel_engine_cs *engine = rq0->engine;
+	struct drm_i915_private *dev_priv = rq0->i915;
+	u32 __iomem *elsp =
+		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u64 desc[2];
+
+	if (rq1) {
+		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
+		rq1->elsp_submitted++;
+	} else {
+		desc[1] = 0;
+	}
+
+	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
+	rq0->elsp_submitted++;
+
+	/* You must always write both descriptors in the order below. */
+	writel(upper_32_bits(desc[1]), elsp);
+	writel(lower_32_bits(desc[1]), elsp);
+
+	writel(upper_32_bits(desc[0]), elsp);
+	/* The context is automatically loaded after the following */
+	writel(lower_32_bits(desc[0]), elsp);
+}
+
 static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
 					   struct drm_i915_gem_request *rq1)
 {
@@ -402,20 +416,6 @@ static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
 	spin_unlock_irq(&dev_priv->uncore.lock);
 }
 
-static inline void execlists_context_status_change(
-		struct drm_i915_gem_request *rq,
-		unsigned long status)
-{
-	/*
-	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
-	 * The compiler should eliminate this function as dead-code.
-	 */
-	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
-		return;
-
-	atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
-}
-
 static void execlists_unqueue(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
-- 
2.9.3


* [PATCH 06/13] drm/i915: Simplify ELSP queue request tracking
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (4 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 05/13] drm/i915: Reorder submitting the requests to ELSP Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH 07/13] drm/i915: Update reset path to fix incomplete requests Chris Wilson
                   ` (14 subsequent siblings)
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC
  To: intel-gfx; +Cc: Mika Kuoppala

Emulate the HW to track and manage the ELSP queue. A set of SW ports is
defined, and requests are assigned to these ports before being submitted
to HW. This makes it easier to clean up incomplete requests during reset
recovery, especially after an engine reset, by decoupling the ELSP queue
management. This will become clearer in the next patch.

In the engine reset case we want to resume where we left off after
skipping the incomplete batch, which requires checking the ELSP queue,
removing elements and fixing elsp_submitted counts in some cases. Instead
of directly manipulating the ELSP queue from the reset path, we can
examine these ports, fix up the ringbuffer pointers using the incomplete
request, and restart submission after the reset.
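
Schematically, the two SW ports mirror the two ELSP slots (the
port_complete() helper below is a hypothetical distillation of the
interrupt handler in the diff, shown without locking):

	struct execlist_port {
		struct drm_i915_gem_request *request; /* owns a reference */
		unsigned count; /* submissions outstanding on HW */
	};

	/* On a completion event, port[0] retires and port[1] slides
	 * forward. A lite-restore resubmission only bumps port[0].count,
	 * so the request is not released until its final submission
	 * completes.
	 */
	static void port_complete(struct execlist_port port[2])
	{
		if (--port[0].count == 0) {
			i915_gem_request_put(port[0].request);
			port[0] = port[1];
			memset(&port[1], 0, sizeof(port[1]));
		}
	}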

Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-3-git-send-email-arun.siluvery@linux.intel.com
---
 drivers/gpu/drm/i915/i915_debugfs.c     |   2 +-
 drivers/gpu/drm/i915/i915_gem.c         |  19 +-
 drivers/gpu/drm/i915/i915_gem_request.c |   1 -
 drivers/gpu/drm/i915/i915_gem_request.h |  21 +-
 drivers/gpu/drm/i915/intel_lrc.c        | 361 +++++++++++---------------------
 drivers/gpu/drm/i915/intel_lrc.h        |   2 -
 drivers/gpu/drm/i915/intel_ringbuffer.h |   7 +-
 7 files changed, 146 insertions(+), 267 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 7d7b4d9280e9..27f5d08384d2 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2074,7 +2074,7 @@ static int i915_execlists(struct seq_file *m, void *data)
 		status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
 		seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
 
-		read_pointer = engine->next_context_status_buffer;
+		read_pointer = GEN8_CSB_READ_PTR(status_pointer);
 		write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
 		if (read_pointer > write_pointer)
 			write_pointer += GEN8_CSB_ENTRIES;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 04607d4115d6..23ac08819950 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2520,8 +2520,15 @@ static void i915_gem_reset_engine_status(struct intel_engine_cs *engine)
 static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *request;
+	unsigned long flags;
 	struct intel_ring *ring;
 
+	/* Ensure irq handler finishes or is cancelled, and not run again. */
+	local_irq_save(flags);
+	tasklet_kill(&engine->irq_tasklet);
+	tasklet_disable(&engine->irq_tasklet);
+	local_irq_restore(flags);
+
 	/* Mark all pending requests as complete so that any concurrent
 	 * (lockless) lookup doesn't try and wait upon the request as we
 	 * reset it.
@@ -2535,10 +2542,12 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
 	 */
 
 	if (i915.enable_execlists) {
-		/* Ensure irq handler finishes or is cancelled. */
-		tasklet_kill(&engine->irq_tasklet);
-
-		intel_execlists_cancel_requests(engine);
+		spin_lock(&engine->execlist_lock);
+		INIT_LIST_HEAD(&engine->execlist_queue);
+		i915_gem_request_put(engine->execlist_port[0].request);
+		i915_gem_request_put(engine->execlist_port[1].request);
+		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+		spin_unlock(&engine->execlist_lock);
 	}
 
 	/*
@@ -2567,6 +2576,8 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
 	}
 
 	engine->i915->gt.active_engines &= ~intel_engine_flag(engine);
+
+	tasklet_enable(&engine->irq_tasklet);
 }
 
 void i915_gem_reset(struct drm_device *dev)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 1a215320cefb..fa5e36de55d0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -402,7 +402,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	req->previous_context = NULL;
 	req->file_priv = NULL;
 	req->batch = NULL;
-	req->elsp_submitted = 0;
 
 	/*
 	 * Reserve space in the ring buffer for all the commands required to
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 2faa3bb4c39b..a231bd318ef0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -137,27 +137,8 @@ struct drm_i915_gem_request {
 	/** file_priv list entry for this request */
 	struct list_head client_list;
 
-	/**
-	 * The ELSP only accepts two elements at a time, so we queue
-	 * context/tail pairs on a given queue (ring->execlist_queue) until the
-	 * hardware is available. The queue serves a double purpose: we also use
-	 * it to keep track of the up to 2 contexts currently in the hardware
-	 * (usually one in execution and the other queued up by the GPU): We
-	 * only remove elements from the head of the queue when the hardware
-	 * informs us that an element has been completed.
-	 *
-	 * All accesses to the queue are mediated by a spinlock
-	 * (ring->execlist_lock).
-	 */
-
-	/** Execlist link in the submission queue.*/
+	/** Link in the execlist submission queue, guarded by execlist_lock. */
 	struct list_head execlist_link;
-
-	/** Execlists no. of times this request has been sent to the ELSP */
-	int elsp_submitted;
-
-	/** Execlists context hardware id. */
-	unsigned int ctx_hw_id;
 };
 
 extern const struct fence_ops i915_fence_ops;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d0a1f14a4f14..325c1273450f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -156,6 +156,11 @@
 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
 
+#define GEN8_CTX_STATUS_COMPLETED_MASK \
+	 (GEN8_CTX_STATUS_ACTIVE_IDLE | \
+	  GEN8_CTX_STATUS_PREEMPTED | \
+	  GEN8_CTX_STATUS_ELEMENT_SWITCH)
+
 #define CTX_LRI_HEADER_0		0x01
 #define CTX_CONTEXT_CONTROL		0x02
 #define CTX_RING_HEAD			0x04
@@ -263,12 +268,10 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
-	if (IS_GEN8(dev_priv) || IS_GEN9(dev_priv))
-		engine->idle_lite_restore_wa = ~0;
-
-	engine->disable_lite_restore_wa = (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
-					IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
-					(engine->id == VCS || engine->id == VCS2);
+	engine->disable_lite_restore_wa =
+		(IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
+		 IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
+		(engine->id == VCS || engine->id == VCS2);
 
 	engine->ctx_desc_template = GEN8_CTX_VALID;
 	if (IS_GEN8(dev_priv))
@@ -351,11 +354,11 @@ execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
 	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
 }
 
-static void execlists_update_context(struct drm_i915_gem_request *rq)
+static u64 execlists_update_context(struct drm_i915_gem_request *rq)
 {
-	struct intel_engine_cs *engine = rq->engine;
+	struct intel_context *ce = &rq->ctx->engine[rq->engine->id];
 	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
-	uint32_t *reg_state = rq->ctx->engine[engine->id].lrc_reg_state;
+	u32 *reg_state = ce->lrc_reg_state;
 
 	reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
 
@@ -366,26 +369,31 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
 	 */
 	if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
 		execlists_update_context_pdps(ppgtt, reg_state);
+
+	return ce->lrc_desc;
 }
 
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
-				 struct drm_i915_gem_request *rq1)
+static void execlists_submit_ports(struct intel_engine_cs *engine)
 {
-	struct intel_engine_cs *engine = rq0->engine;
-	struct drm_i915_private *dev_priv = rq0->i915;
+	struct drm_i915_private *dev_priv = engine->i915;
 	u32 __iomem *elsp =
 		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
 	u64 desc[2];
 
-	if (rq1) {
-		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
-		rq1->elsp_submitted++;
-	} else {
-		desc[1] = 0;
-	}
+	if (!engine->execlist_port[0].count)
+		execlists_context_status_change(engine->execlist_port[0].request,
+						INTEL_CONTEXT_SCHEDULE_IN);
+	desc[0] = execlists_update_context(engine->execlist_port[0].request);
+	engine->preempt_wa = engine->execlist_port[0].count++;
 
-	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
-	rq0->elsp_submitted++;
+	if (engine->execlist_port[1].request) {
+		GEM_BUG_ON(engine->execlist_port[1].count);
+		execlists_context_status_change(engine->execlist_port[1].request,
+						INTEL_CONTEXT_SCHEDULE_IN);
+		desc[1] = execlists_update_context(engine->execlist_port[1].request);
+		engine->execlist_port[1].count = 1;
+	} else
+		desc[1] = 0;
 
 	/* You must always write both descriptors in the order below. */
 	writel(upper_32_bits(desc[1]), elsp);
@@ -396,141 +404,80 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 	writel(lower_32_bits(desc[0]), elsp);
 }
 
-static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
-					   struct drm_i915_gem_request *rq1)
+static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
 {
-	struct drm_i915_private *dev_priv = rq0->i915;
-	unsigned int fw_domains = rq0->engine->fw_domains;
-
-	execlists_update_context(rq0);
-
-	if (rq1)
-		execlists_update_context(rq1);
+	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
+		ctx->execlists_force_single_submission);
+}
 
-	spin_lock_irq(&dev_priv->uncore.lock);
-	intel_uncore_forcewake_get__locked(dev_priv, fw_domains);
+static bool merge_ctx(struct i915_gem_context *prev,
+		      struct i915_gem_context *next)
+{
+	if (prev != next)
+		return false;
 
-	execlists_elsp_write(rq0, rq1);
+	if (ctx_single_port_submission(prev))
+		return false;
 
-	intel_uncore_forcewake_put__locked(dev_priv, fw_domains);
-	spin_unlock_irq(&dev_priv->uncore.lock);
+	return true;
 }
 
 static void execlists_unqueue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
-	struct drm_i915_gem_request *cursor, *tmp;
-
-	assert_spin_locked(&engine->execlist_lock);
-
-	/*
-	 * If irqs are not active generate a warning as batches that finish
-	 * without the irqs may get lost and a GPU Hang may occur.
-	 */
-	WARN_ON(!intel_irqs_enabled(engine->i915));
-
-	/* Try to read in pairs */
-	list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
-				 execlist_link) {
-		if (!req0) {
-			req0 = cursor;
-		} else if (req0->ctx == cursor->ctx) {
-			/* Same ctx: ignore first request, as second request
-			 * will update tail past first request's workload */
-			cursor->elsp_submitted = req0->elsp_submitted;
-			list_del(&req0->execlist_link);
-			i915_gem_request_put(req0);
-			req0 = cursor;
-		} else {
-			if (IS_ENABLED(CONFIG_DRM_I915_GVT)) {
-				/*
-				 * req0 (after merged) ctx requires single
-				 * submission, stop picking
-				 */
-				if (req0->ctx->execlists_force_single_submission)
-					break;
-				/*
-				 * req0 ctx doesn't require single submission,
-				 * but next req ctx requires, stop picking
-				 */
-				if (cursor->ctx->execlists_force_single_submission)
-					break;
-			}
-			req1 = cursor;
-			WARN_ON(req1->elsp_submitted);
-			break;
-		}
+	struct drm_i915_gem_request *cursor, *last;
+	struct execlist_port *port = engine->execlist_port;
+	bool submit = false;
+
+	last = port->request;
+	if (last != NULL) {
+		/* WaIdleLiteRestore:bdw,skl
+		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
+		 * as we resubmit the request. See gen8_emit_request()
+		 * for where we prepare the padding after the end of the
+		 * request.
+		 */
+		last->tail = last->wa_tail;
 	}
 
-	if (unlikely(!req0))
-		return;
+	/* Try to read in pairs and fill both submission ports */
+	spin_lock(&engine->execlist_lock);
+	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
+		if (last && !merge_ctx(cursor->ctx, last->ctx)) {
+			if (port != engine->execlist_port)
+				break;
 
-	execlists_context_status_change(req0, INTEL_CONTEXT_SCHEDULE_IN);
+			if (ctx_single_port_submission(cursor->ctx))
+				break;
 
-	if (req1)
-		execlists_context_status_change(req1,
-						INTEL_CONTEXT_SCHEDULE_IN);
+			i915_gem_request_assign(&port->request, last);
+			port++;
 
-	if (req0->elsp_submitted & engine->idle_lite_restore_wa) {
-		/*
-		 * WaIdleLiteRestore: make sure we never cause a lite restore
-		 * with HEAD==TAIL.
-		 *
-		 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
-		 * resubmit the request. See gen8_emit_request() for where we
-		 * prepare the padding after the end of the request.
-		 */
-		req0->tail = req0->wa_tail;
+		}
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		engine->execlist_queue.next = &cursor->execlist_link;
+		cursor->execlist_link.prev = &engine->execlist_queue;
 	}
+	spin_unlock(&engine->execlist_lock);
 
-	execlists_elsp_submit_contexts(req0, req1);
+	if (submit)
+		execlists_submit_ports(engine);
 }
 
-static unsigned int
-execlists_check_remove_request(struct intel_engine_cs *engine, u32 ctx_id)
+static bool execlists_elsp_idle(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *head_req;
-
-	assert_spin_locked(&engine->execlist_lock);
-
-	head_req = list_first_entry_or_null(&engine->execlist_queue,
-					    struct drm_i915_gem_request,
-					    execlist_link);
-
-	if (WARN_ON(!head_req || (head_req->ctx_hw_id != ctx_id)))
-               return 0;
-
-	WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
-
-	if (--head_req->elsp_submitted > 0)
-		return 0;
-
-	execlists_context_status_change(head_req, INTEL_CONTEXT_SCHEDULE_OUT);
-
-	list_del(&head_req->execlist_link);
-	i915_gem_request_put(head_req);
-
-	return 1;
+	return engine->execlist_port[0].request == NULL;
 }
 
-static u32
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
-		   u32 *context_id)
+static bool execlists_elsp_ready(struct intel_engine_cs *engine)
 {
-	struct drm_i915_private *dev_priv = engine->i915;
-	u32 status;
-
-	read_pointer %= GEN8_CSB_ENTRIES;
-
-	status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
-
-	if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
-		return 0;
-
-	*context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
-							      read_pointer));
-
-	return status;
+	if (engine->disable_lite_restore_wa || engine->preempt_wa)
+		return engine->execlist_port[0].request == NULL;
+	else
+		return engine->execlist_port[1].request == NULL;
 }
 
 /*
@@ -541,66 +488,51 @@ static void intel_lrc_irq_handler(unsigned long data)
 {
 	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
 	struct drm_i915_private *dev_priv = engine->i915;
-	u32 status_pointer;
-	unsigned int read_pointer, write_pointer;
-	u32 csb[GEN8_CSB_ENTRIES][2];
-	unsigned int csb_read = 0, i;
-	unsigned int submit_contexts = 0;
 
 	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
 
-	status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(engine));
-
-	read_pointer = engine->next_context_status_buffer;
-	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
-	if (read_pointer > write_pointer)
-		write_pointer += GEN8_CSB_ENTRIES;
-
-	while (read_pointer < write_pointer) {
-		if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
-			break;
-		csb[csb_read][0] = get_context_status(engine, ++read_pointer,
-						      &csb[csb_read][1]);
-		csb_read++;
-	}
-
-	engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
-
-	/* Update the read pointer to the old write pointer. Manual ringbuffer
-	 * management ftw </sarcasm> */
-	I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
-		      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
-				    engine->next_context_status_buffer << 8));
-
-	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
-
-	spin_lock(&engine->execlist_lock);
-
-	for (i = 0; i < csb_read; i++) {
-		if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
-			if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
-				if (execlists_check_remove_request(engine, csb[i][1]))
-					WARN(1, "Lite Restored request removed from queue\n");
-			} else
-				WARN(1, "Preemption without Lite Restore\n");
+	if (!execlists_elsp_idle(engine)) {
+		u32 __iomem *ring_mmio =
+			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
+		u32 __iomem *csb_mmio =
+			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
+		unsigned ring, head, tail;
+
+		ring = readl(ring_mmio);
+		head = GEN8_CSB_READ_PTR(ring);
+		tail = GEN8_CSB_WRITE_PTR(ring);
+		if (tail < head)
+			tail += GEN8_CSB_ENTRIES;
+		while (head < tail) {
+			unsigned idx = ++head % GEN8_CSB_ENTRIES;
+			unsigned status = readl(&csb_mmio[2 * idx]);
+
+			if (status & GEN8_CTX_STATUS_COMPLETED_MASK) {
+				GEM_BUG_ON(engine->execlist_port[0].count == 0);
+				if (--engine->execlist_port[0].count == 0) {
+					GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
+					execlists_context_status_change(engine->execlist_port[0].request,
+									INTEL_CONTEXT_SCHEDULE_OUT);
+					i915_gem_request_put(engine->execlist_port[0].request);
+					engine->execlist_port[0] = engine->execlist_port[1];
+					memset(&engine->execlist_port[1], 0,
+					       sizeof(engine->execlist_port[1]));
+					engine->preempt_wa = false;
+				}
+			}
+			GEM_BUG_ON(engine->execlist_port[0].count == 0 &&
+				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
 		}
 
-		if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
-		    GEN8_CTX_STATUS_ELEMENT_SWITCH))
-			submit_contexts +=
-				execlists_check_remove_request(engine, csb[i][1]);
+		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+				     GEN8_CSB_WRITE_PTR(ring) << 8),
+		       ring_mmio);
 	}
 
-	if (submit_contexts) {
-		if (!engine->disable_lite_restore_wa ||
-		    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
-			execlists_unqueue(engine);
-	}
+	if (execlists_elsp_ready(engine))
+		execlists_unqueue(engine);
 
-	spin_unlock(&engine->execlist_lock);
-
-	if (unlikely(submit_contexts > 2))
-		DRM_ERROR("More than two context complete events?\n");
+	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
 }
 
 static void execlists_submit_request(struct drm_i915_gem_request *request)
@@ -609,11 +541,8 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 
 	spin_lock_bh(&engine->execlist_lock);
 
-	i915_gem_request_get(request);
-	request->ctx_hw_id = request->ctx->hw_id;
-
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
-	if (request->execlist_link.prev == &engine->execlist_queue)
+	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
 
 	spin_unlock_bh(&engine->execlist_lock);
@@ -721,23 +650,6 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
 	return 0;
 }
 
-void intel_execlists_cancel_requests(struct intel_engine_cs *engine)
-{
-	struct drm_i915_gem_request *req, *tmp;
-	LIST_HEAD(cancel_list);
-
-	WARN_ON(!mutex_is_locked(&engine->i915->drm.struct_mutex));
-
-	spin_lock_bh(&engine->execlist_lock);
-	list_replace_init(&engine->execlist_queue, &cancel_list);
-	spin_unlock_bh(&engine->execlist_lock);
-
-	list_for_each_entry_safe(req, tmp, &cancel_list, execlist_link) {
-		list_del(&req->execlist_link);
-		i915_gem_request_put(req);
-	}
-}
-
 static int intel_lr_context_pin(struct i915_gem_context *ctx,
 				struct intel_engine_cs *engine)
 {
@@ -1258,7 +1170,6 @@ static void lrc_init_hws(struct intel_engine_cs *engine)
 static int gen8_init_common_ring(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
-	unsigned int next_context_status_buffer_hw;
 
 	lrc_init_hws(engine);
 
@@ -1269,32 +1180,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	I915_WRITE(RING_MODE_GEN7(engine),
 		   _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
 		   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
-	POSTING_READ(RING_MODE_GEN7(engine));
 
-	/*
-	 * Instead of resetting the Context Status Buffer (CSB) read pointer to
-	 * zero, we need to read the write pointer from hardware and use its
-	 * value because "this register is power context save restored".
-	 * Effectively, these states have been observed:
-	 *
-	 *      | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) |
-	 * BDW  | CSB regs not reset       | CSB regs reset       |
-	 * CHT  | CSB regs not reset       | CSB regs not reset   |
-	 * SKL  |         ?                |         ?            |
-	 * BXT  |         ?                |         ?            |
-	 */
-	next_context_status_buffer_hw =
-		GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(engine)));
+	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine),
+		   _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK |
+				 GEN8_CSB_WRITE_PTR_MASK,
+				 0));
 
-	/*
-	 * When the CSB registers are reset (also after power-up / gpu reset),
-	 * CSB write pointer is set to all 1's, which is not valid, use '5' in
-	 * this special case, so the first element read is CSB[0].
-	 */
-	if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
-		next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
-
-	engine->next_context_status_buffer = next_context_status_buffer_hw;
 	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
 
 	intel_engine_init_hangcheck(engine);
@@ -1680,10 +1571,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 	}
 	intel_lr_context_unpin(dev_priv->kernel_context, engine);
 
-	engine->idle_lite_restore_wa = 0;
-	engine->disable_lite_restore_wa = false;
-	engine->ctx_desc_template = 0;
-
 	lrc_destroy_wa_ctx_obj(engine);
 	engine->i915 = NULL;
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index a52cf57dbd40..4d70346500c2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -97,6 +97,4 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
 				    int enable_execlists);
 void intel_execlists_enable_submission(struct drm_i915_private *dev_priv);
 
-void intel_execlists_cancel_requests(struct intel_engine_cs *engine);
-
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 84aea549de5d..64cb7a7dc687 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -298,11 +298,14 @@ struct intel_engine_cs {
 	/* Execlists */
 	struct tasklet_struct irq_tasklet;
 	spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
+	struct execlist_port {
+		struct drm_i915_gem_request *request;
+		unsigned count;
+	} execlist_port[2];
 	struct list_head execlist_queue;
 	unsigned int fw_domains;
-	unsigned int next_context_status_buffer;
-	unsigned int idle_lite_restore_wa;
 	bool disable_lite_restore_wa;
+	bool preempt_wa;
 	u32 ctx_desc_template;
 
 	/**
-- 
2.9.3


* [PATCH 07/13] drm/i915: Update reset path to fix incomplete requests
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (5 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 06/13] drm/i915: Simplify ELSP queue request tracking Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH 08/13] drm/i915: Drive request submission through fence callbacks Chris Wilson
                   ` (13 subsequent siblings)
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC
  To: intel-gfx; +Cc: Mika Kuoppala

Update the reset path in preparation for engine reset, which requires
identifying the incomplete requests and their associated contexts, and
fixing up their state so that the engine can resume correctly after the
reset.

The request that caused the hang will be skipped and the head reset to
the start of its breadcrumb. This allows us to resume from where we left
off. Since this request didn't complete normally, we also need to clean
up the ELSP queue manually. This is vital if we employ nonblocking
request submission, where we may have a web of dependencies upon the
hung request and so advancing the seqno manually is no longer trivial.
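
The per-request scrub amounts to zeroing the payload of every subsequent
request from the guilty context while preserving its breadcrumb, so the
seqno still advances when the ring is replayed (a sketch distilled from
the diff below; 0 is MI_NOOP on these rings):

	static void skip_request_payload(struct drm_i915_gem_request *rq)
	{
		void *vaddr = rq->ring->vaddr;
		u32 head = rq->head;

		/* If the payload wraps, clear to the end of the ring first. */
		if (rq->postfix < head) {
			memset(vaddr + head, 0, rq->ring->size - head);
			head = 0;
		}

		/* Everything before the breadcrumb becomes MI_NOOP. */
		memset(vaddr + head, 0, rq->postfix - head);
	}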

Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c         |   4 +-
 drivers/gpu/drm/i915/i915_drv.h         |   2 +-
 drivers/gpu/drm/i915/i915_gem.c         | 106 +++++++++++++++-----------------
 drivers/gpu/drm/i915/i915_gem_context.c |  16 -----
 drivers/gpu/drm/i915/intel_lrc.c        |  34 +++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.c |  33 ++++++----
 drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
 7 files changed, 110 insertions(+), 87 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index fc9273215286..af3c46eadd1a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1590,7 +1590,7 @@ static int i915_drm_resume(struct drm_device *dev)
 	mutex_lock(&dev->struct_mutex);
 	if (i915_gem_init_hw(dev)) {
 		DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
-		atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
+		i915_gem_set_wedged(dev_priv);
 	}
 	mutex_unlock(&dev->struct_mutex);
 
@@ -1816,7 +1816,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
 	return 0;
 
 error:
-	atomic_or(I915_WEDGED, &error->reset_counter);
+	i915_gem_set_wedged(dev_priv);
 	mutex_unlock(&dev->struct_mutex);
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ff96b7a69e6c..03eff38bc081 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3269,6 +3269,7 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
 }
 
 void i915_gem_reset(struct drm_device *dev);
+void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
 bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
 int __must_check i915_gem_init(struct drm_device *dev);
 int __must_check i915_gem_init_hw(struct drm_device *dev);
@@ -3397,7 +3398,6 @@ void i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj);
 int __must_check i915_gem_context_init(struct drm_device *dev);
 void i915_gem_context_lost(struct drm_i915_private *dev_priv);
 void i915_gem_context_fini(struct drm_device *dev);
-void i915_gem_context_reset(struct drm_device *dev);
 int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
 int i915_switch_context(struct drm_i915_gem_request *req);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 23ac08819950..265149711af6 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2501,34 +2501,68 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
 	return NULL;
 }
 
-static void i915_gem_reset_engine_status(struct intel_engine_cs *engine)
+static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *request;
+	struct i915_gem_context *incomplete_ctx;
+	unsigned long flags;
 	bool ring_hung;
 
+	/* Ensure irq handler finishes or is cancelled, and not run again. */
+	local_irq_save(flags);
+	tasklet_kill(&engine->irq_tasklet);
+	tasklet_disable(&engine->irq_tasklet);
+	local_irq_restore(flags);
+
 	request = i915_gem_find_active_request(engine);
-	if (request == NULL)
-		return;
+	if (!request)
+		goto out;
 
 	ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
 
 	i915_set_reset_status(request->ctx, ring_hung);
-	list_for_each_entry_continue(request, &engine->request_list, link)
+
+	engine->reset_hw(engine, request);
+
+	incomplete_ctx = request->ctx;
+	if (i915_gem_context_is_default(incomplete_ctx))
+		incomplete_ctx = NULL;
+
+	list_for_each_entry_continue(request, &engine->request_list, link) {
+		void *vaddr = request->ring->vaddr;
+		u32 head;
+
+		if (request->ctx != incomplete_ctx)
+			continue;
+
+		head = request->head;
+		if (request->postfix < head) {
+			memset(vaddr + head, 0, request->ring->size - head);
+			head = 0;
+		}
+		memset(vaddr + head, 0, request->postfix - head);
+
 		i915_set_reset_status(request->ctx, false);
+	}
+
+out:
+	tasklet_enable(&engine->irq_tasklet);
 }
 
-static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
+void i915_gem_reset(struct drm_device *dev)
 {
-	struct drm_i915_gem_request *request;
-	unsigned long flags;
-	struct intel_ring *ring;
+	struct drm_i915_private *dev_priv = to_i915(dev);
+	struct intel_engine_cs *engine;
 
-	/* Ensure irq handler finishes or is cancelled, and not run again. */
-	local_irq_save(flags);
-	tasklet_kill(&engine->irq_tasklet);
-	tasklet_disable(&engine->irq_tasklet);
-	local_irq_restore(flags);
+	for_each_engine(engine, dev_priv)
+		i915_gem_reset_engine(engine);
 
+	i915_gem_context_lost(dev_priv);
+	i915_gem_restore_fences(dev);
+}
+
+static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
+{
 	/* Mark all pending requests as complete so that any concurrent
 	 * (lockless) lookup doesn't try and wait upon the request as we
 	 * reset it.
@@ -2550,56 +2584,18 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
 		spin_unlock(&engine->execlist_lock);
 	}
 
-	/*
-	 * We must free the requests after all the corresponding objects have
-	 * been moved off active lists. Which is the same order as the normal
-	 * retire_requests function does. This is important if object hold
-	 * implicit references on things like e.g. ppgtt address spaces through
-	 * the request.
-	 */
-	request = i915_gem_active_raw(&engine->last_request,
-				      &engine->i915->drm.struct_mutex);
-	if (request)
-		i915_gem_request_retire_upto(request);
-	GEM_BUG_ON(intel_engine_is_active(engine));
-
-	/* Having flushed all requests from all queues, we know that all
-	 * ringbuffers must now be empty. However, since we do not reclaim
-	 * all space when retiring the request (to prevent HEADs colliding
-	 * with rapid ringbuffer wraparound) the amount of available space
-	 * upon reset is less than when we start. Do one more pass over
-	 * all the ringbuffers to reset last_retired_head.
-	 */
-	list_for_each_entry(ring, &engine->buffers, link) {
-		ring->last_retired_head = ring->tail;
-		intel_ring_update_space(ring);
-	}
-
 	engine->i915->gt.active_engines &= ~intel_engine_flag(engine);
-
-	tasklet_enable(&engine->irq_tasklet);
 }
 
-void i915_gem_reset(struct drm_device *dev)
+void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
 {
-	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_engine_cs *engine;
 
-	/*
-	 * Before we free the objects from the requests, we need to inspect
-	 * them for finding the guilty party. As the requests only borrow
-	 * their reference to the objects, the inspection must be done first.
-	 */
-	for_each_engine(engine, dev_priv)
-		i915_gem_reset_engine_status(engine);
+	atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
 
 	for_each_engine(engine, dev_priv)
-		i915_gem_reset_engine_cleanup(engine);
+		i915_gem_cleanup_engine(engine);
 	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
-
-	i915_gem_context_reset(dev);
-
-	i915_gem_restore_fences(dev);
 }
 
 static void
@@ -4478,7 +4474,7 @@ int i915_gem_init(struct drm_device *dev)
 		 * for all other failure, such as an allocation failure, bail.
 		 */
 		DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
-		atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
+		i915_gem_set_wedged(dev_priv);
 		ret = 0;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 35950ee46a1d..df10f4e95736 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -420,22 +420,6 @@ static void i915_gem_context_unpin(struct i915_gem_context *ctx,
 	}
 }
 
-void i915_gem_context_reset(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = to_i915(dev);
-
-	lockdep_assert_held(&dev->struct_mutex);
-
-	if (i915.enable_execlists) {
-		struct i915_gem_context *ctx;
-
-		list_for_each_entry(ctx, &dev_priv->context_list, link)
-			intel_lr_context_reset(dev_priv, ctx);
-	}
-
-	i915_gem_context_lost(dev_priv);
-}
-
 int i915_gem_context_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 325c1273450f..5e60519ede8d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1171,6 +1171,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
+	intel_mocs_init_engine(engine);
 	lrc_init_hws(engine);
 
 	I915_WRITE_IMR(engine,
@@ -1190,7 +1191,10 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 
 	intel_engine_init_hangcheck(engine);
 
-	return intel_mocs_init_engine(engine);
+	if (engine->execlist_port[0].request)
+		execlists_submit_ports(engine);
+
+	return 0;
 }
 
 static int gen8_init_render_ring(struct intel_engine_cs *engine)
@@ -1226,6 +1230,33 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
 	return init_workarounds_ring(engine);
 }
 
+static void reset_common_ring(struct intel_engine_cs *engine,
+			      struct drm_i915_gem_request *request)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct i915_gem_context *ctx = request->ctx;
+	struct intel_context *ce = &ctx->engine[engine->id];
+	u32 *reg_state;
+
+	reg_state = ce->lrc_reg_state;
+	reg_state[CTX_RING_HEAD+1] = request->postfix;
+
+	request->ring->head = request->postfix;
+	request->ring->last_retired_head = -1;
+	intel_ring_update_space(request->ring);
+
+	if (request == engine->execlist_port[1].request) {
+		i915_gem_request_put(engine->execlist_port[0].request);
+		engine->execlist_port[0] = engine->execlist_port[1];
+		memset(&engine->execlist_port[1], 0,
+		       sizeof(engine->execlist_port[1]));
+	}
+
+	engine->execlist_port[0].count = 0;
+
+	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
+}
+
 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
 {
 	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
@@ -1588,6 +1619,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
 	/* Default vfuncs which can be overriden by each engine. */
 	engine->init_hw = gen8_init_common_ring;
+	engine->reset_hw = reset_common_ring;
 	engine->emit_flush = gen8_emit_flush;
 	engine->emit_request = gen8_emit_request;
 	engine->submit_request = execlists_submit_request;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index cc5bcd14b6df..f0a5c6e9b908 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -577,34 +577,33 @@ static int init_ring_common(struct intel_engine_cs *engine)
 	if (I915_READ_HEAD(engine))
 		DRM_DEBUG("%s initialization failed [head=%08x], fudging\n",
 			  engine->name, I915_READ_HEAD(engine));
-	I915_WRITE_HEAD(engine, 0);
-	(void)I915_READ_HEAD(engine);
+
+	intel_ring_update_space(ring);
+	I915_WRITE_HEAD(engine, ring->head);
+	I915_WRITE_TAIL(engine, ring->tail);
+	(void)I915_READ_TAIL(engine);
 
 	I915_WRITE_CTL(engine,
 			((ring->size - PAGE_SIZE) & RING_NR_PAGES)
 			| RING_VALID);
 
 	/* If the head is still not zero, the ring is dead */
-	if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 &&
-		     I915_READ_START(engine) == i915_ggtt_offset(ring->vma) &&
-		     (I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) {
+	if (intel_wait_for_register_fw(dev_priv, RING_CTL(engine->mmio_base),
+				       RING_VALID, RING_VALID,
+				       50)) {
 		DRM_ERROR("%s initialization failed "
-			  "ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08x]\n",
+			  "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
 			  engine->name,
 			  I915_READ_CTL(engine),
 			  I915_READ_CTL(engine) & RING_VALID,
-			  I915_READ_HEAD(engine), I915_READ_TAIL(engine),
+			  I915_READ_HEAD(engine), ring->head,
+			  I915_READ_TAIL(engine), ring->tail,
 			  I915_READ_START(engine),
 			  i915_ggtt_offset(ring->vma));
 		ret = -EIO;
 		goto out;
 	}
 
-	ring->last_retired_head = -1;
-	ring->head = I915_READ_HEAD(engine);
-	ring->tail = I915_READ_TAIL(engine) & TAIL_ADDR;
-	intel_ring_update_space(ring);
-
 	intel_engine_init_hangcheck(engine);
 
 out:
@@ -613,6 +612,15 @@ out:
 	return ret;
 }
 
+static void reset_ring_common(struct intel_engine_cs *engine,
+			      struct drm_i915_gem_request *request)
+{
+	struct intel_ring *ring = request->ring;
+
+	ring->head = request->postfix;
+	ring->last_retired_head = -1;
+}
+
 static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
 	struct intel_ring *ring = req->ring;
@@ -2655,6 +2663,7 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
 	intel_ring_init_semaphores(dev_priv, engine);
 
 	engine->init_hw = init_ring_common;
+	engine->reset_hw = reset_ring_common;
 
 	engine->emit_request = i9xx_emit_request;
 	if (i915.semaphores)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 64cb7a7dc687..8d1b2fc3e485 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -211,6 +211,8 @@ struct intel_engine_cs {
 	void		(*irq_disable)(struct intel_engine_cs *engine);
 
 	int		(*init_hw)(struct intel_engine_cs *engine);
+	void		(*reset_hw)(struct intel_engine_cs *engine,
+				    struct drm_i915_gem_request *req);
 
 	int		(*init_context)(struct drm_i915_gem_request *req);
 
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 08/13] drm/i915: Drive request submission through fence callbacks
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (6 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 07/13] drm/i915: Update reset path to fix incomplete requests Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25 12:08   ` Joonas Lahtinen
  2016-08-26 12:47   ` John Harrison
  2016-08-25  9:08 ` [PATCH 09/13] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer Chris Wilson
                   ` (12 subsequent siblings)
  20 siblings, 2 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Drive final request submission via a callback from the fence. This way
the request is queued until all dependencies are resolved, at which
point it is handed to the backend for queueing to hardware. At this
point, no dependencies are set on the request, so the callback is
immediate.

A side-effect of imposing a heavier, irqsafe spinlock for execlist
submission is that we lose the softirq enabling after scheduling the
execlists tasklet. To compensate, we manually kickstart the softirq by
disabling and enabling the bh around the fence signaling.
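
As a minimal sketch of that kick (the same pattern as the
intel_breadcrumbs.c hunk below; the comment is the point):

	/* Signaling the fence may schedule the execlists tasklet from
	 * our now irqsafe callback. Softirqs raised while bottom
	 * halves are disabled are run on local_bh_enable(), so
	 * bracketing the signal guarantees the tasklet executes
	 * promptly rather than at the next softirq point.
	 */
	local_bh_disable();
	fence_signal(&request->fence);
	local_bh_enable(); /* kick the pending tasklet (softirq) now */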

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c  | 20 +++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_request.h  |  3 +++
 drivers/gpu/drm/i915/intel_breadcrumbs.c |  3 +++
 drivers/gpu/drm/i915/intel_lrc.c         |  5 +++--
 4 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index fa5e36de55d0..db45482ea194 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -314,6 +314,19 @@ static int i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno)
 	return 0;
 }
 
+static int __i915_sw_fence_call submit_notify(struct i915_sw_fence *fence)
+{
+	struct drm_i915_gem_request *request =
+		container_of(fence, typeof(*request), submit);
+
+	if (i915_sw_fence_done(fence))
+		i915_gem_request_put(request);
+	else
+		request->engine->submit_request(request);
+
+	return NOTIFY_DONE;
+}
+
 /**
  * i915_gem_request_alloc - allocate a request structure
  *
@@ -412,6 +425,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 */
 	req->reserved_space = MIN_SPACE_FOR_ADD_REQUEST;
 
+	i915_sw_fence_init(&i915_gem_request_get(req)->submit, submit_notify);
+
 	if (i915.enable_execlists)
 		ret = intel_logical_ring_alloc_request_extras(req);
 	else
@@ -527,7 +542,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 		  reserved_tail, ret);
 
 	i915_gem_mark_busy(engine);
-	engine->submit_request(request);
+
+	local_bh_disable();
+	i915_sw_fence_commit(&request->submit);
+	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
 }
 
 static unsigned long local_clock_us(unsigned int *cpu)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index a231bd318ef0..a85723463978 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -28,6 +28,7 @@
 #include <linux/fence.h>
 
 #include "i915_gem.h"
+#include "i915_sw_fence.h"
 
 struct intel_wait {
 	struct rb_node node;
@@ -82,6 +83,8 @@ struct drm_i915_gem_request {
 	struct intel_ring *ring;
 	struct intel_signal_node signaling;
 
+	struct i915_sw_fence submit;
+
 	/** GEM sequence number associated with the previous request,
 	 * when the HWS breadcrumb is equal to this the GPU is processing
 	 * this request.
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 2491e4c1eaf0..9bad14d22c95 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -462,7 +462,10 @@ static int intel_breadcrumbs_signaler(void *arg)
 			 */
 			intel_engine_remove_wait(engine,
 						 &request->signaling.wait);
+
+			local_bh_disable();
 			fence_signal(&request->fence);
+			local_bh_enable(); /* kick start the tasklets */
 
 			/* Find the next oldest signal. Note that as we have
 			 * not been holding the lock, another client may
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e60519ede8d..babeaa8b1273 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -538,14 +538,15 @@ static void intel_lrc_irq_handler(unsigned long data)
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
+	unsigned long flags;
 
-	spin_lock_bh(&engine->execlist_lock);
+	spin_lock_irqsave(&engine->execlist_lock, flags);
 
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
 
-	spin_unlock_bh(&engine->execlist_lock);
+	spin_unlock_irqrestore(&engine->execlist_lock, flags);
 }
 
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 09/13] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (7 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 08/13] drm/i915: Drive request submission through fence callbacks Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-26 13:29   ` John Harrison
  2016-08-25  9:08 ` [PATCH 10/13] drm/i915: Nonblocking request submission Chris Wilson
                   ` (11 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

We are about to specialise object synchronisation to enable nonblocking
execbuf submission. First we make a copy of the current object
synchronisation for execbuffer. The general i915_gem_object_sync() will
be removed following the removal of CS flips in the near future.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 64 +++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 601156c353cc..b7cc158733ad 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1122,6 +1122,68 @@ static unsigned int eb_other_engines(struct drm_i915_gem_request *req)
 }
 
 static int
+__eb_sync(struct drm_i915_gem_request *to,
+	  struct drm_i915_gem_request *from)
+{
+	int idx, ret;
+
+	if (to->engine == from->engine)
+		return 0;
+
+	idx = intel_engine_sync_index(from->engine, to->engine);
+	if (from->fence.seqno <= from->engine->semaphore.sync_seqno[idx])
+		return 0;
+
+	trace_i915_gem_ring_sync_to(to, from);
+	if (!i915.semaphores) {
+		ret = i915_wait_request(from, true, NULL, NO_WAITBOOST);
+		if (ret)
+			return ret;
+	} else {
+		ret = to->engine->semaphore.sync_to(to, from);
+		if (ret)
+			return ret;
+	}
+
+	from->engine->semaphore.sync_seqno[idx] = from->fence.seqno;
+	return 0;
+}
+
+static int
+eb_sync(struct drm_i915_gem_object *obj,
+	struct drm_i915_gem_request *to,
+	bool write)
+{
+	struct i915_gem_active *active;
+	unsigned long active_mask;
+	int idx;
+
+	if (write) {
+		active_mask = i915_gem_object_get_active(obj);
+		active = obj->last_read;
+	} else {
+		active_mask = 1;
+		active = &obj->last_write;
+	}
+
+	for_each_active(active_mask, idx) {
+		struct drm_i915_gem_request *request;
+		int ret;
+
+		request = i915_gem_active_peek(&active[idx],
+					       &obj->base.dev->struct_mutex);
+		if (!request)
+			continue;
+
+		ret = __eb_sync(to, request);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int
 i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 				struct list_head *vmas)
 {
@@ -1133,7 +1195,7 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (obj->flags & other_rings) {
-			ret = i915_gem_object_sync(obj, req);
+			ret = eb_sync(obj, req, obj->base.pending_write_domain);
 			if (ret)
 				return ret;
 		}
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 10/13] drm/i915: Nonblocking request submission
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (8 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 09/13] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-26 13:39   ` John Harrison
  2016-08-25  9:08 ` [PATCH 11/13] drm/i915: Serialise execbuf operation after a dma-buf reservation object Chris Wilson
                   ` (10 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Now that we have fences in place to drive request submission, we can
employ those to queue requests after their dependencies as opposed to
stalling in the middle of an execbuf ioctl. (However, we still choose to
spin before enabling the IRQ as that is faster - though contentious.)
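
As a hedged before/after sketch of the !i915.semaphores path
(mirroring the execbuffer hunk below):

	/* before: block inside the execbuf ioctl until 'from' completes */
	ret = i915_wait_request(from, true, NULL, NO_WAITBOOST);

	/* after: spin briefly in case 'from' is about to complete,
	 * otherwise queue 'to' behind it and return to userspace
	 */
	if (!i915_spin_request(from, TASK_INTERRUPTIBLE, 2))
		ret = i915_sw_fence_await_dma_fence(&to->submit,
						    &from->fence,
						    GFP_KERNEL);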

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 10 +++++++---
 drivers/gpu/drm/i915/i915_gem_request.c    | 14 +++++++++++++-
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index b7cc158733ad..104c713ec681 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1136,9 +1136,13 @@ __eb_sync(struct drm_i915_gem_request *to,
 
 	trace_i915_gem_ring_sync_to(to, from);
 	if (!i915.semaphores) {
-		ret = i915_wait_request(from, true, NULL, NO_WAITBOOST);
-		if (ret)
-			return ret;
+		if (!i915_spin_request(from, TASK_INTERRUPTIBLE, 2)) {
+			ret = i915_sw_fence_await_dma_fence(&to->submit,
+							    &from->fence,
+							    GFP_KERNEL);
+			if (ret < 0)
+				return ret;
+		}
 	} else {
 		ret = to->engine->semaphore.sync_to(to, from);
 		if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index db45482ea194..d3477dabb534 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -345,7 +345,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 	unsigned int reset_counter = i915_reset_counter(&dev_priv->gpu_error);
-	struct drm_i915_gem_request *req;
+	struct drm_i915_gem_request *req, *prev;
 	u32 seqno;
 	int ret;
 
@@ -441,6 +441,18 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 */
 	req->head = req->ring->tail;
 
+	prev = i915_gem_active_peek(&engine->last_request,
+				    &req->i915->drm.struct_mutex);
+	if (prev) {
+		ret = i915_sw_fence_await_sw_fence(&req->submit,
+						   &prev->submit,
+						   GFP_KERNEL);
+		if (ret < 0) {
+			i915_add_request(req);
+			return ERR_PTR(ret);
+		}
+	}
+
 	return req;
 
 err_ctx:
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 11/13] drm/i915: Serialise execbuf operation after a dma-buf reservation object
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (9 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 10/13] drm/i915: Nonblocking request submission Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-26 13:52   ` John Harrison
  2016-08-25  9:08 ` [PATCH 12/13] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
                   ` (9 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Now that we can wait upon fences before emitting the request, it becomes
trivial to wait upon any implicit fence provided by the dma-buf
reservation object.

Testcase: igt/prime_vgem/fence-wait
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 104c713ec681..d9b6fe9f7542 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1197,6 +1197,7 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
+		struct reservation_object *resv;
 
 		if (obj->flags & other_rings) {
 			ret = eb_sync(obj, req, obj->base.pending_write_domain);
@@ -1204,6 +1205,16 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 				return ret;
 		}
 
+		resv = i915_gem_object_get_dmabuf_resv(obj);
+		if (resv) {
+			ret = i915_sw_fence_await_reservation
+				(&req->submit, resv, &i915_fence_ops,
+				 obj->base.pending_write_domain,
+				 GFP_KERNEL | __GFP_NOWARN);
+			if (ret < 0)
+				return ret;
+		}
+
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
 			i915_gem_clflush_object(obj, false);
 	}
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 12/13] drm/i915: Enable userspace to opt-out of implicit fencing
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (10 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 11/13] drm/i915: Serialise execbuf operation after a dma-buf reservation object Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH 13/13] drm/i915: Support explicit fencing for execbuf Chris Wilson
                   ` (8 subsequent siblings)
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Userspace is faced with a dilemma. The kernel requires implicit fencing
to manage resource usage (we always must wait for the GPU to finish
before releasing its PTE) and for third parties. However, userspace may
wish to avoid this serialisation if it is using explicit fencing
between parties and wants more fine-grained access to buffers (e.g. it
may partition the buffer between uses and track fences on ranges rather
than the implicit fences tracking the whole object). To wit, userspace
needs a mechanism to avoid the kernel's serialisation on its implicit
fences before execbuf execution.

The next question is whether this is an object, execbuf or context flag.
Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
shared winsys buffers, but implicit fencing on internal surfaces)
require a per-object level flag. Given that this flag needs to be set
only once for the lifetime of the object, this reduces the convenience of
having an execbuf or context level flag (and avoids having multiple
pieces of uABI controlling the same feature).

Incorrect use of this flag will result in rendering corruption and GPU
hangs - but will not result in use-after-free or similar resource
tracking issues.
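
For illustration, a hedged sketch of how userspace would opt a buffer
out (bo_handle is a placeholder; flag and parameter names as defined
in the hunks below; error handling elided):

	int has_async = 0;
	struct drm_i915_getparam gp = {
		.param = I915_PARAM_HAS_EXEC_ASYNC,
		.value = &has_async,
	};
	drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);

	struct drm_i915_gem_exec_object2 obj = {
		.handle = bo_handle,
		.flags = has_async ? EXEC_OBJECT_ASYNC : 0,
	};
	/* implicit fences on this object are now skipped; the caller
	 * must order all access (including by others) with explicit
	 * fences instead
	 */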

Testcase: igt/gem_exec_async
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c            |  3 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 28 +++++++++++++++-------------
 include/uapi/drm/i915_drm.h                |  4 +++-
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index af3c46eadd1a..3eecfef8bcca 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -355,6 +355,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_MIN_EU_IN_POOL:
 		value = INTEL_INFO(dev)->min_eu_in_pool;
 		break;
+	case I915_PARAM_HAS_EXEC_ASYNC:
+		value = 1;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index d9b6fe9f7542..a0f46293e492 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1199,20 +1199,22 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		struct drm_i915_gem_object *obj = vma->obj;
 		struct reservation_object *resv;
 
-		if (obj->flags & other_rings) {
-			ret = eb_sync(obj, req, obj->base.pending_write_domain);
-			if (ret)
-				return ret;
-		}
+		if ((vma->exec_entry->flags & EXEC_OBJECT_ASYNC) == 0) {
+			if (obj->flags & other_rings) {
+				ret = eb_sync(obj, req, obj->base.pending_write_domain);
+				if (ret)
+					return ret;
+			}
 
-		resv = i915_gem_object_get_dmabuf_resv(obj);
-		if (resv) {
-			ret = i915_sw_fence_await_reservation
-				(&req->submit, resv, &i915_fence_ops,
-				 obj->base.pending_write_domain,
-				 GFP_KERNEL | __GFP_NOWARN);
-			if (ret < 0)
-				return ret;
+			resv = i915_gem_object_get_dmabuf_resv(obj);
+			if (resv) {
+				ret = i915_sw_fence_await_reservation
+					(&req->submit, resv, &i915_fence_ops,
+					 obj->base.pending_write_domain,
+					 GFP_KERNEL | __GFP_NOWARN);
+				if (ret < 0)
+					return ret;
+			}
 		}
 
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 5501fe83ed92..75f9cab23249 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -387,6 +387,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_EXEC_SOFTPIN	 37
 #define I915_PARAM_HAS_POOLED_EU	 38
 #define I915_PARAM_MIN_EU_IN_POOL	 39
+#define I915_PARAM_HAS_EXEC_ASYNC	 40
 
 typedef struct drm_i915_getparam {
 	__s32 param;
@@ -728,8 +729,9 @@ struct drm_i915_gem_exec_object2 {
 #define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
 #define EXEC_OBJECT_PINNED		 (1<<4)
 #define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
+#define EXEC_OBJECT_ASYNC		 (1<<6)
 /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_PAD_TO_SIZE<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
 	__u64 flags;
 
 	union {
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 13/13] drm/i915: Support explicit fencing for execbuf
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (11 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 12/13] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-26 15:08   ` John Harrison
  2016-08-25  9:08 ` [PATCH libdrm 14/15] intel: Allow the client to control implicit synchronisation Chris Wilson
                   ` (7 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Now that the user can opt out of implicit fencing, we need to give them
back control over the fencing. We employ sync_file to wrap our
drm_i915_gem_request and provide an fd that userspace can merge with
other sync_file fds and pass back to the kernel to wait upon before
future execution.
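
A hedged sketch of the intended usage (in_fence_fd is a placeholder;
flag names and the rsvd2 packing as defined in the hunks below):

	struct drm_i915_gem_execbuffer2 execbuf;
	/* ... zeroed and filled with buffers/flags as usual ... */

	execbuf.flags |= I915_EXEC_FENCE_IN | I915_EXEC_FENCE_OUT;
	execbuf.rsvd2 = in_fence_fd; /* waited upon before execution */

	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &execbuf);

	/* upper 32 bits of rsvd2 return a sync_file fd for the batch */
	int out_fence_fd = execbuf.rsvd2 >> 32;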

Testcase: igt/gem_exec_fence
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Kconfig               |  1 +
 drivers/gpu/drm/i915/i915_drv.c            |  5 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 79 ++++++++++++++++++++++++++++--
 include/uapi/drm/i915_drm.h                |  8 ++-
 4 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 7769e469118f..319ca27ea719 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -18,6 +18,7 @@ config DRM_I915
 	select INPUT if ACPI
 	select ACPI_VIDEO if ACPI
 	select ACPI_BUTTON if ACPI
+	select SYNC_FILE
 	help
 	  Choose this option if you have a system that has "Intel Graphics
 	  Media Accelerator" or "HD Graphics" integrated graphics,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 3eecfef8bcca..9460bfff360f 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -358,6 +358,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_ASYNC:
 		value = 1;
 		break;
+	case I915_PARAM_HAS_EXEC_FENCE:
+		value = 1;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
@@ -2547,7 +2550,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_HWS_ADDR, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
-	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_UNPIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index a0f46293e492..8701b8bb46fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -28,6 +28,7 @@
 
 #include <linux/dma_remapping.h>
 #include <linux/reservation.h>
+#include <linux/sync_file.h>
 #include <linux/uaccess.h>
 
 #include <drm/drmP.h>
@@ -1686,6 +1687,33 @@ eb_select_engine(struct drm_i915_private *dev_priv,
 	return engine;
 }
 
+static int eb_add_fence(struct drm_i915_gem_request *req, struct fence *fence)
+{
+	struct fence_array *array;
+	int ret, i;
+
+	if (fence_is_i915(fence))
+		return __eb_sync(req, to_request(fence));
+
+	if (!fence_is_array(fence))
+		return i915_sw_fence_await_dma_fence(&req->submit, fence, GFP_KERNEL);
+
+	array = to_fence_array(fence);
+	for (i = 0; i < array->num_fences; i++) {
+		struct fence *child = array->fences[i];
+
+		if (fence_is_i915(child))
+			ret = __eb_sync(req, to_request(child));
+		else
+			ret = i915_sw_fence_await_dma_fence(&req->submit,
+							    child, GFP_KERNEL);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file,
@@ -1703,6 +1731,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct i915_execbuffer_params *params = &params_master;
 	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 dispatch_flags;
+	struct fence *in_fence = NULL;
+	struct sync_file *out_fence = NULL;
+	int out_fence_fd = -1;
 	int ret;
 	bool need_relocs;
 
@@ -1746,6 +1777,23 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		dispatch_flags |= I915_DISPATCH_RS;
 	}
 
+	if (args->flags & I915_EXEC_FENCE_IN) {
+		in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
+		if (!in_fence) {
+			ret = -EINVAL;
+			goto pre_mutex_err;
+		}
+	}
+
+	if (args->flags & I915_EXEC_FENCE_OUT) {
+		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
+		if (out_fence_fd < 0) {
+			ret = out_fence_fd;
+			out_fence_fd = -1;
+			goto pre_mutex_err;
+		}
+	}
+
 	/* Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
 	 * process. Upon first dispatch, we acquire another prolonged
@@ -1890,6 +1938,20 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		goto err_batch_unpin;
 	}
 
+	if (in_fence) {
+		ret = eb_add_fence(params->request, in_fence);
+		if (ret < 0)
+			goto err_request;
+	}
+
+	if (out_fence_fd != -1) {
+		out_fence = sync_file_create(fence_get(&params->request->fence));
+		if (!out_fence) {
+			ret = -ENOMEM;
+			goto err_request;
+		}
+	}
+
 	/* Whilst this request exists, batch_obj will be on the
 	 * active_list, and so will hold the active reference. Only when this
 	 * request is retired will the the batch_obj be moved onto the
@@ -1917,6 +1979,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	ret = execbuf_submit(params, args, &eb->vmas);
 err_request:
 	__i915_add_request(params->request, ret == 0);
+	if (out_fence) {
+		if (ret == 0) {
+			fd_install(out_fence_fd, out_fence->file);
+			args->rsvd2 &= ~((u64)0xffffffff << 32);
+			args->rsvd2 |= (u64)out_fence_fd << 32;
+			out_fence_fd = -1;
+		} else
+			fput(out_fence->file);
+	}
 
 err_batch_unpin:
 	/*
@@ -1938,6 +2009,9 @@ pre_mutex_err:
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
 	intel_runtime_pm_put(dev_priv);
+	if (out_fence_fd != -1)
+		put_unused_fd(out_fence_fd);
+	fence_put(in_fence);
 	return ret;
 }
 
@@ -2045,11 +2119,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	if (args->rsvd2 != 0) {
-		DRM_DEBUG("dirty rvsd2 field\n");
-		return -EINVAL;
-	}
-
 	exec2_list = drm_malloc_gfp(args->buffer_count,
 				    sizeof(*exec2_list),
 				    GFP_TEMPORARY);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 75f9cab23249..9fe670cc7bf3 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -246,6 +246,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
 #define DRM_I915_GET_SPRITE_COLORKEY	0x2a
 #define DRM_I915_SET_SPRITE_COLORKEY	0x2b
 #define DRM_I915_GEM_WAIT	0x2c
@@ -279,6 +280,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
 #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
 #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
 #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -388,6 +390,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_POOLED_EU	 38
 #define I915_PARAM_MIN_EU_IN_POOL	 39
 #define I915_PARAM_HAS_EXEC_ASYNC	 40
+#define I915_PARAM_HAS_EXEC_FENCE	 41
 
 typedef struct drm_i915_getparam {
 	__s32 param;
@@ -821,7 +824,10 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_RESOURCE_STREAMER     (1<<15)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
+#define I915_EXEC_FENCE_IN		(1<<16)
+#define I915_EXEC_FENCE_OUT		(1<<17)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH libdrm 14/15] intel: Allow the client to control implicit synchronisation
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (12 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 13/13] drm/i915: Support explicit fencing for execbuf Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH libdrm 15/15] intel: Support passing of explicit fencing from execbuf Chris Wilson
                   ` (6 subsequent siblings)
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

The kernel allows implicit synchronisation to be disabled on individual
buffers. Use at your own risk.
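
A hedged usage sketch (size is a placeholder; requires a kernel
advertising I915_PARAM_HAS_EXEC_ASYNC, see the doc comment below):

	drm_intel_bo *bo =
		drm_intel_bo_alloc(bufmgr, "shared", size, 4096);
	drm_intel_gem_bo_disable_implicit_sync(bo);
	/* every execbuf containing bo now carries EXEC_OBJECT_ASYNC;
	 * all ordering against other users is the client's problem
	 */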

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 intel/intel_bufmgr.h     |  2 ++
 intel/intel_bufmgr_gem.c | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index a7285b7..f4b9b0e 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -184,6 +184,8 @@ int drm_intel_gem_bo_map_unsynchronized(drm_intel_bo *bo);
 int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo);
 int drm_intel_gem_bo_unmap_gtt(drm_intel_bo *bo);
 
+void drm_intel_gem_bo_disable_implicit_sync(drm_intel_bo *bo);
+
 void *drm_intel_gem_bo_map__cpu(drm_intel_bo *bo);
 void *drm_intel_gem_bo_map__gtt(drm_intel_bo *bo);
 void *drm_intel_gem_bo_map__wc(drm_intel_bo *bo);
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 004aeb1..259543f 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -73,6 +73,8 @@
 #define VG(x)
 #endif
 
+#define EXEC_OBJECT_ASYNC	(1 << 6)
+
 #define memclear(s) memset(&s, 0, sizeof(s))
 
 #define DBG(...) do {					\
@@ -190,8 +192,11 @@ struct _drm_intel_bo_gem {
 	uint32_t swizzle_mode;
 	unsigned long stride;
 
+	unsigned long kflags;
+
 	time_t free_time;
 
+
 	/** Array passed to the DRM containing relocation information. */
 	struct drm_i915_gem_relocation_entry *relocs;
 	/**
@@ -570,12 +575,11 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence)
 	bufmgr_gem->exec2_objects[index].relocation_count = bo_gem->reloc_count;
 	bufmgr_gem->exec2_objects[index].relocs_ptr = (uintptr_t)bo_gem->relocs;
 	bufmgr_gem->exec2_objects[index].alignment = bo->align;
-	bufmgr_gem->exec2_objects[index].offset = bo_gem->is_softpin ?
-		bo->offset64 : 0;
-	bufmgr_gem->exec_bos[index] = bo;
-	bufmgr_gem->exec2_objects[index].flags = flags;
+	bufmgr_gem->exec2_objects[index].offset = bo->offset64;
+	bufmgr_gem->exec2_objects[index].flags = flags | bo_gem->kflags;
 	bufmgr_gem->exec2_objects[index].rsvd1 = 0;
 	bufmgr_gem->exec2_objects[index].rsvd2 = 0;
+	bufmgr_gem->exec_bos[index] = bo;
 	bufmgr_gem->exec_count++;
 }
 
@@ -1347,6 +1351,7 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
 	for (i = 0; i < bo_gem->softpin_target_count; i++)
 		drm_intel_gem_bo_unreference_locked_timed(bo_gem->softpin_target[i],
 								  time);
+	bo_gem->kflags = 0;
 	bo_gem->reloc_count = 0;
 	bo_gem->used_as_reloc_target = false;
 	bo_gem->softpin_target_count = 0;
@@ -2758,6 +2763,27 @@ drm_intel_bufmgr_gem_enable_reuse(drm_intel_bufmgr *bufmgr)
 }
 
 /**
+ * Disables implicit synchronisation before executing the bo
+ *
+ * This will cause rendering corruption unless you correctly manage explicit
+ * fences for all rendering involving this buffer - including use by others.
+ * Disabling the implicit serialisation is only required if that serialisation
+ * is too coarse (for example, you have split the buffer into many
+ * non-overlapping regions and are sharing the whole buffer between concurrent
+ * independent command streams).
+ *
+ * Note the kernel must advertise support via I915_PARAM_HAS_EXEC_ASYNC or
+ * subsequent execbufs involving the bo will generate EINVAL.
+ */
+void
+drm_intel_gem_bo_disable_implicit_sync(drm_intel_bo *bo)
+{
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+
+	bo_gem->kflags |= EXEC_OBJECT_ASYNC;
+}
+
+/**
  * Enable use of fenced reloc type.
  *
  * New code should enable this to avoid unnecessary fence register
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH libdrm 15/15] intel: Support passing of explicit fencing from execbuf
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (13 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH libdrm 14/15] intel: Allow the client to control implicit synchronisation Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-09-30 20:53   ` Rafael Antognolli
  2016-08-25  9:08 ` [PATCH 16/21] i965: Add explicit fence tracking to batch flush Chris Wilson
                   ` (5 subsequent siblings)
  20 siblings, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Allow the caller to pass in an fd referring to an array of fences to
control serialisation of the execbuf in the kernel and on the GPU, and
in return allow creation of a fence fd for signaling the completion (and
flushing) of the batch. When the returned fence is signaled, all writes
to the buffers inside the batch will be complete and coherent for the
CPU and other consumers. The returned fence is a sync_file object and
can be passed to other users (such as atomic modesetting, or other drivers).
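
A hedged sketch of the new entry point (batch_bo, ctx, used_bytes and
in_fence are placeholders; pass -1/NULL to opt out of either fence):

	int out_fence = -1;
	ret = drm_intel_gem_bo_fence_exec(batch_bo, ctx, used_bytes,
					  in_fence, &out_fence,
					  I915_EXEC_RENDER);
	/* on success out_fence is a sync_file fd signaled when the
	 * batch completes; pass it along or close() it
	 */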

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 intel/intel_bufmgr.h     |  6 ++++++
 intel/intel_bufmgr_gem.c | 38 +++++++++++++++++++++++++++++++++-----
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index f4b9b0e..1688312 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -217,6 +217,12 @@ drm_intel_context *drm_intel_gem_context_create(drm_intel_bufmgr *bufmgr);
 void drm_intel_gem_context_destroy(drm_intel_context *ctx);
 int drm_intel_gem_bo_context_exec(drm_intel_bo *bo, drm_intel_context *ctx,
 				  int used, unsigned int flags);
+int drm_intel_gem_bo_fence_exec(drm_intel_bo *bo,
+				drm_intel_context *ctx,
+				int used,
+				int in_fence,
+				int *out_fence,
+				unsigned int flags);
 
 int drm_intel_bo_gem_export_to_prime(drm_intel_bo *bo, int *prime_fd);
 drm_intel_bo *drm_intel_bo_gem_create_from_prime(drm_intel_bufmgr *bufmgr,
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 259543f..02ec757 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -75,6 +75,12 @@
 
 #define EXEC_OBJECT_ASYNC	(1 << 6)
 
+#define I915_EXEC_FENCE_IN	(1 << 16)
+#define I915_EXEC_FENCE_OUT	(1 << 17)
+
+#define DRM_I915_GEM_EXECBUFFER2_WR     DRM_I915_GEM_EXECBUFFER2
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR       DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
+
 #define memclear(s) memset(&s, 0, sizeof(s))
 
 #define DBG(...) do {					\
@@ -2359,6 +2365,7 @@ drm_intel_gem_bo_exec(drm_intel_bo *bo, int used,
 static int
 do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx,
 	 drm_clip_rect_t *cliprects, int num_cliprects, int DR4,
+	 int in_fence, int *out_fence,
 	 unsigned int flags)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bo->bufmgr;
@@ -2412,13 +2419,20 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx,
 		i915_execbuffer2_set_context_id(execbuf, 0);
 	else
 		i915_execbuffer2_set_context_id(execbuf, ctx->ctx_id);
-	execbuf.rsvd2 = 0;
+	if (in_fence != -1) {
+		execbuf.rsvd2 = in_fence;
+		flags |= I915_EXEC_FENCE_IN;
+	}
+	if (out_fence != NULL) {
+		*out_fence = -1;
+		flags |= I915_EXEC_FENCE_OUT;
+	}
 
 	if (bufmgr_gem->no_exec)
 		goto skip_execution;
 
 	ret = drmIoctl(bufmgr_gem->fd,
-		       DRM_IOCTL_I915_GEM_EXECBUFFER2,
+		       DRM_IOCTL_I915_GEM_EXECBUFFER2_WR,
 		       &execbuf);
 	if (ret != 0) {
 		ret = -errno;
@@ -2434,6 +2448,9 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx,
 	}
 	drm_intel_update_buffer_offsets2(bufmgr_gem);
 
+	if (out_fence != NULL)
+		*out_fence = execbuf.rsvd2 >> 32;
+
 skip_execution:
 	if (bufmgr_gem->bufmgr.debug)
 		drm_intel_gem_dump_validation_list(bufmgr_gem);
@@ -2459,7 +2476,7 @@ drm_intel_gem_bo_exec2(drm_intel_bo *bo, int used,
 		       int DR4)
 {
 	return do_exec2(bo, used, NULL, cliprects, num_cliprects, DR4,
-			I915_EXEC_RENDER);
+			-1, NULL, I915_EXEC_RENDER);
 }
 
 static int
@@ -2468,14 +2485,25 @@ drm_intel_gem_bo_mrb_exec2(drm_intel_bo *bo, int used,
 			unsigned int flags)
 {
 	return do_exec2(bo, used, NULL, cliprects, num_cliprects, DR4,
-			flags);
+			-1, NULL, flags);
 }
 
 int
 drm_intel_gem_bo_context_exec(drm_intel_bo *bo, drm_intel_context *ctx,
 			      int used, unsigned int flags)
 {
-	return do_exec2(bo, used, ctx, NULL, 0, 0, flags);
+	return do_exec2(bo, used, ctx, NULL, 0, 0, -1, NULL, flags);
+}
+
+int
+drm_intel_gem_bo_fence_exec(drm_intel_bo *bo,
+			    drm_intel_context *ctx,
+			    int used,
+			    int in_fence,
+			    int *out_fence,
+			    unsigned int flags)
+{
+	return do_exec2(bo, used, ctx, NULL, 0, 0, in_fence, out_fence, flags);
 }
 
 static int
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 16/21] i965: Add explicit fence tracking to batch flush
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (14 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH libdrm 15/15] intel: Support passing of explicit fencing from execbuf Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH 17/21] i965: Split intel_syncobject into vfuncs Chris Wilson
                   ` (4 subsequent siblings)
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Wire up the infrastructure to pass in and receive back a file descriptor
describing a fence (a sync_file object in the kernel) associated with
the batch. Fences can be passed in that the GPU must wait for before
executing the batch. Ideally those waits with neither block the client
nor the GPU from performing other tasks until the fences are signaled.
All batches submitted after this are only executed after the first batch,
the GL command stream is always ordered. Furthermore, the client can
request a fence associated with this point in the GL command stream that
can be passed to others (and used to serialise other GL command
streams). See EGL_ANDROID_native_fence_sync.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/brw_context.h       |  2 ++
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 29 ++++++++++++++++++---------
 src/mesa/drivers/dri/i965/intel_batchbuffer.h |  7 +++++--
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 33a919b..f2dd164 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -625,6 +625,8 @@ struct intel_batchbuffer {
    uint32_t *cpu_map;
 #define BATCH_SZ (8192*sizeof(uint32_t))
 
+   int fence; /* fd to array of fences to wait on before GPU executes batch */
+
    uint32_t state_batch_offset;
    enum brw_gpu_ring ring;
    bool needs_sol_reset;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index caa33f8..d60f95b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -41,6 +41,8 @@ intel_batchbuffer_reset(struct brw_context *brw);
 void
 intel_batchbuffer_init(struct brw_context *brw)
 {
+   brw->batch.fence = -1;
+
    intel_batchbuffer_reset(brw);
 
    if (!brw->has_llc) {
@@ -59,6 +61,11 @@ intel_batchbuffer_reset(struct brw_context *brw)
    }
    brw->batch.last_bo = brw->batch.bo;
 
+   if (brw->batch.fence != -1) {
+      close(brw->batch.fence);
+      brw->batch.fence = -1;
+   }
+
    brw_render_cache_set_clear(brw);
 
    brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
@@ -101,6 +108,9 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 void
 intel_batchbuffer_free(struct brw_context *brw)
 {
+   if (brw->batch.fence != -1)
+      close(brw->batch.fence);
+
    free(brw->batch.cpu_map);
    drm_intel_bo_unreference(brw->batch.last_bo);
    drm_intel_bo_unreference(brw->batch.bo);
@@ -319,7 +329,7 @@ throttle(struct brw_context *brw)
 /* TODO: Push this whole function into bufmgr.
  */
 static int
-do_flush_locked(struct brw_context *brw)
+do_flush_locked(struct brw_context *brw, int *out_fence)
 {
    struct intel_batchbuffer *batch = &brw->batch;
    int ret = 0;
@@ -337,11 +347,14 @@ do_flush_locked(struct brw_context *brw)
    }
 
    if (!brw->intelScreen->no_hw) {
+      drm_intel_context *ctx;
       int flags;
 
       if (brw->gen >= 6 && batch->ring == BLT_RING) {
+         ctx = NULL;
          flags = I915_EXEC_BLT;
       } else {
+         ctx = brw->hw_ctx;
          flags = I915_EXEC_RENDER |
             (brw->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0);
       }
@@ -352,13 +365,10 @@ do_flush_locked(struct brw_context *brw)
          if (unlikely(INTEL_DEBUG & DEBUG_AUB))
             brw_annotate_aub(brw);
 
-	 if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
-            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
-                                        NULL, 0, 0, flags);
-	 } else {
-	    ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
-                                                4 * USED_BATCH(*batch), flags);
-	 }
+         ret = drm_intel_gem_bo_fence_exec(batch->bo, ctx,
+                                           4 * USED_BATCH(*batch),
+                                           batch->fence, out_fence,
+                                           flags);
       }
 
       throttle(brw);
@@ -380,6 +390,7 @@ do_flush_locked(struct brw_context *brw)
 
 int
 _intel_batchbuffer_flush(struct brw_context *brw,
+			 int *out_fence,
 			 const char *file, int line)
 {
    int ret;
@@ -419,7 +430,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
    /* Check that we didn't just wrap our batchbuffer at a bad time. */
    assert(!brw->no_batch_wrap);
 
-   ret = do_flush_locked(brw);
+   ret = do_flush_locked(brw, out_fence);
 
    if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
       fprintf(stderr, "waiting for idle\n");
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index fbb5158..e966d7a 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -46,11 +46,14 @@ void intel_batchbuffer_reset_to_saved(struct brw_context *brw);
 void intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                      enum brw_gpu_ring ring);
 
-int _intel_batchbuffer_flush(struct brw_context *brw,
+int _intel_batchbuffer_flush(struct brw_context *brw, int *out_fence,
 			     const char *file, int line);
 
 #define intel_batchbuffer_flush(intel) \
-	_intel_batchbuffer_flush(intel, __FILE__, __LINE__)
+	_intel_batchbuffer_flush(intel, NULL, __FILE__, __LINE__)
+
+#define intel_batchbuffer_flush_fence(intel, fence) \
+	_intel_batchbuffer_flush(intel, fence, __FILE__, __LINE__)
 
 
 
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 17/21] i965: Split intel_syncobject into vfuncs
  2016-08-25  9:08 Shortest path to EGL_ANDROID_native_sync Chris Wilson
                   ` (15 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 16/21] i965: Add explicit fence tracking to batch flush Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH 18/21] i965: Add fd-fence backend to intel_syncobject Chris Wilson
                   ` (3 subsequent siblings)
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Separate out the underlying fence implementation for the sync object so
that we can extend the internal seqno based fence with an external fd
fence in the next few patches.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/intel_syncobj.c | 33 ++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index d5071f6..14fc4b8 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -76,7 +76,16 @@ void brw_context_fini_fences(struct brw_context *brw)
    munmap(brw->fences.map, 4096);
 }
 
+struct brw_fence;
+
+struct brw_fence_ops {
+   bool (*check)(struct brw_fence *fence);
+   bool (*client_wait)(struct brw_fence *, uint64_t timeout);
+   void (*server_wait)(struct brw_fence *, struct brw_context *brw);
+};
+
 struct brw_fence {
+   const struct brw_fence_ops *ops;
    drm_intel_bo *batch;
    uint32_t *hw_seqno;
    uint32_t seqno;
@@ -102,6 +111,11 @@ static void seqno_insert(struct brw_fence *fence, struct brw_context *brw)
    fence->batch = brw->batch.bo;
 }
 
+static bool seqno_check(struct brw_fence *fence)
+{
+   return seqno_passed(fence);
+}
+
 static bool seqno_client_wait(struct brw_fence *fence, uint64_t timeout)
 {
    if (seqno_passed(fence))
@@ -132,6 +146,12 @@ static void seqno_server_wait(struct brw_fence *fence, struct brw_context *brw)
     */
 }
 
+static const struct brw_fence_ops seqno_ops = {
+   .check = seqno_check,
+   .client_wait = seqno_client_wait,
+   .server_wait = seqno_server_wait,
+};
+
 static void brw_fence_finish(struct brw_fence *fence)
 {
    if (fence->batch)
@@ -152,6 +172,7 @@ intel_gl_new_sync_object(struct gl_context *ctx, GLuint id)
    if (!sync)
       return NULL;
 
+   sync->fence.ops = &seqno_ops;
    return &sync->Base;
 }
 
@@ -171,6 +192,7 @@ intel_gl_fence_sync(struct gl_context *ctx, struct gl_sync_object *s,
    struct brw_context *brw = brw_context(ctx);
    struct intel_gl_sync_object *sync = (struct intel_gl_sync_object *)s;
 
+   assert(sync->fence.ops == &seqno_ops);
    seqno_insert(&sync->fence, brw);
 }
 
@@ -184,7 +206,7 @@ intel_gl_client_wait_sync(struct gl_context *ctx, struct gl_sync_object *s,
    if (sync->fence.batch == brw->batch.bo)
       intel_batchbuffer_flush(brw);
 
-   if (seqno_client_wait(&sync->fence, timeout))
+   if (sync->fence.ops->client_wait(&sync->fence, timeout))
       s->StatusFlag = 1;
 }
 
@@ -195,7 +217,7 @@ intel_gl_server_wait_sync(struct gl_context *ctx, struct gl_sync_object *s,
    struct brw_context *brw = brw_context(ctx);
    struct intel_gl_sync_object *sync = (struct intel_gl_sync_object *)s;
 
-   seqno_server_wait(&sync->fence, brw);
+   sync->fence.ops->server_wait(&sync->fence, brw);
 }
 
 static void
@@ -207,7 +229,7 @@ intel_gl_check_sync(struct gl_context *ctx, struct gl_sync_object *s)
    if (sync->fence.batch == brw->batch.bo)
       intel_batchbuffer_flush(brw);
 
-   if (seqno_passed(&sync->fence))
+   if (sync->fence.ops->check(&sync->fence))
       s->StatusFlag = 1;
 }
 
@@ -232,6 +254,7 @@ intel_dri_create_fence(__DRIcontext *ctx)
    if (!fence)
       return NULL;
 
+   fence->ops = &seqno_ops;
    seqno_insert(fence, brw);
 
    return fence;
@@ -258,7 +281,7 @@ intel_dri_client_wait_sync(__DRIcontext *ctx, void *driver_fence, unsigned flags
          intel_batchbuffer_flush(brw);
    }
 
-   return seqno_client_wait(fence, timeout);
+   return fence->ops->client_wait(fence, timeout);
 }
 
 static void
@@ -273,7 +296,7 @@ intel_dri_server_wait_sync(__DRIcontext *ctx, void *driver_fence, unsigned flags
    if (!fence)
       return;
 
-   seqno_server_wait(fence, brw);
+   fence->ops->server_wait(fence, brw);
 }
 
 const __DRI2fenceExtension intelFenceExtension = {
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 18/21] i965: Add fd-fence backend to intel_syncobject
  2016-08-25  9:08 Shortest path to EGL_ANDRIOD_native_sync Chris Wilson
                   ` (16 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 17/21] i965: Split intel_syncobject into vfuncs Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH 19/21] rfc! i965: Add intel_screen::has_fence_fd Chris Wilson
                   ` (2 subsequent siblings)
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Handle querying and waiting on an external fence fd as opposed to the
internal seqno fence. The key difference with the fd fences is that we
can pass these external fences to the kernel/GPU for it to
asynchronously wait upon (internal seqno fences are ordered by the GL
command stream, hence do not need injection).
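
For reference, querying a fence fd reduces to the usual poll(2) idiom
(a sketch of what the patch wires up below; sync_file reports
completion as POLLIN):

   struct pollfd pfd = { .fd = fence_fd, .events = POLLIN };
   bool signaled = poll(&pfd, 1, 0) == 1; /* zero timeout: just query */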

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/intel_syncobj.c | 84 +++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index 14fc4b8..b5df498 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -39,6 +39,8 @@
  */
 
 #include <sys/mman.h>
+#include <sys/poll.h>
+#include <sys/ioctl.h>
 
 #include "main/imports.h"
 
@@ -89,6 +91,7 @@ struct brw_fence {
    drm_intel_bo *batch;
    uint32_t *hw_seqno;
    uint32_t seqno;
+   int handle;
 };
 
 static inline bool seqno_passed(const struct brw_fence *fence)
@@ -152,10 +155,89 @@ static const struct brw_fence_ops seqno_ops = {
    .server_wait = seqno_server_wait,
 };
 
+static inline bool fd_poll(const struct brw_fence *fence, int timeout)
+{
+   struct pollfd pfd = { .fd = fence->handle, .events = POLLIN };
+   return poll(&pfd, 1, timeout) == 1;
+}
+
+static bool fd_check(struct brw_fence *fence)
+{
+   return fd_poll(fence, 0);
+}
+
+static bool fd_client_wait(struct brw_fence *fence, uint64_t timeout)
+{
+   int msecs;
+
+   msecs = -1;
+   if (timeout != __DRI2_FENCE_TIMEOUT_INFINITE) {
+      timeout /= 1000 * 1000; /* nsecs to msecs */
+      if (timeout < INT_MAX)
+         msecs = timeout;
+   }
+
+   return fd_poll(fence, msecs);
+}
+
+/*** libsync ***/
+
+struct sync_merge_data {
+   char    name[32];
+   __s32   fd2;
+   __s32   fence;
+   __u32   flags;
+   __u32   pad;
+};
+
+#define SYNC_IOC_MAGIC         '>'
+#define SYNC_IOC_MERGE         _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data)
+
+static int sync_merge(int fd1, int fd2)
+{
+   struct sync_merge_data data;
+
+   if (fd1 == -1)
+      return dup(fd2);
+
+   if (fd2 == -1)
+      return dup(fd1);
+
+   memset(&data, 0, sizeof(data));
+   data.fd2 = fd2;
+   strcpy(data.name, "i965");
+
+   if (ioctl(fd1, SYNC_IOC_MERGE, &data))
+      return -errno;
+
+   return data.fence;
+}
+
+/*** !libsync ***/
+
+static void fd_server_wait(struct brw_fence *fence, struct brw_context *brw)
+{
+   if (fence->batch)
+      return;
+
+   brw->batch.fence = sync_merge(brw->batch.fence, fence->handle);
+
+   drm_intel_bo_reference(brw->batch.bo);
+   fence->batch = brw->batch.bo;
+}
+
+static const struct brw_fence_ops fd_ops = {
+   .check = fd_check,
+   .client_wait = fd_client_wait,
+   .server_wait = fd_server_wait,
+};
+
 static void brw_fence_finish(struct brw_fence *fence)
 {
    if (fence->batch)
       drm_intel_bo_unreference(fence->batch);
+   if (fence->handle != -1)
+      close(fence->handle);
 }
 
 struct intel_gl_sync_object {
@@ -172,6 +254,7 @@ intel_gl_new_sync_object(struct gl_context *ctx, GLuint id)
    if (!sync)
       return NULL;
 
+   sync->fence.handle = -1;
    sync->fence.ops = &seqno_ops;
    return &sync->Base;
 }
@@ -254,6 +337,7 @@ intel_dri_create_fence(__DRIcontext *ctx)
    if (!fence)
       return NULL;
 
+   fence->handle = -1;
    fence->ops = &seqno_ops;
    seqno_insert(fence, brw);
 
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 19/21] rfc! i965: Add intel_screen::has_fence_fd
  2016-08-25  9:08 Shortest path to EGL_ANDRIOD_native_sync Chris Wilson
                   ` (17 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 18/21] i965: Add fd-fence backend to intel_syncobject Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH 20/21] i965: Implement EGL_ANDROID_native_fence_sync support for DRI2_FENCE Chris Wilson
  2016-08-25  9:08 ` [PATCH 21/21] i965: Disable implicit sync when using EGL_ANDROID_native_fence_sync Chris Wilson
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chad Versace

From: Chad Versace <chad.versace@intel.com>

This bool maps to I915_PARAM_HAS_EXEC_FENCE.

TODO: The i915 param is not yet upstream.  Wait for the kernel interface
  before committing.
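
For reference, intel_get_boolean() reduces to the usual getparam idiom
(a sketch; error handling elided, fd being the DRM device fd):

   int val = 0;
   struct drm_i915_getparam gp = {
      .param = I915_PARAM_HAS_EXEC_FENCE,
      .value = &val,
   };
   if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0)
      intelScreen->has_fence_fd = val;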
---
 src/mesa/drivers/dri/i965/intel_screen.c | 4 ++++
 src/mesa/drivers/dri/i965/intel_screen.h | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 9df888d..1a32295 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1637,6 +1637,10 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
         intel_get_boolean(intelScreen, I915_PARAM_HAS_RESOURCE_STREAMER);
    }
 
+#define I915_PARAM_HAS_EXEC_FENCE 42
+   intelScreen->has_fence_fd =
+     intel_get_boolean(intelScreen, I915_PARAM_HAS_EXEC_FENCE);
+
    return (const __DRIconfig**) intel_screen_make_configs(psp);
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index 98f9d24..d167758 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -45,8 +45,8 @@ struct intel_screen
    __DRIscreen *driScrnPriv;
 
    bool no_hw;
-
    bool hw_has_swizzling;
+   bool has_fence_fd;
 
    int hw_has_timestamp;
 
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 20/21] i965: Implement EGL_ANDROID_native_fence_sync support for DRI2_FENCE
  2016-08-25  9:08 Shortest path to EGL_ANDRIOD_native_sync Chris Wilson
                   ` (18 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 19/21] rfc! i965: Add intel_screen::has_fence_fd Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  2016-08-25  9:08 ` [PATCH 21/21] i965: Disable implicit sync when using EGL_ANDROID_native_fence_sync Chris Wilson
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx

Wire up the callbacks from DRI2_FENCE for native fence support.
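
For context, the client-side flow this enables is roughly (a sketch;
EGL boilerplate such as eglGetProcAddress() elided):

   EGLSyncKHR sync =
      eglCreateSyncKHR(dpy, EGL_SYNC_NATIVE_FENCE_ANDROID, NULL);
   glFlush(); /* the fd is only materialised once the sync is flushed */
   int fd = eglDupNativeFenceFDANDROID(dpy, sync);
   /* pass fd to a co-operating process/driver; it can poll() it, or
    * import it back via EGL_SYNC_NATIVE_FENCE_FD_ANDROID */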

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/intel_syncobj.c | 48 +++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index b5df498..8ac5324 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -383,6 +383,51 @@ intel_dri_server_wait_sync(__DRIcontext *ctx, void *driver_fence, unsigned flags
    fence->ops->server_wait(fence, brw);
 }
 
+static unsigned
+intel_dri_get_capabilities(__DRIscreen *screen)
+{
+   struct intel_screen *intelScreen = screen->driverPrivate;
+   unsigned caps = 0;
+
+   if (intelScreen->has_fence_fd)
+      caps |= __DRI_FENCE_CAP_NATIVE_FD;
+
+   return caps;
+}
+
+static void *
+intel_dri_create_fence_fd(__DRIcontext *ctx, int fd)
+{
+   struct brw_context *brw = ctx->driverPrivate;
+   struct brw_fence *fence;
+
+   if (fd == -1) {
+      fence = intel_dri_create_fence(ctx);
+
+      /* XXX defer the flush? Requires ctx later + locking */
+      intel_batchbuffer_flush_fence(brw, &fence->handle);
+
+      return fence;
+   }
+
+   fence = calloc(1, sizeof(*fence));
+   if (!fence)
+      return NULL;
+
+   fence->handle = fd;
+   fence->ops = &fd_ops;
+
+   return fence;
+}
+
+static int
+intel_dri_get_fence_fd(__DRIscreen *screen, void *_fence)
+{
+   struct brw_fence *fence = _fence;
+
+   return dup(fence->handle);
+}
+
 const __DRI2fenceExtension intelFenceExtension = {
    .base = { __DRI2_FENCE, 1 },
 
@@ -391,4 +436,7 @@ const __DRI2fenceExtension intelFenceExtension = {
    .client_wait_sync = intel_dri_client_wait_sync,
    .server_wait_sync = intel_dri_server_wait_sync,
    .get_fence_from_cl_event = NULL,
+   .get_capabilities = intel_dri_get_capabilities,
+   .create_fence_fd = intel_dri_create_fence_fd,
+   .get_fence_fd = intel_dri_get_fence_fd,
 };
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [PATCH 21/21] i965: Disable implicit sync when using EGL_ANDROID_native_fence_sync
  2016-08-25  9:08 Shortest path to EGL_ANDRIOD_native_sync Chris Wilson
                   ` (19 preceding siblings ...)
  2016-08-25  9:08 ` [PATCH 20/21] i965: Implement EGL_ANDROID_native_fence_sync support for DRI2_FENCE Chris Wilson
@ 2016-08-25  9:08 ` Chris Wilson
  20 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25  9:08 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

If the client is controlling serialisation on shared buffers using
EGL_ANDROID_native_fence_sync, we assume that it is capable of fully
describing the serialisation required on those buffers, and so disable
the implicit syncs to prevent any "undue" serialisation between
rendering operations.

This shotgun is likely too large - but it appears to be the de facto
standard on Android, and there is not yet a protocol for passing a
buffer and its fence between co-operating processes, other than the
existing implicit fences carried on the dma-buf!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@intel.com>
---
 src/mesa/drivers/dri/i965/intel_screen.c  | 7 +++++++
 src/mesa/drivers/dri/i965/intel_screen.h  | 1 +
 src/mesa/drivers/dri/i965/intel_syncobj.c | 8 ++++++++
 3 files changed, 16 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 1a32295..98f1c76 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -751,6 +751,13 @@ intel_create_image_from_fds(__DRIscreen *screen,
       return NULL;
    }
 
+   /* If the GL clients are passing around EGL_ANDROID_native_fence_sync
+    * we presume that they do not want (or require!) any implicit
+    * serialisation. This affects all contexts using this surface.
+    */
+   if (intelScreen->explicit_fencing)
+      drm_intel_gem_bo_disable_implicit_sync(image->bo);
+
    if (f->nplanes == 1) {
       image->offset = image->offsets[0];
       intel_image_warn_if_unaligned(image, __func__);
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index d167758..35cd5df 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -47,6 +47,7 @@ struct intel_screen
    bool no_hw;
    bool hw_has_swizzling;
    bool has_fence_fd;
+   bool explicit_fencing;
 
    int hw_has_timestamp;
 
diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index 8ac5324..3fc3820 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -417,6 +417,14 @@ intel_dri_create_fence_fd(__DRIcontext *ctx, int fd)
    fence->handle = fd;
    fence->ops = &fd_ops;
 
+   /* Explicit fencing requires disabling implicit fences for all shared
+    * buffers (so that the client is wholly responsible for all stalls,
+    * and rendering corruption).
+    *
+    * This is not stated in any spec, but is the de facto standard on Android.
+    */
+   brw->intelScreen->explicit_fencing = true;
+
    return fence;
 }
 
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* Re: [PATCH 03/13] drm/i915: Record the position of the workarounds in the tail of the request
  2016-08-25  9:08 ` [PATCH 03/13] drm/i915: Record the position of the workarounds in the tail of the request Chris Wilson
@ 2016-08-25  9:35   ` Mika Kuoppala
  0 siblings, 0 replies; 49+ messages in thread
From: Mika Kuoppala @ 2016-08-25  9:35 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Rather than blindly assuming we need to advance the tail for
> resubmitting the request via the ELSP, record the position.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> ---
>  drivers/gpu/drm/i915/i915_gem_request.h | 15 +++++++++------
>  drivers/gpu/drm/i915/intel_lrc.c        |  4 ++--
>  2 files changed, 11 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 91014de8bfbc..2faa3bb4c39b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -88,20 +88,23 @@ struct drm_i915_gem_request {
>  	 */
>  	u32 previous_seqno;
>  
> -	/** Position in the ringbuffer of the start of the request */
> +	/** Position in the ring of the start of the request */
>  	u32 head;
>  
>  	/**
> -	 * Position in the ringbuffer of the start of the postfix.
> -	 * This is required to calculate the maximum available ringbuffer
> -	 * space without overwriting the postfix.
> +	 * Position in the ring of the start of the postfix.
> +	 * This is required to calculate the maximum available ring space
> +	 * without overwriting the postfix.
>  	 */
>  	u32 postfix;
>  
> -	/** Position in the ringbuffer of the end of the whole request */
> +	/** Position in the ring of the end of the whole request */
>  	u32 tail;
>  
> -	/** Preallocate space in the ringbuffer for the emitting the request */
> +	/** Position in the ring of the end of any workarounds after the tail */
> +	u32 wa_tail;
> +
> +	/** Preallocate space in the ring for the emitting the request */
>  	u32 reserved_space;
>  
>  	/**
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ca52b8e63305..25e0972e7166 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -482,8 +482,7 @@ static void execlists_unqueue(struct intel_engine_cs *engine)
>  		 * resubmit the request. See gen8_emit_request() for where we
>  		 * prepare the padding after the end of the request.
>  		 */
> -		req0->tail += 8;
> -		req0->tail &= req0->ring->size - 1;
> +		req0->tail = req0->wa_tail;
>  	}
>  
>  	execlists_elsp_submit_contexts(req0, req1);
> @@ -711,6 +710,7 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
>  	intel_ring_emit(ring, MI_NOOP);
>  	intel_ring_emit(ring, MI_NOOP);
>  	intel_ring_advance(ring);
> +	request->wa_tail = ring->tail;
>  
>  	/* We keep the previous context alive until we retire the following
>  	 * request. This ensures that any the context object is still pinned
> -- 
> 2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 04/13] drm/i915: Compute the ELSP register location once
  2016-08-25  9:08 ` [PATCH 04/13] drm/i915: Compute the ELSP register location once Chris Wilson
@ 2016-08-25  9:51   ` Mika Kuoppala
  2016-08-25 10:37     ` Chris Wilson
  0 siblings, 1 reply; 49+ messages in thread
From: Mika Kuoppala @ 2016-08-25  9:51 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Similar to the issue with reading from the context status buffer, we
> frequently write to the ELSP register (4 writes per interrupt) and know
> we hold the required spinlock and forcewake throughout. We can
> shortcircuit the I915_WRITE() by precomputing the address of the ELSP

I915_WRITE_FW is for these situations, so the commit message
seems stale in this part.

> register and avoid all the known checks.

...and the checking part is a false claim?

We do get the precomputation of the register location and the removal
of the superfluous posting write, so it is an improvement.

s/I915_WRITE/I915_WRITE_FW/ and remove the mention of the checks
as I didn't find any.

With that,
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 16 +++++++---------
>  1 file changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 25e0972e7166..15873e69e7fe 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -331,10 +331,11 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
>  static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>  				 struct drm_i915_gem_request *rq1)
>  {
> -
>  	struct intel_engine_cs *engine = rq0->engine;
>  	struct drm_i915_private *dev_priv = rq0->i915;
> -	uint64_t desc[2];
> +	u32 __iomem *elsp =
> +		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> +	u64 desc[2];
>  
>  	if (rq1) {
>  		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
> @@ -347,15 +348,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>  	rq0->elsp_submitted++;
>  
>  	/* You must always write both descriptors in the order below. */
> -	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
> -	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
> +	writel(upper_32_bits(desc[1]), elsp);
> +	writel(lower_32_bits(desc[1]), elsp);
>  
> -	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
> +	writel(upper_32_bits(desc[0]), elsp);
>  	/* The context is automatically loaded after the following */
> -	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
> -
> -	/* ELSP is a wo register, use another nearby reg for posting */
> -	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
> +	writel(lower_32_bits(desc[0]), elsp);
>  }
>  
>  static void
> -- 
> 2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 04/13] drm/i915: Compute the ELSP register location once
  2016-08-25  9:51   ` Mika Kuoppala
@ 2016-08-25 10:37     ` Chris Wilson
  0 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25 10:37 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Thu, Aug 25, 2016 at 12:51:16PM +0300, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Similar to the issue with reading from the context status buffer, we
> > frequently write to the ELSP register (4 writes per interrupt) and know
> > we hold the required spinlock and forcewake throughout. We can
> > shortcircuit the I915_WRITE() by precomputing the address of the ELSP
> 
> I915_WRITE_FW are for these situations so the commit message
> seems stale in this part.
> 
> > register and avoid all the known checks.
> 
> ...and the checking part is false claim? 

I was thinking of WRITE_NOTRACE :|

> We get precomputing the loc and removal of the superfluous
> posting write so it is an improvement.

Yup, tightened up the language to not claim what we already did in
26720ab97fea.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences
  2016-08-25  9:08 ` [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences Chris Wilson
@ 2016-08-25 10:42   ` John Harrison
  2016-08-25 10:51     ` Chris Wilson
  2016-08-25 11:49   ` Joonas Lahtinen
  1 sibling, 1 reply; 49+ messages in thread
From: John Harrison @ 2016-08-25 10:42 UTC (permalink / raw)
  To: intel-gfx

Writing up the IRC discussion...

I think the naming of _complete() vs _done() is unnecessarily confusing 
- the first being an action and the second a query. Chris says that the 
idea is to match the existing naming convention of 'struct completion'. 
If the plan is to eventually move this code out of i915 and into the 
kernel proper then I guess following like a lemming is probably the way 
to go :(.
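
For reference, the 'struct completion' convention being copied is
(a sketch):

	struct completion done;

	init_completion(&done);
	complete(&done);                  /* the action */
	WARN_ON(!completion_done(&done)); /* the query */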

I also don't like the idea of packing the flags into the lower bits of a 
function pointer. Or going on the naming, packing a function pointer 
into the upper bits of a flags word. There was a comment about atomic 
updates of both flags and pointer but that is never actually used in the 
code. Chris says it is only for better structure packing. I'm not 
convinced the benefit is worth the confusion and maintenance risk. E.g. 
is unsigned long guaranteed to always and ever be sufficient for a pointer?
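
To spell out the scheme under debate (my reading of the patch,
simplified):

	/* __i915_sw_fence_call forces fn to 4-byte alignment, leaving
	 * bits 1:0 free for the two flag bits */
	fence->flags = (unsigned long)fn;
	__set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags);
	fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK);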

John.

On 25/08/2016 10:08, Chris Wilson wrote:
> This is really a core kernel struct in disguise until we can finally
> place it in kernel/. There is an immediate need for a fence collection
> mechanism that is more flexible than fence-array, in particular being
> able to easily drive request submission via events (and not just
> interrupt driven). The same mechanism would be useful for handling
> nonblocking and asynchronous atomic modesets, parallel execution and
> more, but for the time being just create a local sw fence for execbuf.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/Makefile        |   1 +
>   drivers/gpu/drm/i915/i915_sw_fence.c | 329 +++++++++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_sw_fence.h |  56 ++++++
>   3 files changed, 386 insertions(+)
>   create mode 100644 drivers/gpu/drm/i915/i915_sw_fence.c
>   create mode 100644 drivers/gpu/drm/i915/i915_sw_fence.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index a7da24640e88..a998c2bce70a 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -16,6 +16,7 @@ i915-y := i915_drv.o \
>   	  i915_params.o \
>   	  i915_pci.o \
>             i915_suspend.o \
> +	  i915_sw_fence.o \
>   	  i915_sysfs.o \
>   	  intel_csr.o \
>   	  intel_device_info.o \
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> new file mode 100644
> index 000000000000..44a02d836a94
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -0,0 +1,329 @@
> +/*
> + * (C) Copyright 2016 Intel Corporation
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; version 2
> + * of the License.
> + */
> +
> +#include <linux/slab.h>
> +#include <linux/fence.h>
> +#include <linux/reservation.h>
> +
> +#include "i915_sw_fence.h"
> +
> +static DEFINE_SPINLOCK(i915_sw_fence_lock);
> +
> +static int __i915_sw_fence_notify(struct i915_sw_fence *fence)
> +{
> +	i915_sw_fence_notify_t fn;
> +
> +	fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK);
> +	return fn(fence);
> +}
> +
> +static void i915_sw_fence_free(struct kref *kref)
> +{
> +	struct i915_sw_fence *fence = container_of(kref, typeof(*fence), kref);
> +
> +	WARN_ON(atomic_read(&fence->pending) > 0);
> +
> +	if (fence->flags & I915_SW_FENCE_MASK)
> +		WARN_ON(__i915_sw_fence_notify(fence) != NOTIFY_DONE);
> +	else
> +		kfree(fence);
> +}
> +
> +static void i915_sw_fence_put(struct i915_sw_fence *fence)
> +{
> +	kref_put(&fence->kref, i915_sw_fence_free);
> +}
> +
> +static struct i915_sw_fence *i915_sw_fence_get(struct i915_sw_fence *fence)
> +{
> +	kref_get(&fence->kref);
> +	return fence;
> +}
> +
> +static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence,
> +				 struct list_head *continuation)
> +{
> +	wait_queue_head_t *x = &fence->wait;
> +	unsigned long flags;
> +
> +	atomic_dec(&fence->pending);
> +
> +	/*
> +	 * To prevent unbounded recursion as we traverse the graph of i915_sw_fences,
> +	 * we move the task_list from this, the next ready fence, to the tail of
> +	 * the original fence's task_list (and so added to the list to be
> +	 * woken).
> +	 */
> +	smp_mb__before_spinlock();
> +	if (!list_empty_careful(&x->task_list)) {
> +		spin_lock_irqsave_nested(&x->lock, flags, 1 + !!continuation);
> +		if (continuation) {
> +			list_splice_tail_init(&x->task_list, continuation);
> +		} else {
> +			LIST_HEAD(extra);
> +
> +			do {
> +				__wake_up_locked_key(x, TASK_NORMAL, &extra);
> +
> +				if (list_empty(&extra))
> +					break;
> +
> +				list_splice_tail_init(&extra, &x->task_list);
> +			} while (1);
> +		}
> +		spin_unlock_irqrestore(&x->lock, flags);
> +	}
> +}
> +
> +static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
> +				     struct list_head *continuation)
> +{
> +	if (!atomic_dec_and_test(&fence->pending))
> +		return;
> +
> +	if (fence->flags & I915_SW_FENCE_MASK &&
> +	    __i915_sw_fence_notify(fence) != NOTIFY_DONE)
> +		return;
> +
> +	__i915_sw_fence_wake_up_all(fence, continuation);
> +}
> +
> +static void i915_sw_fence_complete(struct i915_sw_fence *fence)
> +{
> +	if (WARN_ON(i915_sw_fence_done(fence)))
> +		return;
> +
> +	__i915_sw_fence_complete(fence, NULL);
> +}
> +
> +static void i915_sw_fence_await(struct i915_sw_fence *fence)
> +{
> +	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> +}
> +
> +static void i915_sw_fence_wait(struct i915_sw_fence *fence)
> +{
> +	wait_event(fence->wait, i915_sw_fence_done(fence));
> +}
> +
> +void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn)
> +{
> +	BUG_ON((unsigned long)fn & ~I915_SW_FENCE_MASK);
> +
> +	init_waitqueue_head(&fence->wait);
> +	kref_init(&fence->kref);
> +	atomic_set(&fence->pending, 1);
> +	fence->flags = (unsigned long)fn;
> +}
> +
> +void i915_sw_fence_commit(struct i915_sw_fence *fence)
> +{
> +	i915_sw_fence_complete(fence);
> +	i915_sw_fence_put(fence);
> +}
> +
> +static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key)
> +{
> +	list_del(&wq->task_list);
> +	__i915_sw_fence_complete(wq->private, key);
> +	i915_sw_fence_put(wq->private);
> +	kfree(wq);
> +	return 0;
> +}
> +
> +static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
> +				    const struct i915_sw_fence * const signaler)
> +{
> +	wait_queue_t *wq;
> +
> +	if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
> +		return false;
> +
> +	if (fence == signaler)
> +		return true;
> +
> +	list_for_each_entry(wq, &fence->wait.task_list, task_list) {
> +		if (wq->func != i915_sw_fence_wake)
> +			continue;
> +
> +		if (__i915_sw_fence_check_if_after(wq->private, signaler))
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence)
> +{
> +	wait_queue_t *wq;
> +
> +	if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
> +		return;
> +
> +	list_for_each_entry(wq, &fence->wait.task_list, task_list) {
> +		if (wq->func != i915_sw_fence_wake)
> +			continue;
> +
> +		__i915_sw_fence_clear_checked_bit(wq->private);
> +	}
> +}
> +
> +static bool i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
> +				  const struct i915_sw_fence * const signaler)
> +{
> +	unsigned long flags;
> +	bool err;
> +
> +	if (!IS_ENABLED(CONFIG_I915_SW_FENCE_CHECK_DAG))
> +		return false;
> +
> +	spin_lock_irqsave(&i915_sw_fence_lock, flags);
> +	err = __i915_sw_fence_check_if_after(fence, signaler);
> +	__i915_sw_fence_clear_checked_bit(fence);
> +	spin_unlock_irqrestore(&i915_sw_fence_lock, flags);
> +
> +	return err;
> +}
> +
> +static wait_queue_t *__i915_sw_fence_create_wq(struct i915_sw_fence *fence, gfp_t gfp)
> +{
> +	wait_queue_t *wq;
> +
> +	wq = kmalloc(sizeof(*wq), gfp);
> +	if (unlikely(!wq))
> +		return NULL;
> +
> +	INIT_LIST_HEAD(&wq->task_list);
> +	wq->flags = 0;
> +	wq->func = i915_sw_fence_wake;
> +	wq->private = i915_sw_fence_get(fence);
> +
> +	i915_sw_fence_await(fence);
> +
> +	return wq;
> +}
> +
> +int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
> +				 struct i915_sw_fence *signaler,
> +				 gfp_t gfp)
> +{
> +	wait_queue_t *wq;
> +	unsigned long flags;
> +	int pending;
> +
> +	if (i915_sw_fence_done(signaler))
> +		return 0;
> +
> +	/* The dependency graph must be acyclic. */
> +	if (unlikely(i915_sw_fence_check_if_after(fence, signaler)))
> +		return -EINVAL;
> +
> +	wq = __i915_sw_fence_create_wq(fence, gfp);
> +	if (unlikely(!wq)) {
> +		if (!gfpflags_allow_blocking(gfp))
> +			return -ENOMEM;
> +
> +		i915_sw_fence_wait(signaler);
> +		return 0;
> +	}
> +
> +	spin_lock_irqsave(&signaler->wait.lock, flags);
> +	if (likely(!i915_sw_fence_done(signaler))) {
> +		__add_wait_queue_tail(&signaler->wait, wq);
> +		pending = 1;
> +	} else {
> +		i915_sw_fence_wake(wq, 0, 0, NULL);
> +		pending = 0;
> +	}
> +	spin_unlock_irqrestore(&signaler->wait.lock, flags);
> +
> +	return pending;
> +}
> +
> +struct dma_fence_cb {
> +	struct fence_cb base;
> +	struct i915_sw_fence *fence;
> +};
> +
> +static void dma_i915_sw_fence_wake(struct fence *dma, struct fence_cb *data)
> +{
> +	struct dma_fence_cb *cb = container_of(data, typeof(*cb), base);
> +
> +	i915_sw_fence_commit(cb->fence);
> +	kfree(cb);
> +}
> +
> +int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
> +				  struct fence *dma, gfp_t gfp)
> +{
> +	struct dma_fence_cb *cb;
> +	int ret;
> +
> +	if (fence_is_signaled(dma))
> +		return 0;
> +
> +	cb = kmalloc(sizeof(*cb), gfp);
> +	if (!cb) {
> +		if (!gfpflags_allow_blocking(gfp))
> +			return -ENOMEM;
> +
> +		return fence_wait(dma, false);
> +	}
> +
> +	cb->fence = i915_sw_fence_get(fence);
> +	i915_sw_fence_await(fence);
> +
> +	ret = fence_add_callback(dma, &cb->base, dma_i915_sw_fence_wake);
> +	if (ret == 0) {
> +		ret = 1;
> +	} else {
> +		dma_i915_sw_fence_wake(dma, &cb->base);
> +		if (ret == -ENOENT) /* fence already signaled */
> +			ret = 0;
> +	}
> +
> +	return ret;
> +}
> +
> +int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
> +				    struct reservation_object *resv,
> +				    const struct fence_ops *exclude,
> +				    bool write,
> +				    gfp_t gfp)
> +{
> +	struct fence *excl, **shared;
> +	unsigned int count, i;
> +	int ret;
> +
> +	ret = reservation_object_get_fences_rcu(resv, &excl, &count, &shared);
> +	if (ret)
> +		return ret;
> +
> +	if (write) {
> +		for (i = 0; i < count; i++) {
> +			if (shared[i]->ops == exclude)
> +				continue;
> +
> +			ret |= i915_sw_fence_await_dma_fence(fence, shared[i], gfp);
> +			if (ret < 0)
> +				goto out;
> +		}
> +	}
> +
> +	if (excl && excl->ops != exclude)
> +		ret |= i915_sw_fence_await_dma_fence(fence, excl, gfp);
> +
> +out:
> +	fence_put(excl);
> +	for (i = 0; i < count; i++)
> +		fence_put(shared[i]);
> +	kfree(shared);
> +
> +	return ret;
> +}
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> new file mode 100644
> index 000000000000..ac99c7965436
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -0,0 +1,56 @@
> +/*
> + * i915_sw_fence.h - library routines for N:M synchronisation points
> + *
> + * Copyright (C) 2016 Intel Corporation
> + *
> + * This file is released under the GPLv2.
> + *
> + */
> +
> +#ifndef _I915_SW_FENCE_H_
> +#define _I915_SW_FENCE_H_
> +
> +#include <linux/gfp.h>
> +#include <linux/kref.h>
> +#include <linux/notifier.h> /* for NOTIFY_DONE */
> +#include <linux/wait.h>
> +
> +struct completion;
> +struct fence;
> +struct fence_ops;
> +struct reservation_object;
> +
> +struct i915_sw_fence {
> +	wait_queue_head_t wait;
> +	unsigned long flags;
> +	struct kref kref;
> +	atomic_t pending;
> +};
> +
> +#define I915_SW_FENCE_CHECKED_BIT	0 /* used internally for DAG checking */
> +#define I915_SW_FENCE_PRIVATE_BIT	1 /* available for use by owner */
> +#define I915_SW_FENCE_MASK		(~3)
> +
> +typedef int (*i915_sw_fence_notify_t)(struct i915_sw_fence *);
> +#define __i915_sw_fence_call __aligned(4)
> +
> +void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
> +void i915_sw_fence_commit(struct i915_sw_fence *fence);
> +
> +int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
> +				 struct i915_sw_fence *after,
> +				 gfp_t gfp);
> +int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
> +				  struct fence *dma, gfp_t gfp);
> +int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
> +			     struct reservation_object *resv,
> +			     const struct fence_ops *exclude,
> +			     bool write,
> +			     gfp_t gfp);
> +
> +static inline bool i915_sw_fence_done(const struct i915_sw_fence *fence)
> +{
> +	return atomic_read(&fence->pending) < 0;
> +}
> +
> +#endif /* _I915_SW_FENCE_H_ */

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences
  2016-08-25 10:42   ` John Harrison
@ 2016-08-25 10:51     ` Chris Wilson
  0 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25 10:51 UTC (permalink / raw)
  To: John Harrison; +Cc: intel-gfx

On Thu, Aug 25, 2016 at 11:42:41AM +0100, John Harrison wrote:
> I'm not convinced the benefit is worth the
> confusion and maintenance risk. E.g. is unsigned long guaranteed to
> always and ever be sufficient for a pointer?

Yes, unsigned long will always be sufficient for a kernel pointer. I'm
sure there is a classic rant from Linus about this somewhere...
Quite a few core structs will pass around unsigned long instead of
pointers, which can be quite annoying when trying to use them!
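
If it would help allay that worry, a compile-time assert is a one-liner
(hypothetical, not in the patch):

	BUILD_BUG_ON(sizeof(unsigned long) < sizeof(void *));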
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences
  2016-08-25  9:08 ` [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences Chris Wilson
  2016-08-25 10:42   ` John Harrison
@ 2016-08-25 11:49   ` Joonas Lahtinen
  2016-08-25 12:25     ` Chris Wilson
  2016-08-25 16:52     ` Chris Wilson
  1 sibling, 2 replies; 49+ messages in thread
From: Joonas Lahtinen @ 2016-08-25 11:49 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On to, 2016-08-25 at 10:08 +0100, Chris Wilson wrote:
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -0,0 +1,329 @@
> +/*
> + * (C) Copyright 2016 Intel Corporation
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; version 2
> + * of the License.
> + */

Copyright notice is rather brief, copy from i915_gem_execbuffer.c

> +static int __i915_sw_fence_notify(struct i915_sw_fence *fence)
> +{
> +	i915_sw_fence_notify_t fn;
> +
> +	fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK);

I'd define an accessor function in scatterlist.h fashion.
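
Something like (a rough sketch along scatterlist.h lines, reusing the
existing mask):

	static inline i915_sw_fence_notify_t
	__i915_sw_fence_fn(const struct i915_sw_fence *fence)
	{
		return (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK);
	}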

Although I'm not convinced of packing the flags with the function
pointer. How many fences do we expect to be allocated simultaneously in
a real usage scenario?

> +static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence,
> +				 struct list_head *continuation)
> +{
> +	wait_queue_head_t *x = &fence->wait;
> +	unsigned long flags;
> +
> +	atomic_dec(&fence->pending);
> +
> +	/*
> +	 * To prevent unbounded recursion as we traverse the graph of i915_sw_fences,

Long line due to s/kfence/.../

> +	 * we move the task_list from this the next ready fence to the tail of
> +	 * the original fence's task_list (and so added to the list to be
> +	 * woken).
> +	 */
> +	smp_mb__before_spinlock();
> +	if (!list_empty_careful(&x->task_list)) {

if (list_empty_careful(&x->task_list))
	return;

> +static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
> +				    const struct i915_sw_fence * const signaler)
> +{
> +	wait_queue_t *wq;
> +
> +	if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
> +		return false;
> +
> +	if (fence == signaler)
> +		return true;

Not sure if I would expect _if_after to return true on an equal
object...

> +int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
> +				    struct reservation_object *resv,
> +				    const struct fence_ops *exclude,
> +				    bool write,
> +				    gfp_t gfp)
> +{
> +	struct fence *excl, **shared;
> +	unsigned int count, i;
> +	int ret;
> +
> +	ret = reservation_object_get_fences_rcu(resv, &excl, &count, &shared);
> +	if (ret)
> +		return ret;
> +
> +	if (write) {
> +		for (i = 0; i < count; i++) {
> +			if (shared[i]->ops == exclude)
> +				continue;
> +
> +			ret |= i915_sw_fence_await_dma_fence(fence, shared[i], gfp);

Do we really want a bitwise OR here?

> +			if (ret < 0)
> +				goto out;
> +		}
> +	}
> +
> +	if (excl && excl->ops != exclude)
> +		ret |= i915_sw_fence_await_dma_fence(fence, excl, gfp);

Ditto.

> +
> +out:
> +	fence_put(excl);
> +	for (i = 0; i < count; i++)
> +		fence_put(shared[i]);
> +	kfree(shared);
> +
> +	return ret;
> +}

<SNIP>

> +struct i915_sw_fence {
> +	wait_queue_head_t wait;
> +	unsigned long flags;

The name is rather deceiving for a field that contains a callback function.

> +#define __i915_sw_fence_call __aligned(4)

Not packing would get rid of this too...

> +
> +void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
> +void i915_sw_fence_commit(struct i915_sw_fence *fence);

Not _signal?

> +static inline bool i915_sw_fence_done(const struct i915_sw_fence *fence)

_is_done() or _is_completed()

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 08/13] drm/i915: Drive request submission through fence callbacks
  2016-08-25  9:08 ` [PATCH 08/13] drm/i915: Drive request submission through fence callbacks Chris Wilson
@ 2016-08-25 12:08   ` Joonas Lahtinen
  2016-08-25 12:35     ` Chris Wilson
  2016-08-26 12:47   ` John Harrison
  1 sibling, 1 reply; 49+ messages in thread
From: Joonas Lahtinen @ 2016-08-25 12:08 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On to, 2016-08-25 at 10:08 +0100, Chris Wilson wrote:
> @@ -462,7 +462,10 @@ static int intel_breadcrumbs_signaler(void *arg)
>  			 */
>  			intel_engine_remove_wait(engine,
>  						 &request->signaling.wait);
> +
> +			local_bh_disable();
>  			fence_signal(&request->fence);
> +			local_bh_enable(); /* kick start the tasklets */

Why disable bottom half processing around here too?

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences
  2016-08-25 11:49   ` Joonas Lahtinen
@ 2016-08-25 12:25     ` Chris Wilson
  2016-08-25 16:52     ` Chris Wilson
  1 sibling, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25 12:25 UTC (permalink / raw)
  To: Joonas Lahtinen; +Cc: intel-gfx

On Thu, Aug 25, 2016 at 02:49:59PM +0300, Joonas Lahtinen wrote:
> On to, 2016-08-25 at 10:08 +0100, Chris Wilson wrote:
> > +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> > @@ -0,0 +1,329 @@
> > +/*
> > + * (C) Copyright 2016 Intel Corporation
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; version 2
> > + * of the License.
> > + */
> 
> Copyright notice is rather brief, copy from i915_gem_execbuffer.c
> 
> > +static int __i915_sw_fence_notify(struct i915_sw_fence *fence)
> > +{
> > +	i915_sw_fence_notify_t fn;
> > +
> > +	fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK);
> 
> I'd define an accessor functions in scatterlist.h fashion.

Hmm, this was meant to be the accessor function - nowhere else is
supposed to pull the fn out of the mask. But that can be split out
again.
 
> Although I'm not convinced of packing the flags with the function
> pointer. How many fences do we expect to be allocated simultaneously on
> a real usage scenario?

Several thousand for async init. Tens of thousands per second for i915,
though we only expect a few hundred in flight at any time. Ditto for
kms. Ditto for async work. And places I haven't looked at yet.

That's more than there are mutex_init!

> > +static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
> > +				    const struct i915_sw_fence * const signaler)
> > +{
> > +	wait_queue_t *wq;
> > +
> > +	if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
> > +		return false;
> > +
> > +	if (fence == signaler)
> > +		return true;
> 
> Not sure if I would expect _if_after to return true on an equal
> object...

True being the error case.

kfence_err_if_after?
kfence_abort_if_after?

> > +int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
> > +				    struct reservation_object *resv,
> > +				    const struct fence_ops *exclude,
> > +				    bool write,
> > +				    gfp_t gfp)
> > +{
> > +	struct fence *excl, **shared;
> > +	unsigned int count, i;
> > +	int ret;
> > +
> > +	ret = reservation_object_get_fences_rcu(resv, &excl, &count, &shared);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (write) {
> > +		for (i = 0; i < count; i++) {
> > +			if (shared[i]->ops == exclude)
> > +				continue;
> > +
> > +			ret |= i915_sw_fence_await_dma_fence(fence, shared[i], gfp);
> 
> Do we really want to bitwise here?

Correctly it should be

int pending = await_dma_fence();
if (pending < 0) {
	ret = pending;
	goto out;
}
ret |= pending;

I've been using the ternary result as a shortcut (if the fence is
passed, we can dispose of it for example). The code becomes much simpler
if we didn't pass that information back - but I think it is worth it.

> > +void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
> > +void i915_sw_fence_commit(struct i915_sw_fence *fence);
> 
> Not _signal?

signal is fence_complete() (corresponding to complete(struct
completion))

commit is a convenience function for complete + kref_put. I do like it
as it gives a clear finish to a
	kfence_init(); .. setup ...; kfence_commit()
sequence. But the name is only so-so.

> > +static inline bool i915_sw_fence_done(const struct i915_sw_fence *fence)
> 
> _is_done() or _is_completed()

As above, the instructions were to use _done, so that it matches struct
completion as closely as possible. They have already chosen their
interface, we have to try and fit in as closely as possible.

I do agree, I much prefer having _is_ in my boolean functions - but
matching struct completion is a must for an upstream proposal (imo).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 08/13] drm/i915: Drive request submission through fence callbacks
  2016-08-25 12:08   ` Joonas Lahtinen
@ 2016-08-25 12:35     ` Chris Wilson
  0 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25 12:35 UTC (permalink / raw)
  To: Joonas Lahtinen; +Cc: intel-gfx

On Thu, Aug 25, 2016 at 03:08:58PM +0300, Joonas Lahtinen wrote:
> On to, 2016-08-25 at 10:08 +0100, Chris Wilson wrote:
> > @@ -462,7 +462,10 @@ static int intel_breadcrumbs_signaler(void *arg)
> >  			 */
> >  			intel_engine_remove_wait(engine,
> >  						 &request->signaling.wait);
> > +
> > +			local_bh_disable();
> >  			fence_signal(&request->fence);
> > +			local_bh_enable(); /* kick start the tasklets */
> 
> Why disable bottom half processing around here too?

It is to reduce the latency of running the execlists tasklet after
submitting the request. fence_signal uses an irq spinlock and so
doesn't fire off softirqs - but if we are waking up this engine after an
inter-engine/cross-driver wait, that delay can cause a throughput drop
of about 30% due to the increased latency.
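
Schematically (assuming a signal callback has done tasklet_hi_schedule()):

	local_bh_disable();
	fence_signal(&request->fence); /* callbacks may raise HI_SOFTIRQ */
	local_bh_enable(); /* pending tasklets run here, on this cpu */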
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 05/13] drm/i915: Reorder submitting the requests to ELSP
  2016-08-25  9:08 ` [PATCH 05/13] drm/i915: Reorder submitting the requests to ELSP Chris Wilson
@ 2016-08-25 13:02   ` Mika Kuoppala
  0 siblings, 0 replies; 49+ messages in thread
From: Mika Kuoppala @ 2016-08-25 13:02 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Just rearrange the code to reduce churn in the next patch.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 76 ++++++++++++++++++++--------------------
>  1 file changed, 38 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 15873e69e7fe..d0a1f14a4f14 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -328,32 +328,18 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
>  	return ctx->engine[engine->id].lrc_desc;
>  }
>  
> -static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> -				 struct drm_i915_gem_request *rq1)
> +static inline void
> +execlists_context_status_change(struct drm_i915_gem_request *rq,
> +				unsigned long status)
>  {
> -	struct intel_engine_cs *engine = rq0->engine;
> -	struct drm_i915_private *dev_priv = rq0->i915;
> -	u32 __iomem *elsp =
> -		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> -	u64 desc[2];
> -
> -	if (rq1) {
> -		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
> -		rq1->elsp_submitted++;
> -	} else {
> -		desc[1] = 0;
> -	}
> -
> -	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
> -	rq0->elsp_submitted++;
> -
> -	/* You must always write both descriptors in the order below. */
> -	writel(upper_32_bits(desc[1]), elsp);
> -	writel(lower_32_bits(desc[1]), elsp);
> +	/*
> +	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
> +	 * The compiler should eliminate this function as dead-code.
> +	 */
> +	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
> +		return;
>  
> -	writel(upper_32_bits(desc[0]), elsp);
> -	/* The context is automatically loaded after the following */
> -	writel(lower_32_bits(desc[0]), elsp);
> +	atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
>  }
>  
>  static void
> @@ -382,6 +368,34 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
>  		execlists_update_context_pdps(ppgtt, reg_state);
>  }
>  
> +static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> +				 struct drm_i915_gem_request *rq1)
> +{
> +	struct intel_engine_cs *engine = rq0->engine;
> +	struct drm_i915_private *dev_priv = rq0->i915;
> +	u32 __iomem *elsp =
> +		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> +	u64 desc[2];
> +
> +	if (rq1) {
> +		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
> +		rq1->elsp_submitted++;
> +	} else {
> +		desc[1] = 0;
> +	}
> +
> +	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
> +	rq0->elsp_submitted++;
> +
> +	/* You must always write both descriptors in the order below. */
> +	writel(upper_32_bits(desc[1]), elsp);
> +	writel(lower_32_bits(desc[1]), elsp);
> +
> +	writel(upper_32_bits(desc[0]), elsp);
> +	/* The context is automatically loaded after the following */
> +	writel(lower_32_bits(desc[0]), elsp);
> +}
> +
>  static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
>  					   struct drm_i915_gem_request *rq1)
>  {
> @@ -402,20 +416,6 @@ static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
>  	spin_unlock_irq(&dev_priv->uncore.lock);
>  }
>  
> -static inline void execlists_context_status_change(
> -		struct drm_i915_gem_request *rq,
> -		unsigned long status)
> -{
> -	/*
> -	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
> -	 * The compiler should eliminate this function as dead-code.
> -	 */
> -	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
> -		return;
> -
> -	atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
> -}
> -
>  static void execlists_unqueue(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
> -- 
> 2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences
  2016-08-25 11:49   ` Joonas Lahtinen
  2016-08-25 12:25     ` Chris Wilson
@ 2016-08-25 16:52     ` Chris Wilson
  1 sibling, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-25 16:52 UTC (permalink / raw)
  To: Joonas Lahtinen; +Cc: intel-gfx

On Thu, Aug 25, 2016 at 02:49:59PM +0300, Joonas Lahtinen wrote:
> > +	 * we move the task_list from this, the next ready fence, to the tail of
> > +	 * the original fence's task_list (and so added to the list to be
> > +	 * woken).
> > +	 */
> > +	smp_mb__before_spinlock();
> > +	if (!list_empty_careful(&x->task_list)) {
> 
> if (list_empty_careful()
> 	return;

It's just broken. I added it recently after reading

void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
        unsigned long flags;

        __set_current_state(TASK_RUNNING);
        /*
         * We can check for list emptiness outside the lock
         * IFF:
         *  - we use the "careful" check that verifies both
         *    the next and prev pointers, so that there cannot
         *    be any half-pending updates in progress on other
         *    CPU's that we haven't seen yet (and that might
         *    still change the stack area.
         * and
         *  - all other users take the lock (ie we can only
         *    have _one_ other CPU that looks at or modifies
         *    the list).
         */
        if (!list_empty_careful(&wait->task_list)) {
                spin_lock_irqsave(&q->lock, flags);
                list_del_init(&wait->task_list);
                spin_unlock_irqrestore(&q->lock, flags);
        }
}

and convinced myself that it was also safe to apply here.
Turns out that spinlock is very hard to avoid.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 02/13] drm/i915: Only queue requests during execlists submission
  2016-08-25  9:08 ` [PATCH 02/13] drm/i915: Only queue requests during execlists submission Chris Wilson
@ 2016-08-26  9:41   ` Mika Kuoppala
  2016-08-26  9:51     ` Chris Wilson
  2016-08-26  9:54     ` Chris Wilson
  0 siblings, 2 replies; 49+ messages in thread
From: Mika Kuoppala @ 2016-08-26  9:41 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Leave the more complicated request dequeueing to the tasklet and instead
> just kick start the tasklet if we detect we are adding the first
> request.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 28 ++++------------------------
>  1 file changed, 4 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 6b49df4316f4..ca52b8e63305 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -609,35 +609,15 @@ static void intel_lrc_irq_handler(unsigned long data)
>  static void execlists_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
> -	struct drm_i915_gem_request *cursor;
> -	int num_elements = 0;
>  
>  	spin_lock_bh(&engine->execlist_lock);
>  
> -	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
> -		if (++num_elements > 2)
> -			break;
> -
> -	if (num_elements > 2) {
> -		struct drm_i915_gem_request *tail_req;
> -
> -		tail_req = list_last_entry(&engine->execlist_queue,
> -					   struct drm_i915_gem_request,
> -					   execlist_link);
> -
> -		if (request->ctx == tail_req->ctx) {
> -			WARN(tail_req->elsp_submitted != 0,
> -				"More than 2 already-submitted reqs queued\n");
> -			list_del(&tail_req->execlist_link);
> -			i915_gem_request_put(tail_req);
> -		}
> -	}
> -
>  	i915_gem_request_get(request);
> -	list_add_tail(&request->execlist_link, &engine->execlist_queue);
>  	request->ctx_hw_id = request->ctx->hw_id;
> -	if (num_elements == 0)
> -		execlists_unqueue(engine);
> +
> +	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> +	if (request->execlist_link.prev == &engine->execlist_queue)

Why not list_first_entry()?

-Mika


> +		tasklet_hi_schedule(&engine->irq_tasklet);
>  
>  	spin_unlock_bh(&engine->execlist_lock);
>  }
> -- 
> 2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 02/13] drm/i915: Only queue requests during execlists submission
  2016-08-26  9:41   ` Mika Kuoppala
@ 2016-08-26  9:51     ` Chris Wilson
  2016-08-26  9:54     ` Chris Wilson
  1 sibling, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-26  9:51 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Fri, Aug 26, 2016 at 12:41:16PM +0300, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Leave the more complicated request dequeueing to the tasklet and instead
> > just kick start the tasklet if we detect we are adding the first
> > request.
> >
> > [snip]
> > +	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> > +	if (request->execlist_link.prev == &engine->execlist_queue)
> 
> Why not list_first_entry()?

if (list_first_entry(&engine->execlist_queue) == request)

Could do. It was only a placeholder from patch splitting! :)

(I'm just more used to comparing prev/next from list iteration ;)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 02/13] drm/i915: Only queue requests during execlists submission
  2016-08-26  9:41   ` Mika Kuoppala
  2016-08-26  9:51     ` Chris Wilson
@ 2016-08-26  9:54     ` Chris Wilson
  2016-08-26 10:43       ` Mika Kuoppala
  1 sibling, 1 reply; 49+ messages in thread
From: Chris Wilson @ 2016-08-26  9:54 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Fri, Aug 26, 2016 at 12:41:16PM +0300, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > [snip]
> > +	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> > +	if (request->execlist_link.prev == &engine->execlist_queue)
> 
> Why not list_first_entry()?

Ah, because it's not as magic as I thought :)

if (list_first_entry(&engine->execlist_queue,
		     struct drm_i915_gem_request,
		     execlist_link) == request)

I think that is reason enough.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 02/13] drm/i915: Only queue requests during execlists submission
  2016-08-26  9:54     ` Chris Wilson
@ 2016-08-26 10:43       ` Mika Kuoppala
  2016-08-26 10:53         ` Chris Wilson
  0 siblings, 1 reply; 49+ messages in thread
From: Mika Kuoppala @ 2016-08-26 10:43 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Fri, Aug 26, 2016 at 12:41:16PM +0300, Mika Kuoppala wrote:
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > [snip]
>> > +	list_add_tail(&request->execlist_link, &engine->execlist_queue);
>> > +	if (request->execlist_link.prev == &engine->execlist_queue)
>> 
>> Why not list_first_entry()?
>
> Ah, because it's not as magic as I thought :)
>
> if (list_first_entry(&engine->execlist_queue,
> 		     struct drm_i915_gem_request,
> 		     execlist_link) == request)
>
> I think is reason enough.

Agreed. I looked at

if (list_is_singular(&engine->execlist_queue))

but that would add a needless empty check.

If that is an abomination, just add a comment that we need to resubmit
if the list was empty.

Without digging into the list internals, the next reader might also be
happier with a head->next == head->prev check.

-Mika

> -Chris
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 02/13] drm/i915: Only queue requests during execlists submission
  2016-08-26 10:43       ` Mika Kuoppala
@ 2016-08-26 10:53         ` Chris Wilson
  0 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-26 10:53 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Fri, Aug 26, 2016 at 01:43:42PM +0300, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > On Fri, Aug 26, 2016 at 12:41:16PM +0300, Mika Kuoppala wrote:
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> 
> >> > [snip]
> >> > +	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> >> > +	if (request->execlist_link.prev == &engine->execlist_queue)
> >> 
> >> Why not list_first_entry()?
> >
> > Ah, because it's not as magic as I thought :)
> >
> > if (list_first_entry(&engine->execlist_queue,
> > 		     struct drm_i915_gem_request,
> > 		     execlist_link) == request)
> >
> > I think is reason enough.
> 
> Agreed. Looked at
> 
> if (list_is_singular(&engine->execlist_queue))
> 
> but that it will add needless empty check.
> 
> If that is abomination, just add comment that we need to resubmit
> if the list was empty.
> 
> Without digging the list internals, the next reader also
> could be more happy with
> 
> head->next == head->prev check.

As a temporary measure, we can use list_empty() before the add, since we
are holding the spinlock.
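
Untested sketch of what I mean (names as in the patch above):

	static void execlists_submit_request(struct drm_i915_gem_request *request)
	{
		struct intel_engine_cs *engine = request->engine;
		bool first;

		spin_lock_bh(&engine->execlist_lock);

		/* Empty before the add means this request is the first one */
		first = list_empty(&engine->execlist_queue);

		i915_gem_request_get(request);
		request->ctx_hw_id = request->ctx->hw_id;

		list_add_tail(&request->execlist_link, &engine->execlist_queue);
		if (first)
			tasklet_hi_schedule(&engine->irq_tasklet);

		spin_unlock_bh(&engine->execlist_lock);
	}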
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 08/13] drm/i915: Drive request submission through fence callbacks
  2016-08-25  9:08 ` [PATCH 08/13] drm/i915: Drive request submission through fence callbacks Chris Wilson
  2016-08-25 12:08   ` Joonas Lahtinen
@ 2016-08-26 12:47   ` John Harrison
  2016-08-26 16:20     ` Chris Wilson
  1 sibling, 1 reply; 49+ messages in thread
From: John Harrison @ 2016-08-26 12:47 UTC (permalink / raw)
  To: intel-gfx

On 25/08/2016 10:08, Chris Wilson wrote:
> Drive final request submission from a callback from the fence. This way
> the request is queued until all dependencies are resolved, at which
> point it is handed to the backend for queueing to hardware. At this
> point, no dependencies are set on the request, so the callback is
> immediate.
>
> A side-effect of imposing a heavier, irqsafe spinlock for execlist
> submission is that we lose the softirq enabling after scheduling the
> execlists tasklet. To compensate, we manually kickstart the softirq by
> disabling and enabling the bh around the fence signaling.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_request.c  | 20 +++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_gem_request.h  |  3 +++
>   drivers/gpu/drm/i915/intel_breadcrumbs.c |  3 +++
>   drivers/gpu/drm/i915/intel_lrc.c         |  5 +++--
>   4 files changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index fa5e36de55d0..db45482ea194 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -314,6 +314,19 @@ static int i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno)
>   	return 0;
>   }
>   
> +static int __i915_sw_fence_call submit_notify(struct i915_sw_fence *fence)
> +{
> +	struct drm_i915_gem_request *request =
> +		container_of(fence, typeof(*request), submit);
> +
> +	if (i915_sw_fence_done(fence))
> +		i915_gem_request_put(request);
> +	else
> +		request->engine->submit_request(request);
> +
> +	return NOTIFY_DONE;
> +}
> +
>   /**
>    * i915_gem_request_alloc - allocate a request structure
>    *
> @@ -412,6 +425,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>   	 */
>   	req->reserved_space = MIN_SPACE_FOR_ADD_REQUEST;
>   
> +	i915_sw_fence_init(&i915_gem_request_get(req)->submit, submit_notify);
> +
>   	if (i915.enable_execlists)
>   		ret = intel_logical_ring_alloc_request_extras(req);
>   	else
> @@ -527,7 +542,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>   		  reserved_tail, ret);
>   
>   	i915_gem_mark_busy(engine);
> -	engine->submit_request(request);
> +
> +	local_bh_disable();
> +	i915_sw_fence_commit(&request->submit);
> +	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
>   }
>   
>   static unsigned long local_clock_us(unsigned int *cpu)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index a231bd318ef0..a85723463978 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -28,6 +28,7 @@
>   #include <linux/fence.h>
>   
>   #include "i915_gem.h"
> +#include "i915_sw_fence.h"
>   
>   struct intel_wait {
>   	struct rb_node node;
> @@ -82,6 +83,8 @@ struct drm_i915_gem_request {
>   	struct intel_ring *ring;
>   	struct intel_signal_node signaling;
>   
> +	struct i915_sw_fence submit;
Needs a documentation comment?

> +
>   	/** GEM sequence number associated with the previous request,
>   	 * when the HWS breadcrumb is equal to this the GPU is processing
>   	 * this request.
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 2491e4c1eaf0..9bad14d22c95 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -462,7 +462,10 @@ static int intel_breadcrumbs_signaler(void *arg)
>   			 */
>   			intel_engine_remove_wait(engine,
>   						 &request->signaling.wait);
> +
> +			local_bh_disable();
>   			fence_signal(&request->fence);
> +			local_bh_enable(); /* kick start the tasklets */
>   
>   			/* Find the next oldest signal. Note that as we have
>   			 * not been holding the lock, another client may
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 5e60519ede8d..babeaa8b1273 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -538,14 +538,15 @@ static void intel_lrc_irq_handler(unsigned long data)
>   static void execlists_submit_request(struct drm_i915_gem_request *request)
Need to document that submit_request is now called from a callback and 
potentially from a tasklet or even an IRQ?

>   {
>   	struct intel_engine_cs *engine = request->engine;
> +	unsigned long flags;
>   
> -	spin_lock_bh(&engine->execlist_lock);
> +	spin_lock_irqsave(&engine->execlist_lock, flags);
>   
>   	list_add_tail(&request->execlist_link, &engine->execlist_queue);
>   	if (execlists_elsp_idle(engine))
>   		tasklet_hi_schedule(&engine->irq_tasklet);
>   
> -	spin_unlock_bh(&engine->execlist_lock);
> +	spin_unlock_irqrestore(&engine->execlist_lock, flags);
>   }
>   
>   int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)


* Re: [PATCH 09/13] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer
  2016-08-25  9:08 ` [PATCH 09/13] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer Chris Wilson
@ 2016-08-26 13:29   ` John Harrison
  0 siblings, 0 replies; 49+ messages in thread
From: John Harrison @ 2016-08-26 13:29 UTC (permalink / raw)
  To: intel-gfx


On 25/08/2016 10:08, Chris Wilson wrote:
> We are about to specialize object synchronisation to enable nonblocking
> execbuf submission. First we make a copy of the current object
> synchronisation for execbuffer. The general i915_gem_object_sync() will
> be removed following the removal of CS flips in the near future.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 64 +++++++++++++++++++++++++++++-
>   1 file changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 601156c353cc..b7cc158733ad 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1122,6 +1122,68 @@ static unsigned int eb_other_engines(struct drm_i915_gem_request *req)
>   }
>   
>   static int
> +__eb_sync(struct drm_i915_gem_request *to,
> +	  struct drm_i915_gem_request *from)
> +{
> +	int idx, ret;
> +
> +	if (to->engine == from->engine)
> +		return 0;
> +
> +	idx = intel_engine_sync_index(from->engine, to->engine);
> +	if (from->fence.seqno <= from->engine->semaphore.sync_seqno[idx])
> +		return 0;
> +
> +	trace_i915_gem_ring_sync_to(to, from);
> +	if (!i915.semaphores) {
> +		ret = i915_wait_request(from, true, NULL, NO_WAITBOOST);
> +		if (ret)
> +			return ret;
> +	} else {
> +		ret = to->engine->semaphore.sync_to(to, from);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	from->engine->semaphore.sync_seqno[idx] = from->fence.seqno;
> +	return 0;
> +}
> +
> +static int
> +eb_sync(struct drm_i915_gem_object *obj,
> +	struct drm_i915_gem_request *to,
> +	bool write)
> +{
> +	struct i915_gem_active *active;
> +	unsigned long active_mask;
> +	int idx;
> +
> +	if (write) {
> +		active_mask = i915_gem_object_get_active(obj);
> +		active = obj->last_read;
> +	} else {
> +		active_mask = 1;
> +		active = &obj->last_write;
> +	}
> +
> +	for_each_active(active_mask, idx) {
> +		struct drm_i915_gem_request *request;
> +		int ret;
> +
> +		request = i915_gem_active_peek(&active[idx],
> +					       &obj->base.dev->struct_mutex);
> +		if (!request)
> +			continue;
> +
> +		ret = __eb_sync(to, request);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
>   i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>   				struct list_head *vmas)
>   {
> @@ -1133,7 +1195,7 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>   		struct drm_i915_gem_object *obj = vma->obj;
>   
>   		if (obj->flags & other_rings) {
> -			ret = i915_gem_object_sync(obj, req);
> +			ret = eb_sync(obj, req, obj->base.pending_write_domain);
>   			if (ret)
>   				return ret;
>   		}

Reviewed-by: John Harrison <john.c.harrison@intel.com>

* Re: [PATCH 10/13] drm/i915: Nonblocking request submission
  2016-08-25  9:08 ` [PATCH 10/13] drm/i915: Nonblocking request submission Chris Wilson
@ 2016-08-26 13:39   ` John Harrison
  2016-08-26 16:14     ` Chris Wilson
  0 siblings, 1 reply; 49+ messages in thread
From: John Harrison @ 2016-08-26 13:39 UTC (permalink / raw)
  To: intel-gfx

On 25/08/2016 10:08, Chris Wilson wrote:
> Now that we have fences in place to drive request submission, we can
> employ those to queue requests after their dependencies as opposed to
> stalling in the middle of an execbuf ioctl. (However, we still choose to
> spin before enabling the IRQ as that is faster - though contentious.)
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 10 +++++++---
>   drivers/gpu/drm/i915/i915_gem_request.c    | 14 +++++++++++++-
>   2 files changed, 20 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index b7cc158733ad..104c713ec681 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1136,9 +1136,13 @@ __eb_sync(struct drm_i915_gem_request *to,
>   
>   	trace_i915_gem_ring_sync_to(to, from);
>   	if (!i915.semaphores) {
> -		ret = i915_wait_request(from, true, NULL, NO_WAITBOOST);
> -		if (ret)
> -			return ret;
> +		if (!i915_spin_request(from, TASK_INTERRUPTIBLE, 2)) {
> +			ret = i915_sw_fence_await_dma_fence(&to->submit,
> +							    &from->fence,
Why not use 'i915_sw_fence_await_sw_fence(from->submit)' as below? Or 
conversely, why not use '_await_dma_fence(prev->fence)' below?


> +							    GFP_KERNEL);
> +			if (ret < 0)
> +				return ret;
> +		}
>   	} else {
>   		ret = to->engine->semaphore.sync_to(to, from);
>   		if (ret)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index db45482ea194..d3477dabb534 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -345,7 +345,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>   {
>   	struct drm_i915_private *dev_priv = engine->i915;
>   	unsigned int reset_counter = i915_reset_counter(&dev_priv->gpu_error);
> -	struct drm_i915_gem_request *req;
> +	struct drm_i915_gem_request *req, *prev;
>   	u32 seqno;
>   	int ret;
>   
> @@ -441,6 +441,18 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>   	 */
>   	req->head = req->ring->tail;
>   
> +	prev = i915_gem_active_peek(&engine->last_request,
> +				    &req->i915->drm.struct_mutex);
> +	if (prev) {
> +		ret = i915_sw_fence_await_sw_fence(&req->submit,
> +						   &prev->submit,
> +						   GFP_KERNEL);
Is this only required to guarantee seqno ordering?

If the previous request is not sharing any objects, context, etc., then
there is no fundamental reason why this request should be dependent upon
the last one.


> +		if (ret < 0) {
> +			i915_add_request(req);
> +			return ERR_PTR(ret);
> +		}
> +	}
> +
>   	return req;
>   
>   err_ctx:

* Re: [PATCH 11/13] drm/i915: Serialise execbuf operation after a dma-buf reservation object
  2016-08-25  9:08 ` [PATCH 11/13] drm/i915: Serialise execbuf operation after a dma-buf reservation object Chris Wilson
@ 2016-08-26 13:52   ` John Harrison
  0 siblings, 0 replies; 49+ messages in thread
From: John Harrison @ 2016-08-26 13:52 UTC (permalink / raw)
  To: intel-gfx


On 25/08/2016 10:08, Chris Wilson wrote:
> Now that we can wait upon fences before emitting the request, it becomes
> trivial to wait upon any implicit fence provided by the dma-buf
> reservation object.
>
> Testcase: igt/prime_vgem/fence-wait
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 +++++++++++
>   1 file changed, 11 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 104c713ec681..d9b6fe9f7542 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1197,6 +1197,7 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>   
>   	list_for_each_entry(vma, vmas, exec_list) {
>   		struct drm_i915_gem_object *obj = vma->obj;
> +		struct reservation_object *resv;
>   
>   		if (obj->flags & other_rings) {
>   			ret = eb_sync(obj, req, obj->base.pending_write_domain);
> @@ -1204,6 +1205,16 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>   				return ret;
>   		}
>   
> +		resv = i915_gem_object_get_dmabuf_resv(obj);
> +		if (resv) {
> +			ret = i915_sw_fence_await_reservation
> +				(&req->submit, resv, &i915_fence_ops,
> +				 obj->base.pending_write_domain,
> +				 GFP_KERNEL | __GFP_NOWARN);
> +			if (ret < 0)
> +				return ret;
> +		}
> +
>   		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
>   			i915_gem_clflush_object(obj, false);
>   	}


Reviewed-by: John Harrison <john.c.harrison@intel.com>

* Re: [PATCH 13/13] drm/i915: Support explicit fencing for execbuf
  2016-08-25  9:08 ` [PATCH 13/13] drm/i915: Support explicit fencing for execbuf Chris Wilson
@ 2016-08-26 15:08   ` John Harrison
  2016-08-26 15:29     ` John Harrison
  0 siblings, 1 reply; 49+ messages in thread
From: John Harrison @ 2016-08-26 15:08 UTC (permalink / raw)
  To: intel-gfx

On 25/08/2016 10:08, Chris Wilson wrote:
> Now that the user can opt-out of implicit fencing, we need to give them
> back control over the fencing. We employ sync_file to wrap our
> drm_i915_gem_request and provide an fd that userspace can merge with
> other sync_file fds and pass back to the kernel to wait upon before
> future execution.
>
> Testcase: igt/gem_exec_fence
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/Kconfig               |  1 +
>   drivers/gpu/drm/i915/i915_drv.c            |  5 +-
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 79 ++++++++++++++++++++++++++++--
>   include/uapi/drm/i915_drm.h                |  8 ++-
>   4 files changed, 86 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index 7769e469118f..319ca27ea719 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -18,6 +18,7 @@ config DRM_I915
>   	select INPUT if ACPI
>   	select ACPI_VIDEO if ACPI
>   	select ACPI_BUTTON if ACPI
> +	select SYNC_FILE
>   	help
>   	  Choose this option if you have a system that has "Intel Graphics
>   	  Media Accelerator" or "HD Graphics" integrated graphics,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 3eecfef8bcca..9460bfff360f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -358,6 +358,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
>   	case I915_PARAM_HAS_EXEC_ASYNC:
>   		value = 1;
>   		break;
> +	case I915_PARAM_HAS_EXEC_FENCE:
> +		value = 1;
> +		break;
>   	default:
>   		DRM_DEBUG("Unknown parameter %d\n", param->param);
>   		return -EINVAL;
> @@ -2547,7 +2550,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
>   	DRM_IOCTL_DEF_DRV(I915_HWS_ADDR, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
>   	DRM_IOCTL_DEF_DRV(I915_GEM_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
>   	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
> -	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
>   	DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
>   	DRM_IOCTL_DEF_DRV(I915_GEM_UNPIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
>   	DRM_IOCTL_DEF_DRV(I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index a0f46293e492..8701b8bb46fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -28,6 +28,7 @@
>   
>   #include <linux/dma_remapping.h>
>   #include <linux/reservation.h>
> +#include <linux/sync_file.h>
>   #include <linux/uaccess.h>
>   
>   #include <drm/drmP.h>
> @@ -1686,6 +1687,33 @@ eb_select_engine(struct drm_i915_private *dev_priv,
>   	return engine;
>   }
>   
> +static int eb_add_fence(struct drm_i915_gem_request *req, struct fence *fence)
> +{
> +	struct fence_array *array;
> +	int ret, i;
> +
> +	if (fence_is_i915(fence))
> +		return __eb_sync(req, to_request(fence));
> +
> +	if (!fence_is_array(fence))
> +		return i915_sw_fence_await_dma_fence(&req->submit, fence, GFP_KERNEL);
> +
> +	array = to_fence_array(fence);
> +	for (i = 0; i < array->num_fences; i++) {
> +		struct fence *child = array->fences[i];
> +
> +		if (fence_is_i915(child))
> +			ret = __eb_sync(req, to_request(child));
> +		else
> +			ret = i915_sw_fence_await_dma_fence(&req->submit,
> +							    child, GFP_KERNEL);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
>   static int
>   i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   		       struct drm_file *file,
> @@ -1703,6 +1731,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   	struct i915_execbuffer_params *params = &params_master;
>   	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
>   	u32 dispatch_flags;
> +	struct fence *in_fence = NULL;
> +	struct sync_file *out_fence = NULL;
> +	int out_fence_fd = -1;
>   	int ret;
>   	bool need_relocs;
>   
> @@ -1746,6 +1777,23 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   		dispatch_flags |= I915_DISPATCH_RS;
>   	}
>   
> +	if (args->flags & I915_EXEC_FENCE_IN) {
> +		in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
> +		if (!in_fence) {
> +			ret = -EINVAL;
> +			goto pre_mutex_err;
> +		}
> +	}
> +
> +	if (args->flags & I915_EXEC_FENCE_OUT) {
> +		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
> +		if (out_fence_fd < 0) {
> +			ret = out_fence_fd;
> +			out_fence_fd = -1;
> +			goto pre_mutex_err;
> +		}
> +	}
> +
>   	/* Take a local wakeref for preparing to dispatch the execbuf as
>   	 * we expect to access the hardware fairly frequently in the
>   	 * process. Upon first dispatch, we acquire another prolonged
> @@ -1890,6 +1938,20 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   		goto err_batch_unpin;
>   	}
>   
> +	if (in_fence) {
> +		ret = eb_add_fence(params->request, in_fence);
> +		if (ret < 0)
> +			goto err_request;
> +	}
> +
> +	if (out_fence_fd != -1) {
> +		out_fence = sync_file_create(fence_get(&params->request->fence));
> +		if (!out_fence) {
> +			ret = -ENOMEM;
> +			goto err_request;
> +		}
> +	}
> +
>   	/* Whilst this request exists, batch_obj will be on the
>   	 * active_list, and so will hold the active reference. Only when this
>   	 * request is retired will the batch_obj be moved onto the
> @@ -1917,6 +1979,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   	ret = execbuf_submit(params, args, &eb->vmas);
>   err_request:
>   	__i915_add_request(params->request, ret == 0);
> +	if (out_fence) {
> +		if (ret == 0) {
> +			fd_install(out_fence_fd, out_fence->file);
> +			args->rsvd2 &= ~((u64)0xffffffff << 32);
> +			args->rsvd2 |= (u64)out_fence_fd << 32;
> +			out_fence_fd = -1;
> +		} else
> +			fput(out_fence->file);
> +	}
>   
>   err_batch_unpin:
>   	/*
> @@ -1938,6 +2009,9 @@ pre_mutex_err:
>   	/* intel_gpu_busy should also get a ref, so it will free when the device
>   	 * is really idle. */
>   	intel_runtime_pm_put(dev_priv);
> +	if (out_fence_fd != -1)
> +		put_unused_fd(out_fence_fd);
> +	fence_put(in_fence);
>   	return ret;
>   }
>   
> @@ -2045,11 +2119,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
>   		return -EINVAL;
>   	}
>   
> -	if (args->rsvd2 != 0) {
> -		DRM_DEBUG("dirty rvsd2 field\n");
> -		return -EINVAL;
> -	}
> -
>   	exec2_list = drm_malloc_gfp(args->buffer_count,
>   				    sizeof(*exec2_list),
>   				    GFP_TEMPORARY);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 75f9cab23249..9fe670cc7bf3 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -246,6 +246,7 @@ typedef struct _drm_i915_sarea {
>   #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
>   #define DRM_I915_OVERLAY_ATTRS	0x28
>   #define DRM_I915_GEM_EXECBUFFER2	0x29
> +#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
>   #define DRM_I915_GET_SPRITE_COLORKEY	0x2a
>   #define DRM_I915_SET_SPRITE_COLORKEY	0x2b
>   #define DRM_I915_GEM_WAIT	0x2c
> @@ -279,6 +280,7 @@ typedef struct _drm_i915_sarea {
>   #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
>   #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
>   #define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
> +#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
>   #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
>   #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
>   #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
> @@ -388,6 +390,7 @@ typedef struct drm_i915_irq_wait {
>   #define I915_PARAM_HAS_POOLED_EU	 38
>   #define I915_PARAM_MIN_EU_IN_POOL	 39
>   #define I915_PARAM_HAS_EXEC_ASYNC	 40
> +#define I915_PARAM_HAS_EXEC_FENCE	 41
>   
>   typedef struct drm_i915_getparam {
>   	__s32 param;
> @@ -821,7 +824,10 @@ struct drm_i915_gem_execbuffer2 {
>    */
>   #define I915_EXEC_RESOURCE_STREAMER     (1<<15)
>   
> -#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
> +#define I915_EXEC_FENCE_IN		(1<<16)
> +#define I915_EXEC_FENCE_OUT		(1<<17)
> +
> +#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
>   
>   #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
>   #define i915_execbuffer2_set_context_id(eb2, context) \

I think it is also worth adding something like:
@@ -705,7 +705,7 @@ struct drm_i915_gem_exec_object2 {
         __u64 flags;

         __u64 rsvd1;
-       __u64 rsvd2;
+       __u64 rsvd2;    /* Used for fence fd */
  };

Or even renaming 'rsvd2' to 'fence_io' or some such? Maybe split it into 
two u32 entries to avoid all the messy bitmasking and shifting.
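
For the split, something like this (purely illustrative - the field
names are made up):

	struct drm_i915_gem_execbuffer2 {
		...
		__u64 rsvd1;
		__u32 in_fence_fd;	/* consumed with I915_EXEC_FENCE_IN */
		__u32 out_fence_fd;	/* returned with I915_EXEC_FENCE_OUT */
	};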


* Re: [PATCH 13/13] drm/i915: Support explicit fencing for execbuf
  2016-08-26 15:08   ` John Harrison
@ 2016-08-26 15:29     ` John Harrison
  2016-08-26 15:44       ` Chris Wilson
  0 siblings, 1 reply; 49+ messages in thread
From: John Harrison @ 2016-08-26 15:29 UTC (permalink / raw)
  To: intel-gfx

On 26/08/2016 16:08, John Harrison wrote:
> On 25/08/2016 10:08, Chris Wilson wrote:
>> Now that the user can opt-out of implicit fencing, we need to give them
>> back control over the fencing. We employ sync_file to wrap our
>> drm_i915_gem_request and provide an fd that userspace can merge with
>> other sync_file fds and pass back to the kernel to wait upon before
>> future execution.
It is worth mentioning in the description that this isn't just useful
for pushing the synchronisation down to userspace. It also allows
synchronisation with non-rendering operations. E.g. submit a batch
buffer for delayed execution once some external producer (e.g. a video
capture card or another CPU thread) has finished supplying the source
data. Android also uses it for generating flip chains - submit rendering
that waits on a SurfaceFlinger page flip -> submit a page flip that
waits on the render -> submit a render that waits on the page flip -> ...

Note also that with in-order-only request execution, this allows the
driver to be completely blocked on events that are outside of its control.
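
For reference, a sketch of the fd round trip this uapi implies
(illustrative only; the rsvd2 packing follows the patch quoted above):

	struct drm_i915_gem_execbuffer2 execbuf;
	int in_fence_fd, out_fence_fd;

	/* ... fill in buffers, batch, context as usual ... */
	execbuf.flags |= I915_EXEC_FENCE_IN | I915_EXEC_FENCE_OUT;
	execbuf.rsvd2 = (__u64)in_fence_fd;	/* in-fence in the low 32 bits */

	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &execbuf);

	out_fence_fd = (int)(execbuf.rsvd2 >> 32);	/* out-fence in the high 32 bits */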


>>
>> Testcase: igt/gem_exec_fence
The new IGT is not included as part of this patch set?

>> [snip]

* Re: [PATCH 13/13] drm/i915: Support explicit fencing for execbuf
  2016-08-26 15:29     ` John Harrison
@ 2016-08-26 15:44       ` Chris Wilson
  0 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-26 15:44 UTC (permalink / raw)
  To: John Harrison; +Cc: intel-gfx

On Fri, Aug 26, 2016 at 04:29:57PM +0100, John Harrison wrote:
> On 26/08/2016 16:08, John Harrison wrote:
> >On 25/08/2016 10:08, Chris Wilson wrote:
> >>Now that the user can opt-out of implicit fencing, we need to give them
> >>back control over the fencing. We employ sync_file to wrap our
> >>drm_i915_gem_request and provide an fd that userspace can merge with
> >>other sync_file fds and pass back to the kernel to wait upon before
> >>future execution.
> It is worth mentioning in the description that this isn't just
> useful for pushing the synchronisation down to userspace. It also
> allows synchronisation with non-rendering operations.

But we already cover that in dma-buf. This is only useful for explicit
synchronisation outside of the kernel, since we are trying to get
everybody inside talking dma-buf.

Software controlled fences are a no-go due to resistance from devs not
wanting userspace to be able to arbitrarily hang the kernel. From that
perspective, the only source of a fence is from the kernel. And the only
desire to use explicit fencing is because implicit may be too coarse, or
too accurate depending on your pov ;-)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 10/13] drm/i915: Nonblocking request submission
  2016-08-26 13:39   ` John Harrison
@ 2016-08-26 16:14     ` Chris Wilson
  0 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-26 16:14 UTC (permalink / raw)
  To: John Harrison; +Cc: intel-gfx

On Fri, Aug 26, 2016 at 02:39:01PM +0100, John Harrison wrote:
> On 25/08/2016 10:08, Chris Wilson wrote:
> >Now that we have fences in place to drive request submission, we can
> >employ those to queue requests after their dependencies as opposed to
> >stalling in the middle of an execbuf ioctl. (However, we still choose to
> >spin before enabling the IRQ as that is faster - though contentious.)
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 10 +++++++---
> >  drivers/gpu/drm/i915/i915_gem_request.c    | 14 +++++++++++++-
> >  2 files changed, 20 insertions(+), 4 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> >index b7cc158733ad..104c713ec681 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> >@@ -1136,9 +1136,13 @@ __eb_sync(struct drm_i915_gem_request *to,
> >  	trace_i915_gem_ring_sync_to(to, from);
> >  	if (!i915.semaphores) {
> >-		ret = i915_wait_request(from, true, NULL, NO_WAITBOOST);
> >-		if (ret)
> >-			return ret;
> >+		if (!i915_spin_request(from, TASK_INTERRUPTIBLE, 2)) {
> >+			ret = i915_sw_fence_await_dma_fence(&to->submit,
> >+							    &from->fence,
> Why not use 'i915_sw_fence_await_sw_fence(from->submit)' as below?
> Or conversely, why not use '_await_dma_fence(prev->fence)' below?

On the same engine the requests are in execution order and so once the
first is ready to submit so are its dependents. This extends naturally
to timelines. Between engines, we have to wait until the request is
complete before we can submit the second (note this applies to the
!semaphore branch).
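
In shorthand (illustrative only, using the helpers from this series):

	/* same engine: submission order is execution order, so being
	 * ready to submit is enough (the i915_gem_request_alloc path)
	 */
	i915_sw_fence_await_sw_fence(&req->submit, &prev->submit, GFP_KERNEL);

	/* across engines without semaphores: the earlier request must
	 * complete first (the __eb_sync path)
	 */
	i915_sw_fence_await_dma_fence(&to->submit, &from->fence, GFP_KERNEL);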

> >+							    GFP_KERNEL);
> >+			if (ret < 0)
> >+				return ret;
> >+		}
> >  	} else {
> >  		ret = to->engine->semaphore.sync_to(to, from);
> >  		if (ret)
> >diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> >index db45482ea194..d3477dabb534 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_request.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_request.c
> >@@ -345,7 +345,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> >  {
> >  	struct drm_i915_private *dev_priv = engine->i915;
> >  	unsigned int reset_counter = i915_reset_counter(&dev_priv->gpu_error);
> >-	struct drm_i915_gem_request *req;
> >+	struct drm_i915_gem_request *req, *prev;
> >  	u32 seqno;
> >  	int ret;
> >@@ -441,6 +441,18 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> >  	 */
> >  	req->head = req->ring->tail;
> >+	prev = i915_gem_active_peek(&engine->last_request,
> >+				    &req->i915->drm.struct_mutex);
> >+	if (prev) {
> >+		ret = i915_sw_fence_await_sw_fence(&req->submit,
> >+						   &prev->submit,
> >+						   GFP_KERNEL);
> Is this only required to guarantee seqno ordering?

Correct. We only have a single global timeline at the moment. So
requests are ordered within that timeline.

> If the previous request is not sharing any objects, context, etc.
> then there is no fundamental reason why this request should be
> dependent upon the last one.

Indeed. This becomes timeline->last_request, and then only at the time of
execution (when all dependencies are resolved) do we know its global
ordering - which remains important for keeping fast wakeups, retirements
etc., with preemption being an exception and generally a wart on the side
of everything.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 08/13] drm/i915: Drive request submission through fence callbacks
  2016-08-26 12:47   ` John Harrison
@ 2016-08-26 16:20     ` Chris Wilson
  0 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-08-26 16:20 UTC (permalink / raw)
  To: John Harrison; +Cc: intel-gfx

On Fri, Aug 26, 2016 at 01:47:50PM +0100, John Harrison wrote:
> On 25/08/2016 10:08, Chris Wilson wrote:
> >diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >index 5e60519ede8d..babeaa8b1273 100644
> >--- a/drivers/gpu/drm/i915/intel_lrc.c
> >+++ b/drivers/gpu/drm/i915/intel_lrc.c
> >@@ -538,14 +538,15 @@ static void intel_lrc_irq_handler(unsigned long data)
> >  static void execlists_submit_request(struct drm_i915_gem_request *request)
> Need to document that submit_request is now called from a callback
> and potentially from a tasklet or even an IRQ?

Yes, the signal notify is always in atomic context, because the parent is
holding an irq spinlock. We should avoid limiting when kfence_complete()
is legal (hardirq support should be a requirement), but different users
may know that their fences are never used from such contexts (though
this notify is always atomic due to the spinlock).

The release notify may also be in any context - when mixing with dma
fences it may indeed be from atomic context due to the fence taking a
spinlock around its callbacks.
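
A first stab at wording that comment (only a suggestion):

	/*
	 * submit_request() is invoked from the submit fence's notify
	 * callback and so may run in atomic context: softirq via the
	 * execlists tasklet, or even hardirq whilst a signaling fence
	 * holds its irq spinlock. Implementations must not sleep and
	 * must use irqsafe locking, e.g. spin_lock_irqsave().
	 */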
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH libdrm 15/15] intel: Support passing of explicit fencing from execbuf
  2016-08-25  9:08 ` [PATCH libdrm 15/15] intel: Support passing of explicit fencing from execbuf Chris Wilson
@ 2016-09-30 20:53   ` Rafael Antognolli
  2016-10-01  8:36     ` Chris Wilson
  0 siblings, 1 reply; 49+ messages in thread
From: Rafael Antognolli @ 2016-09-30 20:53 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

Hi Chris,

On Thu, Aug 25, 2016 at 10:08:33AM +0100, Chris Wilson wrote:
> Allow the caller to pass in an fd to an array of fences to control
> serialisation of the execbuf in the kernel and on the GPU, and in return
> allow creation of a fence fd for signaling the completion (and flushing)
> of the batch. When the returned fence is signaled, all writes to the
> buffers inside the batch will be complete and coherent from the cpu, or
> other consumers. The return fence is a sync_file object and can be
> passed to other users (such as atomic modesetting, or other drivers).
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  intel/intel_bufmgr.h     |  6 ++++++
>  intel/intel_bufmgr_gem.c | 38 +++++++++++++++++++++++++++++++++-----
>  2 files changed, 39 insertions(+), 5 deletions(-)
> 
> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> index 259543f..02ec757 100644
> --- a/intel/intel_bufmgr_gem.c
> +++ b/intel/intel_bufmgr_gem.c
> @@ -2412,13 +2419,20 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx,
>  		i915_execbuffer2_set_context_id(execbuf, 0);
>  	else
>  		i915_execbuffer2_set_context_id(execbuf, ctx->ctx_id);
> -	execbuf.rsvd2 = 0;
> +	if (in_fence != -1) {
> +		execbuf.rsvd2 = in_fence;
> +		flags |= I915_EXEC_FENCE_IN;

The flags are being set here, but not really used anywhere.
Maybe you meant something like:

execbuf.flags |= I915_EXEC_FENCE_IN;

?

> +	if (out_fence != NULL) {
> +		*out_fence = -1;
> +		flags |= I915_EXEC_FENCE_OUT;
> +	}

Same as above.
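
For reference, the corrected version of both hunks would presumably read
as follows (a sketch based on the suggestion above, with the rest of
do_exec2() omitted):

	/* set the fence bits on execbuf.flags itself, not on the
	 * unused local 'flags' variable */
	if (in_fence != -1) {
		execbuf.rsvd2 = in_fence;
		execbuf.flags |= I915_EXEC_FENCE_IN;
	}

	if (out_fence != NULL) {
		*out_fence = -1;
		execbuf.flags |= I915_EXEC_FENCE_OUT;
	}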

Regards,
Rafael

* Re: [PATCH libdrm 15/15] intel: Support passing of explicit fencing from execbuf
  2016-09-30 20:53   ` Rafael Antognolli
@ 2016-10-01  8:36     ` Chris Wilson
  0 siblings, 0 replies; 49+ messages in thread
From: Chris Wilson @ 2016-10-01  8:36 UTC (permalink / raw)
  To: Rafael Antognolli; +Cc: intel-gfx

On Fri, Sep 30, 2016 at 01:53:06PM -0700, Rafael Antognolli wrote:
> Hi Chris,
> 
> On Thu, Aug 25, 2016 at 10:08:33AM +0100, Chris Wilson wrote:
> > Allow the caller to pass in an fd to an array of fences to control
> > serialisation of the execbuf in the kernel and on the GPU, and in return
> > allow creation of a fence fd for signaling the completion (and flushing)
> > of the batch. When the returned fence is signaled, all writes to the
> > buffers inside the batch will be complete and coherent from the cpu or
> > other consumers. The return fence is a sync_file object and can be
> > passed to other users (such as atomic modesetting, or other drivers).
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  intel/intel_bufmgr.h     |  6 ++++++
> >  intel/intel_bufmgr_gem.c | 38 +++++++++++++++++++++++++++++++++-----
> >  2 files changed, 39 insertions(+), 5 deletions(-)
> > 
> > diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> > index 259543f..02ec757 100644
> > --- a/intel/intel_bufmgr_gem.c
> > +++ b/intel/intel_bufmgr_gem.c
> > @@ -2412,13 +2419,20 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx,
> >  		i915_execbuffer2_set_context_id(execbuf, 0);
> >  	else
> >  		i915_execbuffer2_set_context_id(execbuf, ctx->ctx_id);
> > -	execbuf.rsvd2 = 0;
> > +	if (in_fence != -1) {
> > +		execbuf.rsvd2 = in_fence;
> > +		flags |= I915_EXEC_FENCE_IN;
> 
> The flags are being set here, but not really used anywhere.
> Maybe you meant something like:
> 
> execbuf.flags |= I915_EXEC_FENCE_IN;

Oops.
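
As an aside, the fence returned by the kernel is a sync_file fd (per the
commit message above), so userspace can wait for the batch to complete
with ordinary poll(2). A minimal sketch - out_fence here stands for the
fd handed back by the execbuf wrapper, and the helper name is made up:

#include <poll.h>
#include <unistd.h>

/*
 * A sync_file fd reports POLLIN once its fence has signaled, so a
 * plain poll() doubles as a bounded wait for the batch to complete.
 */
static int wait_fence_fd(int out_fence, int timeout_ms)
{
	struct pollfd pfd = { .fd = out_fence, .events = POLLIN };
	int ret = poll(&pfd, 1, timeout_ms);

	close(out_fence);		/* drop this reference to the fence */
	return ret == 1 ? 0 : -1;	/* 0 on signal, -1 on timeout/error */
}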
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

end of thread (newest message: 2016-10-01  8:36 UTC)

Thread overview: 49+ messages
2016-08-25  9:08 Shortest path to EGL_ANDRIOD_native_sync Chris Wilson
2016-08-25  9:08 ` [PATCH 01/13] drm/i915: Add a sw fence for collecting up dma fences Chris Wilson
2016-08-25 10:42   ` John Harrison
2016-08-25 10:51     ` Chris Wilson
2016-08-25 11:49   ` Joonas Lahtinen
2016-08-25 12:25     ` Chris Wilson
2016-08-25 16:52     ` Chris Wilson
2016-08-25  9:08 ` [PATCH 02/13] drm/i915: Only queue requests during execlists submission Chris Wilson
2016-08-26  9:41   ` Mika Kuoppala
2016-08-26  9:51     ` Chris Wilson
2016-08-26  9:54     ` Chris Wilson
2016-08-26 10:43       ` Mika Kuoppala
2016-08-26 10:53         ` Chris Wilson
2016-08-25  9:08 ` [PATCH 03/13] drm/i915: Record the position of the workarounds in the tail of the request Chris Wilson
2016-08-25  9:35   ` Mika Kuoppala
2016-08-25  9:08 ` [PATCH 04/13] drm/i915: Compute the ELSP register location once Chris Wilson
2016-08-25  9:51   ` Mika Kuoppala
2016-08-25 10:37     ` Chris Wilson
2016-08-25  9:08 ` [PATCH 05/13] drm/i915: Reorder submitting the requests to ELSP Chris Wilson
2016-08-25 13:02   ` Mika Kuoppala
2016-08-25  9:08 ` [PATCH 06/13] drm/i915: Simplify ELSP queue request tracking Chris Wilson
2016-08-25  9:08 ` [PATCH 07/13] drm/i915: Update reset path to fix incomplete requests Chris Wilson
2016-08-25  9:08 ` [PATCH 08/13] drm/i915: Drive request submission through fence callbacks Chris Wilson
2016-08-25 12:08   ` Joonas Lahtinen
2016-08-25 12:35     ` Chris Wilson
2016-08-26 12:47   ` John Harrison
2016-08-26 16:20     ` Chris Wilson
2016-08-25  9:08 ` [PATCH 09/13] drm/i915: Move execbuf object synchronisation to i915_gem_execbuffer Chris Wilson
2016-08-26 13:29   ` John Harrison
2016-08-25  9:08 ` [PATCH 10/13] drm/i915: Nonblocking request submission Chris Wilson
2016-08-26 13:39   ` John Harrison
2016-08-26 16:14     ` Chris Wilson
2016-08-25  9:08 ` [PATCH 11/13] drm/i915: Serialise execbuf operation after a dma-buf reservation object Chris Wilson
2016-08-26 13:52   ` John Harrison
2016-08-25  9:08 ` [PATCH 12/13] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
2016-08-25  9:08 ` [PATCH 13/13] drm/i915: Support explicit fencing for execbuf Chris Wilson
2016-08-26 15:08   ` John Harrison
2016-08-26 15:29     ` John Harrison
2016-08-26 15:44       ` Chris Wilson
2016-08-25  9:08 ` [PATCH libdrm 14/15] intel: Allow the client to control implicit synchronisation Chris Wilson
2016-08-25  9:08 ` [PATCH libdrm 15/15] intel: Support passing of explicit fencing from execbuf Chris Wilson
2016-09-30 20:53   ` Rafael Antognolli
2016-10-01  8:36     ` Chris Wilson
2016-08-25  9:08 ` [PATCH 16/21] i965: Add explicit fence tracking to batch flush Chris Wilson
2016-08-25  9:08 ` [PATCH 17/21] i965: Split intel_syncobject into vfuncs Chris Wilson
2016-08-25  9:08 ` [PATCH 18/21] i965: Add fd-fence backend to intel_syncobject Chris Wilson
2016-08-25  9:08 ` [PATCH 19/21] rfc! i965: Add intel_screen::has_fence_fd Chris Wilson
2016-08-25  9:08 ` [PATCH 20/21] i965: Implement EGL_ANDROID_native_fence_sync support for DRI2_FENCE Chris Wilson
2016-08-25  9:08 ` [PATCH 21/21] i965: Disable implicit sync when using EGL_ANDROID_native_fence_sync Chris Wilson
