* [RFC] drm/i915: Add sync framework support to execbuff IOCTL
From: John.C.Harrison @ 2015-07-02 11:09 UTC (permalink / raw)
  To: Intel-GFX

From: John Harrison <John.C.Harrison@Intel.com>

Various projects desire a mechanism for managing dependencies between
work items asynchronously. This can also include work items across
completely different and independent systems. For example, an
application wants to retrieve a frame from a video-in device, use it
for rendering on a GPU, then send it to the video-out device for
display, all without having to stall waiting for completion along the
way. The sync framework allows this. It encapsulates synchronisation
events in file descriptors. The application can request a sync point
for the completion of each piece of work. Drivers should also accept
sync points with each new work request and not schedule the work to
start until the sync has been signalled.

This patch adds sync framework support to the exec buffer IOCTL. A
sync point can be passed in to stall execution of the batch buffer
until signalled. And a sync point can be returned after each batch
buffer submission which will be signalled upon that batch buffer's
completion.
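
For illustration, a minimal sketch of how userspace might drive the new
interface (drm_fd and in_fence_fd are assumed to be valid, error handling
is omitted, and the flag/rsvd2 semantics are those defined by the
i915_drm.h changes below):

#include <xf86drm.h>		/* libdrm's drmIoctl() */
#include <drm/i915_drm.h>	/* with this patch applied */

static int submit_with_fences(int drm_fd, int in_fence_fd,
			      struct drm_i915_gem_execbuffer2 *eb)
{
	eb->flags |= I915_EXEC_WAIT_FENCE;	/* stall on an input fence... */
	eb->rsvd2  = (__u64)in_fence_fd;	/* ...passed in through rsvd2 */
	eb->flags |= I915_EXEC_REQUEST_FENCE;	/* and ask for an output fence */

	if (drmIoctl(drm_fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, eb))
		return -1;

	/* rsvd2 now holds a fence fd which signals when this batch buffer
	 * completes; it can be handed to another driver or waited on via
	 * the sync framework. */
	return (int)eb->rsvd2;
}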

At present, the input sync point is simply waited on synchronously
inside the exec buffer IOCTL call. Once the GPU scheduler arrives,
this will be handled asynchronously inside the scheduler and the IOCTL
can return without having to wait.

Note also that the scheduler will re-order the execution of batch
buffers, e.g. because a batch buffer is stalled on a sync point and
cannot be submitted yet but other, independent, batch buffers are
being presented to the driver. This means that the timeline within the
sync points returned cannot be global to the engine. Instead they must
be kept per context per engine (the scheduler may not re-order batches
within a context). Hence the timeline cannot be based on the existing
seqno values but must be a new implementation.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/Makefile              |   2 +
 drivers/gpu/drm/i915/i915_drv.h            |  18 ++
 drivers/gpu/drm/i915/i915_gem.c            |  38 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  89 ++++++-
 drivers/gpu/drm/i915/intel_sync.c          | 357 +++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_sync.h          |  76 ++++++
 include/uapi/drm/i915_drm.h                |  18 +-
 7 files changed, 592 insertions(+), 6 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/intel_sync.c
 create mode 100644 drivers/gpu/drm/i915/intel_sync.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index de21965..1b813ea 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -90,6 +90,8 @@ i915-y += i915_vgpu.o
 # legacy horrors
 i915-y += i915_dma.o
 
+i915-$(CONFIG_SYNC) += intel_sync.o
+
 obj-$(CONFIG_DRM_I915)  += i915.o
 
 CFLAGS_i915_trace_points.o := -I$(src)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index dca69d1..36d40d4 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -836,6 +836,10 @@ struct i915_ctx_hang_stats {
 	bool banned;
 };
 
+#ifdef CONFIG_SYNC
+struct i915_sync_timeline;
+#endif
+
 /* This must match up with the value previously used for execbuf2.rsvd1. */
 #define DEFAULT_CONTEXT_HANDLE 0
 
@@ -880,6 +884,9 @@ struct intel_context {
 		struct drm_i915_gem_object *state;
 		struct intel_ringbuffer *ringbuf;
 		int pin_count;
+#ifdef CONFIG_SYNC
+		struct i915_sync_timeline *sync_timeline;
+#endif
 	} engine[I915_NUM_RINGS];
 
 	struct list_head link;
@@ -2201,6 +2208,11 @@ struct drm_i915_gem_request {
 	/** process identifier submitting this request */
 	struct pid *pid;
 
+#ifdef CONFIG_SYNC
+	/** native sync timeline value **/
+	uint32_t sync_value;
+#endif
+
 	/**
 	 * The ELSP only accepts two elements at a time, so we queue
 	 * context/tail pairs on a given queue (ring->execlist_queue) until the
@@ -2291,6 +2303,12 @@ static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
 	*pdst = src;
 }
 
+#ifdef CONFIG_SYNC
+struct drm_i915_gem_request *i915_gem_request_find_by_sync_value(struct intel_engine_cs *ring,
+								 struct intel_context *ctx,
+								 uint32_t sync_value);
+#endif
+
 /*
  * A command that requires special handling by the command parser.
  */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ae40a65..4ef1469 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -37,6 +37,7 @@
 #include <linux/swap.h>
 #include <linux/pci.h>
 #include <linux/dma-buf.h>
+#include "intel_sync.h"
 
 #define RQ_BUG_ON(expr)
 
@@ -2534,6 +2535,17 @@ void __i915_add_request(struct drm_i915_gem_request *request,
 	 */
 	i915_gem_request_submit(request);
 
+#ifdef CONFIG_SYNC
+	/*
+	 * If an external sync point has been requested for this request then
+	 * it can be waited on without the driver's knowledge, i.e. without
+	 * calling __i915_wait_request(). Thus interrupts must be enabled
+	 * from the start rather than only on demand.
+	 */
+	if (request->sync_value)
+		i915_gem_request_enable_interrupt(request);
+#endif
+
 	if (i915.enable_execlists)
 		ret = ring->emit_request(request);
 	else {
@@ -2735,6 +2747,11 @@ void i915_gem_request_notify(struct intel_engine_cs *ring)
 
 			fence_signal_locked(&req->fence);
 			trace_i915_gem_request_complete(req);
+
+#ifdef CONFIG_SYNC
+			if (req->sync_value)
+				i915_sync_timeline_advance(req->ctx, req->ring, req->sync_value);
+#endif
 		}
 
 		list_del_init(&req->signal_list);
@@ -2830,6 +2847,27 @@ void i915_gem_request_cancel(struct drm_i915_gem_request *req)
 	i915_gem_request_unreference(req);
 }
 
+#ifdef CONFIG_SYNC
+struct drm_i915_gem_request *i915_gem_request_find_by_sync_value(struct intel_engine_cs *ring,
+								 struct intel_context *ctx,
+								 uint32_t sync_value)
+{
+	struct drm_i915_gem_request *req = NULL;
+
+	BUG_ON(!mutex_is_locked(&ring->dev->struct_mutex));
+
+	list_for_each_entry(req, &ring->request_list, list) {
+		if (req->ctx != ctx)
+			continue;
+
+		if (req->sync_value == sync_value)
+			break;
+	}
+
+	return req;
+}
+#endif
+
 struct drm_i915_gem_request *
 i915_gem_find_active_request(struct intel_engine_cs *ring)
 {
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 600db74..f556826 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -26,12 +26,14 @@
  *
  */
 
+#include <linux/syscalls.h>
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include "intel_sync.h"
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -1402,6 +1404,35 @@ eb_get_batch(struct eb_vmas *eb)
 	return vma->obj;
 }
 
+#ifdef CONFIG_SYNC
+static int i915_early_fence_wait(struct intel_engine_cs *ring, int fence_fd)
+{
+	struct sync_fence *fence;
+	int ret = 0;
+
+	if (fence_fd < 0) {
+		DRM_ERROR("Invalid wait fence fd %d on ring %d\n", fence_fd,
+			  (int) ring->id);
+		return 1;
+	}
+
+	fence = sync_fence_fdget(fence_fd);
+	if (fence == NULL) {
+		DRM_ERROR("Invalid wait fence %d on ring %d\n", fence_fd,
+			  (int) ring->id);
+		return 1;
+	}
+
+	if (atomic_read(&fence->status) == 0) {
+		if (!i915_safe_to_ignore_fence(ring, fence))
+			ret = sync_fence_wait(fence, 1000);
+	}
+
+	sync_fence_put(fence);
+	return ret;
+}
+#endif
+
 static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file,
@@ -1421,6 +1452,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	u32 dispatch_flags;
 	int ret;
 	bool need_relocs;
+	int fd_fence_complete = -1;
+#ifdef CONFIG_SYNC
+	int fd_fence_wait = (int) args->rsvd2;
+#endif
+
+	/*
+	 * Make sure a broken fence handle is not returned no matter
+	 * how early an error might be hit. Note that rsvd2 has to be
+	 * saved away first because it is also an input parameter!
+	 */
+	if (args->flags & I915_EXEC_REQUEST_FENCE)
+		args->rsvd2 = (__u64) -1;
 
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
@@ -1490,6 +1533,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
+#ifdef CONFIG_SYNC
+	/*
+	 * Without a GPU scheduler, any fence waits must be done up front.
+	 */
+	if (args->flags & I915_EXEC_WAIT_FENCE) {
+		ret = i915_early_fence_wait(ring, fd_fence_wait);
+		if (ret < 0)
+			goto pre_mutex_err;
+
+		args->flags &= ~I915_EXEC_WAIT_FENCE;
+	}
+#endif
+
 	intel_runtime_pm_get(dev_priv);
 
 	ret = i915_mutex_lock_interruptible(dev);
@@ -1637,6 +1693,28 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	params->batch_obj               = batch_obj;
 	params->ctx                     = ctx;
 
+#ifdef CONFIG_SYNC
+	if (args->flags & I915_EXEC_REQUEST_FENCE) {
+		/*
+		 * Caller has requested a sync fence.
+		 * User interrupts will be enabled to make sure that
+		 * the timeline is signalled on completion.
+		 */
+		ret = i915_sync_create_fence(params->request,
+					     &fd_fence_complete,
+					     args->flags & I915_EXEC_RING_MASK);
+		if (ret) {
+			DRM_ERROR("Fence creation failed for ring %d, ctx %p\n",
+				  ring->id, ctx);
+			args->rsvd2 = (__u64) -1;
+			goto err;
+		}
+
+		/* Return the fence through the rsvd2 field */
+		args->rsvd2 = (__u64) fd_fence_complete;
+	}
+#endif
+
 	ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
 
 err_batch_unpin:
@@ -1668,6 +1746,12 @@ pre_mutex_err:
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
 	intel_runtime_pm_put(dev_priv);
+
+	if (fd_fence_complete != -1) {
+		sys_close(fd_fence_complete);
+		args->rsvd2 = (__u64) -1;
+	}
+
 	return ret;
 }
 
@@ -1773,11 +1857,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	if (args->rsvd2 != 0) {
-		DRM_DEBUG("dirty rvsd2 field\n");
-		return -EINVAL;
-	}
-
 	exec2_list = kmalloc(sizeof(*exec2_list)*args->buffer_count,
 			     GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
 	if (exec2_list == NULL)
diff --git a/drivers/gpu/drm/i915/intel_sync.c b/drivers/gpu/drm/i915/intel_sync.c
new file mode 100644
index 0000000..c610078
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_sync.c
@@ -0,0 +1,357 @@
+/**************************************************************************
+ * Copyright (C) 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Satyanantha RamaGopal M <rama.gopal.m.satyanantha@intel.com>
+ *      Ian Lister <ian.lister@intel.com>
+ *      Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+ *      John Harrison <John.C.Harrison@Intel.com>
+ */
+#include <linux/device.h>
+#include <drm/drmP.h>
+#include <uapi/drm/drm.h>
+#include <drm/i915_drm.h>
+#include "i915_drv.h"
+#include "intel_drv.h"
+#include "intel_sync.h"
+
+static int i915_sync_pt_has_signaled(struct sync_pt *sync_pt)
+{
+	struct i915_sync_pt *pt = container_of(sync_pt,
+					       struct i915_sync_pt, pt);
+	struct sync_timeline *tl = sync_pt_parent(sync_pt);
+	struct i915_sync_timeline *obj = container_of(tl,
+			struct i915_sync_timeline, obj);
+
+	/* On ring timeout fail the status of pending sync_pts.
+	 * This callback is synchronous with the thread which calls
+	 * sync_timeline_signal. If this has been signaled due to
+	 * an error then timeline->killed_at will be set to the dead
+	 * value.
+	 */
+	if (pt->pvt.value == obj->pvt.killed_at)
+		return -ETIMEDOUT;
+	else if (pt->pvt.cycle != obj->pvt.cycle) {
+		/* The seqno has wrapped so complete this point */
+		return 1;
+	} else
+		/* This shouldn't require locking as it is synchronous
+		 * with the timeline signal function which is the only updater
+		 * of these fields
+		 */
+		return (obj->pvt.value >= pt->pvt.value) ? 1 : 0;
+
+	return 0;
+}
+
+static int i915_sync_pt_compare(struct sync_pt *a, struct sync_pt *b)
+{
+	struct i915_sync_pt *pt_a = container_of(a, struct i915_sync_pt, pt);
+	struct i915_sync_pt *pt_b = container_of(b, struct i915_sync_pt, pt);
+
+	if (pt_a->pvt.value == pt_b->pvt.value)
+		return 0;
+	else
+		return (pt_a->pvt.value > pt_b->pvt.value) ? 1 : -1;
+}
+
+static int i915_sync_fill_driver_data(struct sync_pt *sync_pt,
+				    void *data, int size)
+{
+	struct i915_sync_pt *pt = container_of(sync_pt,
+					       struct i915_sync_pt, pt);
+
+	if (size < sizeof(pt->pvt))
+		return -ENOMEM;
+
+	memcpy(data, &pt->pvt, sizeof(pt->pvt));
+
+	return sizeof(pt->pvt);
+}
+
+static
+struct sync_pt *i915_sync_pt_create(struct i915_sync_timeline *obj,
+				    u32 value, u32 cycle, u64 ring_mask)
+{
+	struct i915_sync_pt *pt;
+
+	if (!obj)
+		return NULL;
+
+	pt = (struct i915_sync_pt *)
+		sync_pt_create(&obj->obj, sizeof(struct i915_sync_pt));
+
+	if (pt) {
+		pt->pvt.value = value;
+		pt->pvt.cycle = cycle;
+		pt->pvt.ring_mask = ring_mask;
+	}
+
+	return (struct sync_pt *)pt;
+}
+
+static struct sync_pt *i915_sync_pt_dup(struct sync_pt *sync_pt)
+{
+	struct i915_sync_pt *pt = container_of(sync_pt,
+					       struct i915_sync_pt, pt);
+	struct sync_pt *new_pt;
+	struct sync_timeline *tl = sync_pt_parent(sync_pt);
+	struct i915_sync_timeline *obj = container_of(tl,
+			struct i915_sync_timeline, obj);
+
+	new_pt = (struct sync_pt *)i915_sync_pt_create(obj, pt->pvt.value,
+					pt->pvt.cycle, pt->pvt.ring_mask);
+	return new_pt;
+}
+
+static void i915_sync_pt_free(struct sync_pt *sync_pt)
+{
+}
+
+void i915_sync_pt_timeline_value_str(struct sync_timeline *timeline, char *str, int size)
+{
+	struct i915_sync_timeline *obj = container_of(timeline, struct i915_sync_timeline, obj);
+
+	snprintf(str, size, "%d [%d]", obj->pvt.value, obj->pvt.ring->get_seqno(obj->pvt.ring, true));
+}
+
+void i915_sync_pt_pt_value_str(struct sync_pt *sync_pt, char *str, int size)
+{
+	struct i915_sync_pt *pt = container_of(sync_pt,
+					       struct i915_sync_pt, pt);
+	struct i915_sync_timeline *timeline =
+		container_of(sync_pt_parent(sync_pt),
+				struct i915_sync_timeline, obj);
+	struct drm_i915_gem_request *req;
+	int ret;
+
+	ret = i915_mutex_lock_interruptible(timeline->pvt.ring->dev);
+	if (ret) {
+		snprintf(str, size, "%d [err = %d!]", pt->pvt.value, ret);
+		return;
+	}
+
+	req = i915_gem_request_find_by_sync_value(timeline->pvt.ring, timeline->pvt.ctx, pt->pvt.value);
+
+	if (req)
+		snprintf(str, size, "%d [%d:%d]", pt->pvt.value, req->uniq, req->seqno);
+	else
+		snprintf(str, size, "%d [-]", pt->pvt.value);
+
+	mutex_unlock(&timeline->pvt.ring->dev->struct_mutex);
+}
+
+struct sync_timeline_ops i915_sync_timeline_ops = {
+	.driver_name = "i915_sync",
+	.dup = i915_sync_pt_dup,
+	.has_signaled = i915_sync_pt_has_signaled,
+	.compare = i915_sync_pt_compare,
+	.fill_driver_data = i915_sync_fill_driver_data,
+	.free_pt = i915_sync_pt_free,
+	.timeline_value_str = i915_sync_pt_timeline_value_str,
+	.pt_value_str = i915_sync_pt_pt_value_str,
+};
+
+int i915_sync_timeline_create(struct drm_device *dev,
+			      struct intel_context *ctx,
+			      struct intel_engine_cs *ring)
+{
+	struct i915_sync_timeline **timeline;
+	struct i915_sync_timeline *local;
+
+	timeline = &ctx->engine[ring->id].sync_timeline;
+
+	if (*timeline)
+		return 0;
+
+	local = (struct i915_sync_timeline *)
+			sync_timeline_create(&i915_sync_timeline_ops,
+				     sizeof(struct i915_sync_timeline),
+				     ring->name);
+
+	if (!local)
+		return -EINVAL;
+
+	local->pvt.killed_at = 0;
+	local->pvt.next      = 1;
+
+	/* Start the timeline from seqno 0 as this is a special value
+	 * that is reserved for invalid sync points.
+	 */
+	local->pvt.value = 0;
+	local->pvt.ctx = ctx;
+	local->pvt.ring = ring;
+
+	*timeline = local;
+
+	return 0;
+}
+
+static uint32_t get_next_value(struct i915_sync_timeline *timeline)
+{
+	uint32_t value;
+
+	value = timeline->pvt.next;
+
+	/* Reserve zero for invalid */
+	if (++timeline->pvt.next == 0 ) {
+		timeline->pvt.next = 1;
+		timeline->pvt.cycle++;
+	}
+
+	return value;
+}
+
+void i915_sync_timeline_destroy(struct intel_context *ctx,
+				struct intel_engine_cs *ring)
+{
+	struct i915_sync_timeline **timeline;
+
+	timeline = &ctx->engine[ring->id].sync_timeline;
+
+	if (*timeline) {
+		sync_timeline_destroy(&(*timeline)->obj);
+		*timeline = NULL;
+	}
+}
+
+void i915_sync_timeline_signal(struct i915_sync_timeline *obj, u32 value)
+{
+	/* Update the timeline to notify it that
+	 * the monotonic counter has advanced.
+	 */
+	if (obj) {
+		obj->pvt.value = value;
+		sync_timeline_signal(&obj->obj);
+	}
+}
+
+int i915_sync_create_fence(struct drm_i915_gem_request *req,
+			   int *fd_out, u64 ring_mask)
+{
+	struct sync_pt *pt;
+	int fd = -1, err;
+	struct sync_fence *fence;
+	struct i915_sync_timeline *timeline;
+
+	if (req->sync_value) {
+		DRM_DEBUG_DRIVER("Already got a sync point! [ring:%s, ctx:%p, seqno:%u]\n",
+				 req->ring->name, req->ctx, i915_gem_request_get_seqno(req));
+		*fd_out = -1;
+		return -EINVAL;
+	}
+
+	timeline = req->ctx->engine[req->ring->id].sync_timeline;
+
+	if (!timeline) {
+		DRM_DEBUG_DRIVER("Missing timeline! [ring:%s, ctx:%p, seqno:%u]\n",
+				 req->ring->name, req->ctx, i915_gem_request_get_seqno(req));
+		*fd_out = -1;
+		return -ENODEV;
+	}
+
+	req->sync_value = get_next_value(timeline);
+	pt = i915_sync_pt_create(timeline,
+				 req->sync_value,
+				 timeline->pvt.cycle,
+				 ring_mask);
+	if (!pt) {
+		DRM_DEBUG_DRIVER("Failed to create sync point for ring:%s, ctx:%p, seqno:%u\n",
+				 req->ring->name, req->ctx, i915_gem_request_get_seqno(req));
+		*fd_out = -1;
+		return -ENOMEM;
+	}
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0) {
+		DRM_DEBUG_DRIVER("Unable to get file descriptor for fence\n");
+		err = fd;
+		goto err;
+	}
+
+	fence = sync_fence_create("I915", pt);
+	if (fence) {
+		sync_fence_install(fence, fd);
+		*fd_out = fd;
+		return 0;
+	}
+
+	DRM_DEBUG_DRIVER("Fence creation failed\n");
+	err = -ENOMEM;
+	put_unused_fd(fd);
+err:
+	sync_pt_free(pt);
+	*fd_out = -1;
+	return err;
+}
+
+void i915_sync_timeline_advance(struct intel_context *ctx,
+				struct intel_engine_cs *ring,
+				uint32_t value)
+{
+	struct i915_sync_timeline *timeline;
+
+	timeline = ctx->engine[ring->id].sync_timeline;
+
+	if (timeline)
+		i915_sync_timeline_signal(timeline, value);
+}
+
+bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct sync_fence *fence)
+{
+	struct i915_sync_timeline *timeline;
+	struct fence *dma_fence;
+	struct sync_pt *pt;
+	bool ignore;
+	int i;
+
+	if (atomic_read(&fence->status) != 0)
+		return true;
+
+	ignore = true;
+	for(i = 0; i < fence->num_fences; i++) {
+		dma_fence = fence->cbs[i].sync_pt;
+		pt = container_of(dma_fence, struct sync_pt, base);
+
+		/* No need to worry about dead points: */
+		if (fence_is_signaled(dma_fence))
+			continue;
+
+		/* Can't ignore other people's points: */
+		if(sync_pt_parent(pt)->ops != &i915_sync_timeline_ops) {
+			ignore = false;
+			break;
+		}
+
+		timeline = container_of(sync_pt_parent(pt), struct i915_sync_timeline, obj);
+
+		/* Can't ignore points on other rings: */
+		if (timeline->pvt.ring != ring) {
+			ignore = false;
+			break;
+		}
+
+		/* Same ring means guaranteed to be in order so ignore it. */
+	}
+
+	return ignore;
+}
diff --git a/drivers/gpu/drm/i915/intel_sync.h b/drivers/gpu/drm/i915/intel_sync.h
new file mode 100644
index 0000000..45476a1
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_sync.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Satyanantha RamaGopal M <rama.gopal.m.satyanantha@intel.com>
+ *      Ian Lister <ian.lister@intel.com>
+ *      Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+ *      John Harrison <John.C.Harrison@Intel.com>
+ */
+#ifndef _INTEL_SYNC_H_
+#define _INTEL_SYNC_H_
+
+#include <../drivers/staging/android/sync.h>
+
+#ifdef CONFIG_SYNC
+
+struct drm_i915_private;
+
+struct i915_sync_timeline {
+	struct	sync_timeline	obj;
+
+	struct {
+		u32         value;
+		u32         cycle;
+		uint32_t    killed_at;
+		uint32_t    next;
+
+		struct intel_context *ctx;
+		struct intel_engine_cs *ring;
+	} pvt;
+};
+
+struct i915_sync_pt {
+	struct sync_pt		pt;
+
+	struct drm_i915_gem_syncpt_driver_data pvt;
+};
+
+bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct sync_fence *fence);
+
+int i915_sync_timeline_create(struct drm_device *dev,
+			      struct intel_context *ctx,
+			      struct intel_engine_cs *ring);
+
+void i915_sync_timeline_destroy(struct intel_context *ctx,
+				struct intel_engine_cs *ring);
+
+int i915_sync_create_fence(struct drm_i915_gem_request *req,
+			   int *fd_out, u64 ring_mask);
+
+void i915_sync_timeline_advance(struct intel_context *ctx,
+				struct intel_engine_cs *ring,
+				uint32_t value);
+
+#endif /* CONFIG_SYNC */
+
+#endif /* _INTEL_SYNC_H_ */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index f88cc1c..9371347f 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -765,7 +765,17 @@ struct drm_i915_gem_execbuffer2 {
 #define I915_EXEC_BSD_RING1		(1<<13)
 #define I915_EXEC_BSD_RING2		(2<<13)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(1<<15)
+/** Caller supplies a sync fence fd in the rsvd2 field.
+ * Wait for it to be signalled before starting the work
+ */
+#define I915_EXEC_WAIT_FENCE		(1<<15)
+
+/** Caller wants a sync fence fd for this execbuffer.
+ *  It will be returned in rsvd2
+ */
+#define I915_EXEC_REQUEST_FENCE		(1<<16)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(1<<17)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
@@ -773,6 +783,12 @@ struct drm_i915_gem_execbuffer2 {
 #define i915_execbuffer2_get_context_id(eb2) \
 	((eb2).rsvd1 & I915_EXEC_CONTEXT_ID_MASK)
 
+struct drm_i915_gem_syncpt_driver_data {
+	__u32 value;
+	__u32 cycle;
+	__u64 ring_mask;
+};
+
 struct drm_i915_gem_pin {
 	/** Handle of the buffer to be pinned. */
 	__u32 handle;
-- 
1.9.1


* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
From: Chris Wilson @ 2015-07-02 11:54 UTC (permalink / raw)
  To: John.C.Harrison; +Cc: Intel-GFX

On Thu, Jul 02, 2015 at 12:09:59PM +0100, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> [...]
> 
> Note also that the scheduler will re-order the execution of batch
> buffers, e.g. because a batch buffer is stalled on a sync point and
> cannot be submitted yet but other, independent, batch buffers are
> being presented to the driver. This means that the timeline within the
> sync points returned cannot be global to the engine. Instead they must
> be kept per context per engine (the scheduler may not re-order batches
> within a context). Hence the timeline cannot be based on the existing
> seqno values but must be a new implementation.

But there is nothing preventing assignment of the sync value on
submission. Other than the debug .fence_value_str it's a private
implementation detail, and the interface is solely through the fd and
signalling. You could implement this as a secondary write to the HWS,
assigning the sync_value to the sync_pt on submission and
remove the request tracking, as when signalled you only need to compare
the sync_value against the timeline value in the HWS.

However, that equally applies to the existing request->seqno. That can
also be assigned on submission so that it is always an ordered timeline, and
so can be used internally or externally.
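
(Purely for illustration, a rough sketch of the HWS-based check being
suggested, in terms of the intel_sync.c structures from the patch;
TIMELINE_HWS_SLOT is a hypothetical per-context dword in the status page,
not something the patch defines:)

static int i915_sync_pt_has_signaled(struct sync_pt *sync_pt)
{
	struct i915_sync_pt *pt =
		container_of(sync_pt, struct i915_sync_pt, pt);
	struct i915_sync_timeline *obj =
		container_of(sync_pt_parent(sync_pt),
			     struct i915_sync_timeline, obj);
	/* Timeline value written by the GPU as a secondary store to the
	 * hardware status page when the request completes. */
	u32 hws_value = intel_read_status_page(obj->pvt.ring,
					       TIMELINE_HWS_SLOT);

	return i915_seqno_passed(hws_value, pt->pvt.value) ? 1 : 0;
}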

It's a pity that the sync_pt didn't forward the
fence->enable_signaling().

As for using rsvd2, make sure you do

	int fence_fd = lower_32_bits(args->rsvd2);

and maybe I915_EXEC_CREATE_FENCE is a clearer name.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
From: Chris Wilson @ 2015-07-02 12:02 UTC (permalink / raw)
  To: John.C.Harrison, Intel-GFX

On Thu, Jul 02, 2015 at 12:54:27PM +0100, Chris Wilson wrote:
> However, that equally applies to the existing request->seqno. That can
> also be assigned on submission so that it is always an ordered timeline, and
> so can be used internally or externally.

Hmm, we need to preallocate seqno space for the request during
i915_gem_request_alloc(). In the noop scheduler, we are then safe to
assign the next available. A reordering scheduler can insert whatever
commands it needs in between batches and wouldn't need to do anything in
the preallocate.
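
(Sketching that idea only, nothing from a posted patch; reserved_seqno is
a hypothetical field on the request:)

/* At request creation: reserve room on the timeline, publish nothing. */
static int i915_gem_request_reserve_seqno(struct drm_i915_gem_request *req)
{
	return i915_gem_get_seqno(req->ring->dev, &req->reserved_seqno);
}

/* On submission: with a noop scheduler, submission order matches creation
 * order, so the reserved value simply becomes the real, ordered seqno. */
static void i915_gem_request_assign_seqno(struct drm_i915_gem_request *req)
{
	req->seqno = req->reserved_seqno;
}
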
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
From: John Harrison @ 2015-07-02 13:01 UTC (permalink / raw)
  To: Chris Wilson, Intel-GFX

On 02/07/2015 12:54, Chris Wilson wrote:
> On Thu, Jul 02, 2015 at 12:09:59PM +0100, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> [...]
> But there is nothing preventing assignment of the sync value on
> submission. Other than the debug .fence_value_str it's a private
> implementation detail, and the interface is solely through the fd and
> signalling.
No, it needs to be public from the moment of creation. The sync 
framework API allows sync points to be combined together to create 
fences that either merge multiple points on the same timeline or 
amalgamate points across differing timelines. The merging part means 
that the sync point must be capable of doing arithmetic comparisons with 
other sync points from the instant it is returned to user land. And 
those comparisons must not change in the future due to scheduler 
re-ordering because by then it is too late to redo the test.
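
(As an illustration of that userspace-side merging, assuming the Android
libsync helpers sync_merge()/sync_wait(); the fence fds themselves are
hypothetical:)

#include <unistd.h>
#include <sync/sync.h>	/* Android libsync */

static void wait_for_gpu_and_camera(int gpu_fence_fd, int cam_fence_fd)
{
	/* Merge the two sync points into a single new fence... */
	int both = sync_merge("gpu+camera", gpu_fence_fd, cam_fence_fd);

	/* ...and block until every point in it has signalled
	 * (-1 means wait with no timeout). */
	sync_wait(both, -1);
	close(both);
}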


>   You could implement this as a secondary write to the HWS,
> assigning the sync_value to the sync_pt on submission and
> remove the request tracking, as when signalled you only need to compare
> the sync_value against the timeline value in the HWS.
>
> However, that equally applies to the existing request->seqno. That can
> also be assigned on submission so that it is always an ordered timeline, and
> so can be used internally or externally.

One of the scheduler patches is to defer seqno assignment until batch 
submission rather than do it at request creation (for execbuffer 
requests). You still have a problem with pre-emption though. A request 
that is pre-empted will get a new seqno assigned when it is resubmitted 
so that the HWS page always sees ordered values popping out. For 
internal requests, this is fine but for external sync points that breaks 
the assumptions made by the framework.


> It's a pity that the sync_pt didn't forward the
> fence->enable_signaling().
>
> As for using rsvd2, make sure you do
>
> 	int fence_fd = lower_32_bits(args->rsvd2);
>
> and maybe I915_EXEC_CREATE_FENCE is a clearer name.
> -Chris
>

Thanks,
John.


* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
From: Chris Wilson @ 2015-07-02 13:22 UTC (permalink / raw)
  To: John Harrison; +Cc: Intel-GFX

On Thu, Jul 02, 2015 at 02:01:56PM +0100, John Harrison wrote:
> On 02/07/2015 12:54, Chris Wilson wrote:
> >On Thu, Jul 02, 2015 at 12:09:59PM +0100, John.C.Harrison@Intel.com wrote:
> >>From: John Harrison <John.C.Harrison@Intel.com>
> >>
> >>[...]
> >But there is nothing preventing assignment of the sync value on
> >submission. Other than the debug .fence_value_str it's a private
> >implementation detail, and the interface is solely through the fd and
> >signalling.
> No, it needs to be public from the moment of creation. The sync
> framework API allows sync points to be combined together to create
> fences that either merge multiple points on the same timeline or
> amalgamate points across differing timelines. The merging part means
> that the sync point must be capable of doing arithmetic comparisons
> with other sync points from the instant it is returned to user land.
> And those comparisons must not change in the future due to scheduler
> re-ordering because by then it is too late to redo the test.

You know that's not documented at all. The only information userspace
gets is afaict

struct sync_pt_info {
	__u32   len;
	char    obj_name[32];
	char    driver_name[32];
	__s32   status;
	__u64   timestamp_ns;

	__u8    driver_data[0];
};

There is a merge operation done by combining two fences into a new one.
Merging is done by ordering the fences based on the context pointers and
then by sync_pt->fence.seqno, not the private sync value.

How does userspace try to order the fences other than as opaque fd? You
actually mean driver_data is undefined ABI...

> >  You could implement this as a secondary write to the HWS,
> >assigning the sync_value to the sync_pt on submission and
> >remove the request tracking, as when signalled you only need to compare
> >the sync_value against the timeline value in the HWS.
> >
> >However, that equally applies to the existing request->seqno. That can
> >also be assigned on submission so that it is always an ordered timeline, and
> >so can be used internally or externally.
> 
> One of the scheduler patches is to defer seqno assignment until
> batch submission rather than do it at request creation (for
> execbuffer requests). You still have a problem with pre-emption
> though. A request that is pre-empted will get a new seqno assigned
> when it is resubmitted so that the HWS page always sees ordered
> values popping out. For internal requests, this is fine but for
> external sync points that breaks the assumptions made by the
> framework.

I fail to see how. Nothing in uapi/sync.h says anything about the order
of fences or gives any such guarantees. If the external callers only
have access through the fd, there is no restriction that the timeline
sync_pt->value must be set prior to submission.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
From: John Harrison @ 2015-07-02 15:43 UTC (permalink / raw)
  To: Chris Wilson, Intel-GFX

On 02/07/2015 14:22, Chris Wilson wrote:
> On Thu, Jul 02, 2015 at 02:01:56PM +0100, John Harrison wrote:
>> On 02/07/2015 12:54, Chris Wilson wrote:
>>> On Thu, Jul 02, 2015 at 12:09:59PM +0100, John.C.Harrison@Intel.com wrote:
>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>
>>>> [...]
>>> But there is nothing preventing assignment of the sync value on
>>> submission. Other than the debug .fence_value_str it's a private
>>> implementation detail, and the interface is solely through the fd and
>>> signalling.
>> No, it needs to be public from the moment of creation. The sync
>> framework API allows sync points to be combined together to create
>> fences that either merge multiple points on the same timeline or
>> amalgamate points across differing timelines. The merging part means
>> that the sync point must be capable of doing arithmetic comparisons
>> with other sync points from the instant it is returned to user land.
>> And those comparisons must not change in the future due to scheduler
>> re-ordering because by then it is too late to redo the test.
> You know that's not documented at all. The only information userspace
> gets is afaict
>
> struct sync_pt_info {
> 	__u32   len;
> 	char    obj_name[32];
> 	char    driver_name[32];
> 	__s32   status;
> 	__u64   timestamp_ns;
>
> 	__u8    driver_data[0];
> };
>
> There is a merge operation done by combining two fences into a new one.
> Merging is done by ordering the fences based on the context pointers and
> then by sync_pt->fence.seqno, not the private sync value.
>
> How does userspace try to order the fences other than as opaque fd? You
> actually mean driver_data is undefined ABI...
Hmm, something looks confused. Way back when (i.e. the shipping Android 
tree), the 'private' seqno value was very definitely being exposed in 
various ways but it looks like it has actually been superseded by the 
seqno value inside the (new) fence object that is inside the sync_pt. 
Despite that, there is still a 'compare' callback in the timeline_ops 
for doing comparisons between sync_pts based on their private 
implementation specific seqno. Although this is marked as 'required' it 
is not actually called anywhere anymore! So it looks like we can drop 
the 'private' seqno value completely and just use the fence version.

In my defense, this code is all coming from the Android tree and the 
port to the nightly was done by someone else. I hadn't realised that 
nightly had changed quite so significantly.

>>>   You could implement this as a secondary write to the HWS,
>>> assigning the sync_value to the sync_pt on submission and
>>> remove the request tracking, as when signalled you only need to compare
>>> the sync_value against the timeline value in the HWS.
>>>
>>> However, that equally applies to the existing request->seqno. That can
>>> also be assigned on submission so that it is always an ordered timeline, and
>>> so can be used internally or externally.
>> One of the scheduler patches is to defer seqno assignment until
>> batch submission rather than do it at request creation (for
>> execbuffer requests). You still have a problem with pre-emption
>> though. A request that is pre-empted will get a new seqno assigned
>> when it is resubmitted so that the HWS page always sees ordered
>> values popping out. For internal requests, this is fine but for
>> external sync points that breaks the assumptions made by the
>> framework.
> I fail to see how. Nothing in uapi/sync.h says anything about the order
> of fences or gives any such guarantees. If the external callers only
> have access through the fd, there is no restriction that the timeline
> sync_pt->value must be set prior to submission.

As noted above, it looks like the new version of the sync framework now 
allocates its own context/value seqno pair internally at creation time. 
So all my comments really refer to that seqno inside the struct fence. 
It seems like a bad idea that we are trying to shoehorn another seqno
value on top.

Back to the drawing board...


> -Chris
>


* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
From: Chris Wilson @ 2015-07-02 15:55 UTC (permalink / raw)
  To: John Harrison; +Cc: Intel-GFX

On Thu, Jul 02, 2015 at 04:43:12PM +0100, John Harrison wrote:
> On 02/07/2015 14:22, Chris Wilson wrote:
> >On Thu, Jul 02, 2015 at 02:01:56PM +0100, John Harrison wrote:
> >>On 02/07/2015 12:54, Chris Wilson wrote:
> >>>On Thu, Jul 02, 2015 at 12:09:59PM +0100, John.C.Harrison@Intel.com wrote:
> >>>>From: John Harrison <John.C.Harrison@Intel.com>
> >>>>
> >>>>[...]
> >>>But there is nothing preventing assignment of the sync value on
> >>>submission. Other than the debug .fence_value_str it's a private
> >>>implementation detail, and the interface is solely through the fd and
> >>>signalling.
> >>No, it needs to be public from the moment of creation. The sync
> >>framework API allows sync points to be combined together to create
> >>fences that either merge multiple points on the same timeline or
> >>amalgamate points across differing timelines. The merging part means
> >>that the sync point must be capable of doing arithmetic comparisons
> >>with other sync points from the instant it is returned to user land.
> >>And those comparisons must not change in the future due to scheduler
> >>re-ordering because by then it is too late to redo the test.
> >You know that's not documented at all. The only information userspace
> >gets is afaict
> >
> >struct sync_pt_info {
> >	__u32   len;
> >	char    obj_name[32];
> >	char    driver_name[32];
> >	__s32   status;
> >	__u64   timestamp_ns;
> >
> >	__u8    driver_data[0];
> >};
> >
> >There is a merge operation done by combining two fences into a new one.
> >Merging is done by ordering the fences based on the context pointers and
> >then by sync_pt->fence.seqno, not the private sync value.
> >
> >How does userspace try to order the fences other than as opaque fd? You
> >actually mean driver_data is undefined ABI...
> Hmm, something looks confused. Way back when (i.e. the shipping
> Android tree), the 'private' seqno value was very definitely being
> exposed in various ways but it looks like it has actually been
> superseded by the seqno value inside the (new) fence object that is
> inside the sync_pt. Despite that, there is still a 'compare'
> callback in the timeline_ops for doing comparisons between sync_pts
> based on their private implementation specific seqno. Although this
> is marked as 'required' it is not actually called anywhere anymore!
> So it looks like we can drop the 'private' seqno value completely
> and just use the fence version.
> 
> In my defense, this code is all coming from the Android tree and the
> port to the nightly was done by someone else. I hadn't realised that
> nightly had changed quite so significantly.

It would be nice if we could reuse one seqno both for internal/external
fences. If you need to expose a fence ordering within a timeline that is
based on the creation stamp rather than execution stamp, it seems like
we could just add such a stamp when creating the sync_pt and not worry
about its relationship to the execution seqno.

Doing so does expose that requests are reordered to userspace since the
signalling timeline is not the same as userspace's ordered timeline. Not
sure if that is a problem or not.

Afaict the sync uapi is based on waiting for all of a set of fences to
retire. It doesn't seem to rely on fence ordering (that is knowing that
fence A will signal before fence B so it need only wait on fence B).

Here's hoping that we can have both simplicity and efficiency...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
From: Tvrtko Ursulin @ 2015-07-03 11:17 UTC (permalink / raw)
  To: Chris Wilson, John Harrison, Intel-GFX


On 07/02/2015 04:55 PM, Chris Wilson wrote:
> It would be nice if we could reuse one seqno both for internal/external
> fences. If you need to expose a fence ordering within a timeline that is
> based on the creation stamp rather than execution stamp, it seems like
> we could just add such a stamp when creating the sync_pt and not worry
> about its relationship to the execution seqno.
> 
> Doing so does expose that requests are reordered to userspace since the
> signalling timeline is not the same as userspace's ordered timeline. Not
> sure if that is a problem or not.
> 
> Afaict the sync uapi is based on waiting for all of a set of fences to
> retire. It doesn't seem to rely on fence ordering (that is knowing that
> fence A will signal before fence B so it need only wait on fence B).
> 
> Here's hoping that we can have both simplicity and efficiency...

Jumping in with a less than perfect understanding of everything here - but
the timeline business has always confused me. There is nothing in the
uapi which needs it afaics, and iirc there was some discussion at the time
Jesse floated his patches that it could be removed. Based on that, when I
squashed his patches and ported them on top of John's request-to-fence
conversion, it ended up something like the below (manually edited a bit to
be less noisy, with some prep patches omitted):

This implements the ioctl-based uapi, and indeed seqnos are not actually
used in the waits. So is this insufficient for some reason? (Other than
that it does not implement the input fence side of things.)

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 74acca9..07f6ad9 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -71,3 +71,17 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
 	  option changes the default for that module option.
 
 	  If in doubt, say "N".
+
+config DRM_I915_SYNC
+	bool "Enable explicit sync support"
+	depends on DRM_I915
+	default y if STAGING
+	depends on STAGING
+	select ANDROID
+	select SYNC
+	help
+	  Choose this option to enable Android native sync support and the
+	  corresponding i915 driver code to expose it.  Slightly increases
+	  driver size and pulls in sync support from staging.
+
+	  If in doubt, say "Y".
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index db21c93..93a3bc0 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -91,6 +91,9 @@ i915-y += i915_vgpu.o
 # legacy horrors
 i915-y += i915_dma.o
 
+# sync points
+i915-$(CONFIG_DRM_I915_SYNC)	+= i915_sync.o
+
 obj-$(CONFIG_DRM_I915)  += i915.o
 
 CFLAGS_i915_trace_points.o := -I$(src)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3ef3997..2cf4d3f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2753,6 +2753,26 @@ void i915_init_vm(struct drm_i915_private *dev_priv,
 void i915_gem_free_object(struct drm_gem_object *obj);
 void i915_gem_vma_destroy(struct i915_vma *vma);
 
+/* i915_sync.c */
+struct sync_fence;
+
+#ifdef CONFIG_DRM_I915_SYNC
+int i915_fence_ring_fill_driver_data(struct fence *fence, void *data, int size);
+void i915_fence_ring_value_str(struct fence *fence, char *str, int size);
+void i915_fence_ring_timeline_value_str(struct fence *fence, char *str,
+					int size);
+
+int i915_create_sync_fence_ring(struct drm_i915_gem_request *req,
+				struct sync_fence **sync_fence, int *fence_fd);
+#else
+static inline
+int i915_create_sync_fence_ring(struct drm_i915_gem_request *req,
+				struct sync_fence **sync_fence, int *fence_fd)
+{
+	return -ENODEV;
+}
+#endif
+
 #define PIN_MAPPABLE 0x1
 #define PIN_NONBLOCK 0x2
 #define PIN_GLOBAL 0x4
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 560d244..a04853c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2633,7 +2633,7 @@ static const char *i915_gem_request_get_timeline_name(struct fence *req_fence)
 	return req->ring->name;
 }
 
-#if 0
+#if CONFIG_DRM_I915_SYNC
 static bool i915_gem_request_is_signaled(struct fence *req_fence)
 {
 	struct drm_i915_gem_request *req = container_of(req_fence,
@@ -2770,12 +2770,14 @@ static const struct fence_ops i915_gem_request_fops = {
 	.get_driver_name	= i915_gem_request_get_driver_name,
 	.get_timeline_name	= i915_gem_request_get_timeline_name,
 	.enable_signaling	= i915_gem_request_enable_signaling,
 	.wait			= fence_default_wait,
 	.release		= i915_gem_request_free,
+#if CONFIG_DRM_I915_SYNC
+	.signaled		= i915_gem_request_is_signaled,
+	.fill_driver_data	= i915_fence_ring_fill_driver_data,
+	.fence_value_str	= i915_fence_ring_value_str,
+	.timeline_value_str	= i915_fence_ring_timeline_value_str
+#endif
 };
 
 int _i915_gem_request_alloc(struct intel_engine_cs *ring,
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 182c730..e6342ac 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include "../../../staging/android/sync.h"
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -1417,6 +1418,8 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	u32 dispatch_flags;
 	int ret;
 	bool need_relocs;
+	struct sync_fence *sync_fence = NULL;
+	int fence_fd = -1;
 
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
@@ -1610,6 +1613,13 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		goto err_batch_unpin;
 
+	if (args->flags & I915_EXEC_FENCE_OUT) {
+		ret = i915_create_sync_fence_ring(params->request, &sync_fence,
+						  &fence_fd);
+		if (ret)
+			goto err_batch_unpin;
+	}
+
 	ret = i915_gem_request_add_to_client(params->request, file);
 	if (ret)
 		goto err_batch_unpin;
@@ -1628,6 +1638,26 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	params->ctx                     = ctx;
 
 	ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
+	if (ret)
+		goto err_submit;
+
+	if (sync_fence) {
+		sync_fence_install(sync_fence, fence_fd);
+		args->rsvd2 = fence_fd;
+		sync_fence = NULL;
+	}
+
+err_submit:
+	if (sync_fence) {
+		/*
+		 * We are under the struct mutex here and sync fence we
+		 * created will attempt to grab it in its destructor.
+		 * Therefore remove the lock before unreferencing.
+		 */
+		sync_fence->lock = NULL;
+		fput(sync_fence->file);
+		put_unused_fd(fence_fd);
+	}
 
 err_batch_unpin:
 	/*
diff --git a/drivers/gpu/drm/i915/i915_sync.c b/drivers/gpu/drm/i915/i915_sync.c
new file mode 100644
index 0000000..1a50610
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_sync.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jesse Barnes <jbarnes@virtuousgeek.org>
+ *    Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/fence.h>
+#include "../../../staging/android/sync.h"
+#include "i915_drv.h"
+
+/*
+ * i915 Android native sync fences.
+ *
+ * We implement sync points in terms of i915 seqnos. They're exposed through
+ * the DRM_IOCTL_I915_GEM_EXECBUFFER2 ioctl, and can be mixed and matched
+ * with other Android timelines and aggregated into sync_fences, etc.
+ *
+ * TODO:
+ *   * Display engine fences.
+ *   * Extend driver data with context id / ring id.
+ */
+
+int i915_fence_ring_fill_driver_data(struct fence *fence, void *data, int size)
+{
+	struct drm_i915_gem_request *req = container_of(fence, typeof(*req),
+							fence);
+
+	if (size < sizeof(req->seqno))
+		return -ENOMEM;
+
+	memcpy(data, &req->seqno, sizeof(req->seqno));
+
+	return sizeof(req->seqno);
+}
+
+void i915_fence_ring_value_str(struct fence *fence, char *str, int size)
+{
+	struct drm_i915_gem_request *req = container_of(fence, typeof(*req),
+							fence);
+
+	snprintf(str, size, "%u", req->seqno);
+}
+
+void i915_fence_ring_timeline_value_str(struct fence *fence, char *str,
+					int size)
+{
+	struct drm_i915_gem_request *req = container_of(fence, typeof(*req),
+							fence);
+	struct intel_engine_cs *ring = req->ring;
+
+	snprintf(str, size, "%u", ring->get_seqno(ring, false));
+}
+
+int i915_create_sync_fence_ring(struct drm_i915_gem_request *req,
+				struct sync_fence **sync_fence, int *fence_fd)
+{
+	struct drm_device *dev = req->i915->dev;
+	struct intel_engine_cs *ring = req->ring;
+	struct sync_fence *sfence;
+	char ring_name[6] = "ring0";
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		DRM_DEBUG("No available file descriptors!\n");
+		return fd;
+	}
+
+	ring_name[4] += ring->id;
+	sfence = sync_fence_create_dma(ring_name, &req->fence,
+				       &dev->struct_mutex);
+	if (!sfence) {
+		put_unused_fd(fd);
+		return -ENOMEM;
+	}
+
+	*sync_fence = sfence;
+	*fence_fd = fd;
+
+	fence_get(&req->fence);
+
+	return 0;
+}
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 4851d66..2522f78 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -246,7 +246,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_HWS_ADDR		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_HWS_ADDR, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
-#define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
 #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
 #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
 #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -722,7 +722,7 @@ struct drm_i915_gem_execbuffer2 {
 #define I915_EXEC_CONSTANTS_REL_SURFACE (2<<6) /* gen4/5 only */
 	__u64 flags;
 	__u64 rsvd1; /* now used for context info */
-	__u64 rsvd2;
+	__u64 rsvd2; /* now used for fence fd */
 };
 
 /** Resets the SO write offset registers for transform feedback on gen7. */
@@ -760,7 +760,9 @@ struct drm_i915_gem_execbuffer2 {
 #define I915_EXEC_BSD_RING1		(1<<13)
 #define I915_EXEC_BSD_RING2		(2<<13)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(1<<15)
+#define I915_EXEC_FENCE_OUT		(1<<15)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(1<<16)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
2.4.0



* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-03 11:17           ` Tvrtko Ursulin
@ 2015-07-06  9:29             ` Daniel Vetter
  2015-07-06 12:58               ` John Harrison
  0 siblings, 1 reply; 22+ messages in thread
From: Daniel Vetter @ 2015-07-06  9:29 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-GFX

On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
> 
> On 07/02/2015 04:55 PM, Chris Wilson wrote:
> > It would be nice if we could reuse one seqno both for internal/external
> > fences. If you need to expose a fence ordering within a timeline that is
> > based on the creation stamp rather than execution stamp, it seems like
> > we could just add such a stamp when creating the sync_pt and not worry
> > about its relationship to the execution seqno.
> > 
> > Doing so does expose that requests are reordered to userspace since the
> > signalling timeline is not the same as userspace's ordered timeline. Not
> > sure if that is a problem or not.
> > 
> > Afaict the sync uapi is based on waiting for all of a set of fences to
> > retire. It doesn't seem to rely on fence ordering (that is knowing that
> > fence A will signal before fence B so it need only wait on fence B).
> > 
> > Here's hoping that we can have both simplicity and efficiency...
> 
> Jumping in with not even perfect understanding of everything here - but
> timeline business has always been confusing me. There is nothing in the 
> uapi which needs it afaics and iirc there was some discussion at the time
> Jesse floated his patches that it can be removed. Based on that when I
> squashed his patches and ported them on top of John's request to fence
> conversion it ended up something like the below (manually edited a bit to
> be less noisy and some prep patches omitted):
> 
> This implements the ioctl based uapi and indeed seqnos are not actually
> used in waits. So is this insufficient for some reason? (Other that it
> does not implement the input fence side of things.)

Yeah, android syncpt on top of struct fence embedded in the i915 request is
what I'd have expected.
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index 74acca9..07f6ad9 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -71,3 +71,17 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
>  	  option changes the default for that module option.
>  
>  	  If in doubt, say "N".
> +
> +config DRM_I915_SYNC
> +	bool "Enable explicit sync support"
> +	depends on DRM_I915
> +	default y if STAGING
> +	depends on STAGING
> +	select ANDROID
> +	select SYNC
> +	help

No Kconfig for userspace ABI please. Yes this means we need to destage
android syncpts first. The problem I see there is that apparently google
is still changing the uabi a lot, and that's a no-go for upstream. And it
needs to be cleaned up to work more seamlessly with struct fence (i.e.
anything that's missing there should be moved to struct fence, drivers
should only use fd_to_fence and fence_to_fd functions similar to dma-buf).

And we don't have anyone except android using syncpts, so a bit of trouble
with finding userspace vehicles for this. We probably need agreement from
google to be happy with a frozen abi for syncpts first ...
-Daniel


-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06  9:29             ` Daniel Vetter
@ 2015-07-06 12:58               ` John Harrison
  2015-07-06 13:59                 ` Daniel Vetter
  2015-07-07  9:15                 ` Tvrtko Ursulin
  0 siblings, 2 replies; 22+ messages in thread
From: John Harrison @ 2015-07-06 12:58 UTC (permalink / raw)
  To: Daniel Vetter, Tvrtko Ursulin; +Cc: Intel-GFX

On 06/07/2015 10:29, Daniel Vetter wrote:
> On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
>> On 07/02/2015 04:55 PM, Chris Wilson wrote:
>>> It would be nice if we could reuse one seqno both for internal/external
>>> fences. If you need to expose a fence ordering within a timeline that is
>>> based on the creation stamp rather than execution stamp, it seems like
>>> we could just add such a stamp when creating the sync_pt and not worry
>>> about its relationship to the execution seqno.
>>>
>>> Doing so does expose that requests are reordered to userspace since the
>>> signalling timeline is not the same as userspace's ordered timeline. Not
>>> sure if that is a problem or not.
>>>
>>> Afaict the sync uapi is based on waiting for all of a set of fences to
>>> retire. It doesn't seem to rely on fence ordering (that is knowing that
>>> fence A will signal before fence B so it need only wait on fence B).
>>>
>>> Here's hoping that we can have both simplicity and efficiency...
>> Jumping in with not even perfect understanding of everything here - but
>> timeline business has always been confusing me. There is nothing in the
>> uapi which needs it afaics and iirc there was some discussion at the time
>> Jesse floated his patches that it can be removed. Based on that when I
>> squashed his patches and ported them on top of John's request to fence
>> conversion it ended up something like the below (manually edited a bit to
>> be less noisy and some prep patches omitted):
>>
>> This implements the ioctl based uapi and indeed seqnos are not actually
>> used in waits. So is this insufficient for some reason? (Other that it
>> does not implement the input fence side of things.)
> Yeah android syncpt on top of struct fence embedded int i915 request is
> what I'd have expected.
The thing I'm not happy with in that plan is that it leaves the kernel 
driver at the mercy of user land applications. If we return a fence 
object to user land via a file descriptor (or indeed any other 
mechanism) then that fence object must be locked until user land closes 
the file. If the fence object is the one embedded within our request 
structure then that means user land is effectively locking our request 
structure. Given that more and more stuff is being attached to the 
request, that could be a fair bit of memory tied up that we can do 
nothing about. E.g. if a rogue/buggy application requests a fence be 
returned for every batch buffer submitted but never closes them. 
Whereas, if we go the route of a separate fence object specifically for 
user land then they can leak them like a sieve and we won't really care 
so much.
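
To make the trade-off concrete, here is a purely hypothetical sketch of what
such a separate, userland-only object could look like (none of these names
appear in the posted patch); leaking its fd would pin only an engine id and a
seqno rather than the whole request and everything hanging off it:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/types.h>

/*
 * Hypothetical sketch only: a standalone sync object that snapshots the
 * engine and target seqno at execbuffer time instead of keeping the
 * embedded request fence (and therefore the request) alive.
 */
struct i915_user_fence {
        struct kref kref;
        u32 ring_id;    /* engine the batch was submitted on */
        u32 seqno;      /* seqno that signals this fence */
};

static void i915_user_fence_release(struct kref *kref)
{
        kfree(container_of(kref, struct i915_user_fence, kref));
}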


>> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
>> index 74acca9..07f6ad9 100644
>> --- a/drivers/gpu/drm/i915/Kconfig
>> +++ b/drivers/gpu/drm/i915/Kconfig
>> @@ -71,3 +71,17 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
>>   	  option changes the default for that module option.
>>   
>>   	  If in doubt, say "N".
>> +
>> +config DRM_I915_SYNC
>> +	bool "Enable explicit sync support"
>> +	depends on DRM_I915
>> +	default y if STAGING
>> +	depends on STAGING
>> +	select ANDROID
>> +	select SYNC
>> +	help
> No Kconfig for userspace ABI please. Yes this means we need to destage
> android syncpts first.

There is already a CONFIG_SYNC flag that wraps up all the existing sync 
code in the staging branch. There's not a lot we can do about that, is 
there? We have to at least wrap the sync-specific code in the i915 
driver with '#if CONFIG_SYNC', otherwise it won't compile.


> The problem I see there is that apparently google
> is still changing the uabi a lot, and that's a no-go for upstream. And it
> needs to be cleaned up to work more seamlessly with struct fence (i.e.
> anything that's missing there should be moved to struct fence, drivers
> should only use fd_to_fence and fenct_to_fd functions similar to dma-buf).

Are Google changing it or is it upstream that is changing it? The only 
changes to drivers/staging/android/sync.c have been a few minor bug fixes 
and Maarten Lankhorst's conversion to use struct fence, which was back in 
July last year.



> And we don't have anyone except android using syncpts, so a bit a trouble
> with finding userspace vehicles for this. We probably need agreement from
> google to be happy with a frozen abi for syncpts first ...
> -Daniel

I believe Jesse is wanting to use it for his work.


John.


* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 12:58               ` John Harrison
@ 2015-07-06 13:59                 ` Daniel Vetter
  2015-07-06 14:26                   ` John Harrison
  2015-07-07  9:15                 ` Tvrtko Ursulin
  1 sibling, 1 reply; 22+ messages in thread
From: Daniel Vetter @ 2015-07-06 13:59 UTC (permalink / raw)
  To: John Harrison; +Cc: Intel-GFX

On Mon, Jul 06, 2015 at 01:58:25PM +0100, John Harrison wrote:
> On 06/07/2015 10:29, Daniel Vetter wrote:
> >On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
> >>On 07/02/2015 04:55 PM, Chris Wilson wrote:
> >>>It would be nice if we could reuse one seqno both for internal/external
> >>>fences. If you need to expose a fence ordering within a timeline that is
> >>>based on the creation stamp rather than execution stamp, it seems like
> >>>we could just add such a stamp when creating the sync_pt and not worry
> >>>about its relationship to the execution seqno.
> >>>
> >>>Doing so does expose that requests are reordered to userspace since the
> >>>signalling timeline is not the same as userspace's ordered timeline. Not
> >>>sure if that is a problem or not.
> >>>
> >>>Afaict the sync uapi is based on waiting for all of a set of fences to
> >>>retire. It doesn't seem to rely on fence ordering (that is knowing that
> >>>fence A will signal before fence B so it need only wait on fence B).
> >>>
> >>>Here's hoping that we can have both simplicity and efficiency...
> >>Jumping in with not even perfect understanding of everything here - but
> >>timeline business has always been confusing me. There is nothing in the
> >>uapi which needs it afaics and iirc there was some discussion at the time
> >>Jesse floated his patches that it can be removed. Based on that when I
> >>squashed his patches and ported them on top of John's request to fence
> >>conversion it ended up something like the below (manually edited a bit to
> >>be less noisy and some prep patches omitted):
> >>
> >>This implements the ioctl based uapi and indeed seqnos are not actually
> >>used in waits. So is this insufficient for some reason? (Other that it
> >>does not implement the input fence side of things.)
> >Yeah android syncpt on top of struct fence embedded int i915 request is
> >what I'd have expected.
> The thing I'm not happy with in that plan is that it leaves the kernel
> driver at the mercy of user land applications. If we return a fence object
> to user land via a file descriptor (or indeed any other mechanism) then that
> fence object must be locked until user land closes the file. If the fence
> object is the one embedded within our request structure then that means user
> land is effectively locking our request structure. Given that more and more
> stuff is being attached to the request, that could be a fair bit of memory
> tied up that we can do nothing about. E.g. if a rogue/buggy application
> requests a fence be returned for every batch buffer submitted but never
> closes them. Whereas, if we go the route of a separate fence object
> specifically for user land then they can leak them like a sieve and we won't
> really care so much.

Userspace can exhaust kernel allocations, that's nothing new. And if we
keep it separate, userspace simply needs to leak a few more fence fds than
if there's a bit more data attached to each one.

The solution to this problem is to have a mem cgroup limit set. No need to
complicate our kernel code.

> >>diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> >>index 74acca9..07f6ad9 100644
> >>--- a/drivers/gpu/drm/i915/Kconfig
> >>+++ b/drivers/gpu/drm/i915/Kconfig
> >>@@ -71,3 +71,17 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
> >>  	  option changes the default for that module option.
> >>  	  If in doubt, say "N".
> >>+
> >>+config DRM_I915_SYNC
> >>+	bool "Enable explicit sync support"
> >>+	depends on DRM_I915
> >>+	default y if STAGING
> >>+	depends on STAGING
> >>+	select ANDROID
> >>+	select SYNC
> >>+	help
> >No Kconfig for userspace ABI please. Yes this means we need to destage
> >android syncpts first.
> 
> There is already a CONFIG_SYNC flag that wraps up all the existing sync code
> in the staging branch. There's not a lot we can do about that is there? We
> have to at least wrap the sync specific code in the i915 driver with '#if
> CONFIG_SYNC' otherwise it won't compile.

User-settable CONFIG_SYNC is one of these bits we need to fix up when
de-staging - it should be an internal variable which is selected by i915,
like all the other optional kernel services we use.

> >The problem I see there is that apparently google
> >is still changing the uabi a lot, and that's a no-go for upstream. And it
> >needs to be cleaned up to work more seamlessly with struct fence (i.e.
> >anything that's missing there should be moved to struct fence, drivers
> >should only use fd_to_fence and fenct_to_fd functions similar to dma-buf).
> 
> Are Google changing it or is it upstream that are changing it? The only
> changes to android/staging/sync.c have been a few minor bug fixes and
> Maarten Lankhorst's conversion to use struct fence which was back in July
> last year.

Destaging android syncpt will probably require a few changes, but more
importantly it will freeze the abi. If we do that effort and google ignores
it, it's fairly pointless (as long as android is the only serious user of
syncpts).

> >And we don't have anyone except android using syncpts, so a bit a trouble
> >with finding userspace vehicles for this. We probably need agreement from
> >google to be happy with a frozen abi for syncpts first ...
> >-Daniel
> 
> I believe Jesse is wanting to use it for his work.

Yes I know, but afaik it's also a long way off. Which means not useful as
an open-source demonstration vehicle unfortunately.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 13:59                 ` Daniel Vetter
@ 2015-07-06 14:26                   ` John Harrison
  2015-07-06 14:41                     ` Daniel Vetter
  2015-07-06 14:46                     ` Tvrtko Ursulin
  0 siblings, 2 replies; 22+ messages in thread
From: John Harrison @ 2015-07-06 14:26 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel-GFX

On 06/07/2015 14:59, Daniel Vetter wrote:
> On Mon, Jul 06, 2015 at 01:58:25PM +0100, John Harrison wrote:
>> On 06/07/2015 10:29, Daniel Vetter wrote:
>>> On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
>>>> On 07/02/2015 04:55 PM, Chris Wilson wrote:
>>>>> It would be nice if we could reuse one seqno both for internal/external
>>>>> fences. If you need to expose a fence ordering within a timeline that is
>>>>> based on the creation stamp rather than execution stamp, it seems like
>>>>> we could just add such a stamp when creating the sync_pt and not worry
>>>>> about its relationship to the execution seqno.
>>>>>
>>>>> Doing so does expose that requests are reordered to userspace since the
>>>>> signalling timeline is not the same as userspace's ordered timeline. Not
>>>>> sure if that is a problem or not.
>>>>>
>>>>> Afaict the sync uapi is based on waiting for all of a set of fences to
>>>>> retire. It doesn't seem to rely on fence ordering (that is knowing that
>>>>> fence A will signal before fence B so it need only wait on fence B).
>>>>>
>>>>> Here's hoping that we can have both simplicity and efficiency...
>>>> Jumping in with not even perfect understanding of everything here - but
>>>> timeline business has always been confusing me. There is nothing in the
>>>> uapi which needs it afaics and iirc there was some discussion at the time
>>>> Jesse floated his patches that it can be removed. Based on that when I
>>>> squashed his patches and ported them on top of John's request to fence
>>>> conversion it ended up something like the below (manually edited a bit to
>>>> be less noisy and some prep patches omitted):
>>>>
>>>> This implements the ioctl based uapi and indeed seqnos are not actually
>>>> used in waits. So is this insufficient for some reason? (Other that it
>>>> does not implement the input fence side of things.)
>>> Yeah android syncpt on top of struct fence embedded int i915 request is
>>> what I'd have expected.
>> The thing I'm not happy with in that plan is that it leaves the kernel
>> driver at the mercy of user land applications. If we return a fence object
>> to user land via a file descriptor (or indeed any other mechanism) then that
>> fence object must be locked until user land closes the file. If the fence
>> object is the one embedded within our request structure then that means user
>> land is effectively locking our request structure. Given that more and more
>> stuff is being attached to the request, that could be a fair bit of memory
>> tied up that we can do nothing about. E.g. if a rogue/buggy application
>> requests a fence be returned for every batch buffer submitted but never
>> closes them. Whereas, if we go the route of a separate fence object
>> specifically for user land then they can leak them like a sieve and we won't
>> really care so much.
> Userspace can exhaust kernel allocations, that's nothing new. And if we
> keep it userspace simply needs to leak a few more fence fds than if
> there's a bit more data attached to it.
>
> The solution to this problem is to have a mem cgroup limit set. No need to
> complicate our kernel code.

There is still the extra complication that request unreferencing cannot 
require any kind of mutex lock if we are allowing it to happen from 
outside of the driver. That means the unreference callback must move the 
request to a 'please clean me later' list, schedule a worker thread to 
run, and thus do the cleanup asynchronously.
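
As a rough sketch of that idea (assumptions throughout: the free_link,
free_list and free_work members do not exist in the posted patch, and
i915_gem_request_free() stands in for the existing teardown that needs
struct_mutex), the fence release path could defer the real work to a worker
like this:

#include <linux/fence.h>
#include <linux/llist.h>
#include <linux/workqueue.h>
#include "i915_drv.h"

/* Existing teardown from the posted patch; must run under struct_mutex. */
void i915_gem_request_free(struct fence *req_fence);

/*
 * Sketch only: defer the real request teardown to a worker so the final
 * fence unreference never needs struct_mutex in the caller's context.
 * req->free_link, dev_priv->mm.free_list and dev_priv->mm.free_work are
 * hypothetical additions.
 */
static void i915_gem_request_free_worker(struct work_struct *work)
{
        struct drm_i915_private *dev_priv =
                container_of(work, struct drm_i915_private, mm.free_work);
        struct llist_node *node = llist_del_all(&dev_priv->mm.free_list);

        mutex_lock(&dev_priv->dev->struct_mutex);
        while (node) {
                struct drm_i915_gem_request *req =
                        llist_entry(node, typeof(*req), free_link);

                node = node->next;
                i915_gem_request_free(&req->fence);
        }
        mutex_unlock(&dev_priv->dev->struct_mutex);
}

/* This would replace .release in i915_gem_request_fops. */
static void i915_gem_request_release(struct fence *req_fence)
{
        struct drm_i915_gem_request *req =
                container_of(req_fence, typeof(*req), fence);
        struct drm_i915_private *dev_priv = req->i915;

        /* May be called from fput() with no driver locks held. */
        llist_add(&req->free_link, &dev_priv->mm.free_list);
        schedule_work(&dev_priv->mm.free_work);
}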


>>>> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
>>>> index 74acca9..07f6ad9 100644
>>>> --- a/drivers/gpu/drm/i915/Kconfig
>>>> +++ b/drivers/gpu/drm/i915/Kconfig
>>>> @@ -71,3 +71,17 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
>>>>   	  option changes the default for that module option.
>>>>   	  If in doubt, say "N".
>>>> +
>>>> +config DRM_I915_SYNC
>>>> +	bool "Enable explicit sync support"
>>>> +	depends on DRM_I915
>>>> +	default y if STAGING
>>>> +	depends on STAGING
>>>> +	select ANDROID
>>>> +	select SYNC
>>>> +	help
>>> No Kconfig for userspace ABI please. Yes this means we need to destage
>>> android syncpts first.
>> There is already a CONFIG_SYNC flag that wraps up all the existing sync code
>> in the staging branch. There's not a lot we can do about that is there? We
>> have to at least wrap the sync specific code in the i915 driver with '#if
>> CONFIG_SYNC' otherwise it won't compile.
> User-settable CONFIG_SYNC is one of these bits we need to fix up when
> de-staging - it should be an internal variable which is selected by i915,
> like all the other optional kernel services we use.
>
>>> The problem I see there is that apparently google
>>> is still changing the uabi a lot, and that's a no-go for upstream. And it
>>> needs to be cleaned up to work more seamlessly with struct fence (i.e.
>>> anything that's missing there should be moved to struct fence, drivers
>>> should only use fd_to_fence and fenct_to_fd functions similar to dma-buf).
>> Are Google changing it or is it upstream that are changing it? The only
>> changes to android/staging/sync.c have been a few minor bug fixes and
>> Maarten Lankhorst's conversion to use struct fence which was back in July
>> last year.
> destaging android syncpt will probably require a few changes, but more so
> it will freeze the abi. If we do that effort and google ignores it it's
> fairly pointless (as long as android is the only serious user of syncpts).
>
>>> And we don't have anyone except android using syncpts, so a bit a trouble
>>> with finding userspace vehicles for this. We probably need agreement from
>>> google to be happy with a frozen abi for syncpts first ...
>>> -Daniel
>> I believe Jesse is wanting to use it for his work.
> Yes I know, but afaik it's also a long way off. Which means not useful as
> an open-source demonstration vehicle unfortunately.
> -Daniel


* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 14:26                   ` John Harrison
@ 2015-07-06 14:41                     ` Daniel Vetter
  2015-07-06 14:46                     ` Tvrtko Ursulin
  1 sibling, 0 replies; 22+ messages in thread
From: Daniel Vetter @ 2015-07-06 14:41 UTC (permalink / raw)
  To: John Harrison; +Cc: Intel-GFX

On Mon, Jul 06, 2015 at 03:26:12PM +0100, John Harrison wrote:
> On 06/07/2015 14:59, Daniel Vetter wrote:
> >On Mon, Jul 06, 2015 at 01:58:25PM +0100, John Harrison wrote:
> >>On 06/07/2015 10:29, Daniel Vetter wrote:
> >>>On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
> >>>>On 07/02/2015 04:55 PM, Chris Wilson wrote:
> >>>>>It would be nice if we could reuse one seqno both for internal/external
> >>>>>fences. If you need to expose a fence ordering within a timeline that is
> >>>>>based on the creation stamp rather than execution stamp, it seems like
> >>>>>we could just add such a stamp when creating the sync_pt and not worry
> >>>>>about its relationship to the execution seqno.
> >>>>>
> >>>>>Doing so does expose that requests are reordered to userspace since the
> >>>>>signalling timeline is not the same as userspace's ordered timeline. Not
> >>>>>sure if that is a problem or not.
> >>>>>
> >>>>>Afaict the sync uapi is based on waiting for all of a set of fences to
> >>>>>retire. It doesn't seem to rely on fence ordering (that is knowing that
> >>>>>fence A will signal before fence B so it need only wait on fence B).
> >>>>>
> >>>>>Here's hoping that we can have both simplicity and efficiency...
> >>>>Jumping in with not even perfect understanding of everything here - but
> >>>>timeline business has always been confusing me. There is nothing in the
> >>>>uapi which needs it afaics and iirc there was some discussion at the time
> >>>>Jesse floated his patches that it can be removed. Based on that when I
> >>>>squashed his patches and ported them on top of John's request to fence
> >>>>conversion it ended up something like the below (manually edited a bit to
> >>>>be less noisy and some prep patches omitted):
> >>>>
> >>>>This implements the ioctl based uapi and indeed seqnos are not actually
> >>>>used in waits. So is this insufficient for some reason? (Other that it
> >>>>does not implement the input fence side of things.)
> >>>Yeah android syncpt on top of struct fence embedded int i915 request is
> >>>what I'd have expected.
> >>The thing I'm not happy with in that plan is that it leaves the kernel
> >>driver at the mercy of user land applications. If we return a fence object
> >>to user land via a file descriptor (or indeed any other mechanism) then that
> >>fence object must be locked until user land closes the file. If the fence
> >>object is the one embedded within our request structure then that means user
> >>land is effectively locking our request structure. Given that more and more
> >>stuff is being attached to the request, that could be a fair bit of memory
> >>tied up that we can do nothing about. E.g. if a rogue/buggy application
> >>requests a fence be returned for every batch buffer submitted but never
> >>closes them. Whereas, if we go the route of a separate fence object
> >>specifically for user land then they can leak them like a sieve and we won't
> >>really care so much.
> >Userspace can exhaust kernel allocations, that's nothing new. And if we
> >keep it userspace simply needs to leak a few more fence fds than if
> >there's a bit more data attached to it.
> >
> >The solution to this problem is to have a mem cgroup limit set. No need to
> >complicate our kernel code.
> 
> There is still the extra complication that request unreferencing cannot
> require any kind of mutex lock if we are allowing it to happen from outside
> of the driver. That means the unreference callback must move the request to
> a 'please clean me later' list, schedule a worker thread to run, and thus do
> the clean up asynchronously.

Yeah, the struct_mutex locking design is terrible, and we'll pay the price
for that dearly until it's eventually fixed up. We can optimize it at least
with mutex_trylock().

Or we just fix up request tracking to not require struct_mutex, that might
be better. All the references we hold onto the request should point one
way with no weak references going the other direction, so this should be
possible.
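
A minimal sketch of that trylock fast path, reusing the hypothetical
free_link/free_list/free_work names from the deferred-free sketch earlier in
the thread (again, none of this is in the posted patch):

/*
 * Sketch only: an alternative version of the hypothetical release
 * callback sketched earlier. Take struct_mutex opportunistically and
 * free inline; fall back to the deferred worker when the mutex is
 * contended (or already held by this thread).
 */
static void i915_gem_request_release(struct fence *req_fence)
{
        struct drm_i915_gem_request *req =
                container_of(req_fence, typeof(*req), fence);
        struct drm_i915_private *dev_priv = req->i915;

        if (mutex_trylock(&dev_priv->dev->struct_mutex)) {
                i915_gem_request_free(&req->fence);
                mutex_unlock(&dev_priv->dev->struct_mutex);
                return;
        }

        llist_add(&req->free_link, &dev_priv->mm.free_list);
        schedule_work(&dev_priv->mm.free_work);
}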
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 14:26                   ` John Harrison
  2015-07-06 14:41                     ` Daniel Vetter
@ 2015-07-06 14:46                     ` Tvrtko Ursulin
  2015-07-06 15:12                       ` Daniel Vetter
  1 sibling, 1 reply; 22+ messages in thread
From: Tvrtko Ursulin @ 2015-07-06 14:46 UTC (permalink / raw)
  To: John Harrison, Daniel Vetter; +Cc: Intel-GFX


On 07/06/2015 03:26 PM, John Harrison wrote:
> On 06/07/2015 14:59, Daniel Vetter wrote:
>> On Mon, Jul 06, 2015 at 01:58:25PM +0100, John Harrison wrote:
>>> On 06/07/2015 10:29, Daniel Vetter wrote:
>>>> On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
>>>>> On 07/02/2015 04:55 PM, Chris Wilson wrote:
>>>>>> It would be nice if we could reuse one seqno both for
>>>>>> internal/external
>>>>>> fences. If you need to expose a fence ordering within a timeline
>>>>>> that is
>>>>>> based on the creation stamp rather than execution stamp, it seems
>>>>>> like
>>>>>> we could just add such a stamp when creating the sync_pt and not
>>>>>> worry
>>>>>> about its relationship to the execution seqno.
>>>>>>
>>>>>> Doing so does expose that requests are reordered to userspace
>>>>>> since the
>>>>>> signalling timeline is not the same as userspace's ordered
>>>>>> timeline. Not
>>>>>> sure if that is a problem or not.
>>>>>>
>>>>>> Afaict the sync uapi is based on waiting for all of a set of
>>>>>> fences to
>>>>>> retire. It doesn't seem to rely on fence ordering (that is knowing
>>>>>> that
>>>>>> fence A will signal before fence B so it need only wait on fence B).
>>>>>>
>>>>>> Here's hoping that we can have both simplicity and efficiency...
>>>>> Jumping in with not even perfect understanding of everything here -
>>>>> but
>>>>> timeline business has always been confusing me. There is nothing in
>>>>> the
>>>>> uapi which needs it afaics and iirc there was some discussion at
>>>>> the time
>>>>> Jesse floated his patches that it can be removed. Based on that when I
>>>>> squashed his patches and ported them on top of John's request to fence
>>>>> conversion it ended up something like the below (manually edited a
>>>>> bit to
>>>>> be less noisy and some prep patches omitted):
>>>>>
>>>>> This implements the ioctl based uapi and indeed seqnos are not
>>>>> actually
>>>>> used in waits. So is this insufficient for some reason? (Other that it
>>>>> does not implement the input fence side of things.)
>>>> Yeah android syncpt on top of struct fence embedded int i915 request is
>>>> what I'd have expected.
>>> The thing I'm not happy with in that plan is that it leaves the kernel
>>> driver at the mercy of user land applications. If we return a fence
>>> object
>>> to user land via a file descriptor (or indeed any other mechanism)
>>> then that
>>> fence object must be locked until user land closes the file. If the
>>> fence
>>> object is the one embedded within our request structure then that
>>> means user
>>> land is effectively locking our request structure. Given that more
>>> and more
>>> stuff is being attached to the request, that could be a fair bit of
>>> memory
>>> tied up that we can do nothing about. E.g. if a rogue/buggy application
>>> requests a fence be returned for every batch buffer submitted but never
>>> closes them. Whereas, if we go the route of a separate fence object
>>> specifically for user land then they can leak them like a sieve and
>>> we won't
>>> really care so much.
>> Userspace can exhaust kernel allocations, that's nothing new. And if we
>> keep it userspace simply needs to leak a few more fence fds than if
>> there's a bit more data attached to it.
>>
>> The solution to this problem is to have a mem cgroup limit set. No
>> need to
>> complicate our kernel code.
>
> There is still the extra complication that request unreferencing cannot
> require any kind of mutex lock if we are allowing it to happen from
> outside of the driver. That means the unreference callback must move the
> request to a 'please clean me later' list, schedule a worker thread to
> run, and thus do the clean up asynchronously.

For this particular issue my solution was to extend the sync_fence 
constructor to take a mutex and store it inside the object. Then at 
destruction time, which happens at sync_fd->f_ops->release() time, it is 
just a matter of calling kref_put_mutex instead of kref_put.

Seemed to work under some quick testing but that is as much as I did 
back then.
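
Roughly along these lines, as a sketch against the staging sync code with the
lock pointer added by the squashed patch; sync_fence_free() here stands in
for the existing kref release callback in sync.c:

#include <linux/fs.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include "sync.h"

/*
 * Sketch only: final sync_fence teardown serialised against the mutex
 * stored in the fence at creation time. fence->lock is the extension
 * from the squashed patch, not upstream staging code.
 */
static void sync_fence_free(struct kref *kref);

static int sync_fence_release(struct inode *inode, struct file *file)
{
        struct sync_fence *fence = file->private_data;
        struct mutex *lock = fence->lock;

        if (lock) {
                /* Release runs with the mutex held, so drop it afterwards. */
                if (kref_put_mutex(&fence->kref, sync_fence_free, lock))
                        mutex_unlock(lock);
        } else {
                kref_put(&fence->kref, sync_fence_free);
        }

        return 0;
}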

Regards,

Tvrtko

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 14:46                     ` Tvrtko Ursulin
@ 2015-07-06 15:12                       ` Daniel Vetter
  2015-07-06 15:21                         ` Tvrtko Ursulin
  0 siblings, 1 reply; 22+ messages in thread
From: Daniel Vetter @ 2015-07-06 15:12 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-GFX

On Mon, Jul 06, 2015 at 03:46:49PM +0100, Tvrtko Ursulin wrote:
> 
> On 07/06/2015 03:26 PM, John Harrison wrote:
> >On 06/07/2015 14:59, Daniel Vetter wrote:
> >>On Mon, Jul 06, 2015 at 01:58:25PM +0100, John Harrison wrote:
> >>>On 06/07/2015 10:29, Daniel Vetter wrote:
> >>>>On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
> >>>>>On 07/02/2015 04:55 PM, Chris Wilson wrote:
> >>>>>>It would be nice if we could reuse one seqno both for
> >>>>>>internal/external
> >>>>>>fences. If you need to expose a fence ordering within a timeline
> >>>>>>that is
> >>>>>>based on the creation stamp rather than execution stamp, it seems
> >>>>>>like
> >>>>>>we could just add such a stamp when creating the sync_pt and not
> >>>>>>worry
> >>>>>>about its relationship to the execution seqno.
> >>>>>>
> >>>>>>Doing so does expose that requests are reordered to userspace
> >>>>>>since the
> >>>>>>signalling timeline is not the same as userspace's ordered
> >>>>>>timeline. Not
> >>>>>>sure if that is a problem or not.
> >>>>>>
> >>>>>>Afaict the sync uapi is based on waiting for all of a set of
> >>>>>>fences to
> >>>>>>retire. It doesn't seem to rely on fence ordering (that is knowing
> >>>>>>that
> >>>>>>fence A will signal before fence B so it need only wait on fence B).
> >>>>>>
> >>>>>>Here's hoping that we can have both simplicity and efficiency...
> >>>>>Jumping in with not even perfect understanding of everything here -
> >>>>>but
> >>>>>timeline business has always been confusing me. There is nothing in
> >>>>>the
> >>>>>uapi which needs it afaics and iirc there was some discussion at
> >>>>>the time
> >>>>>Jesse floated his patches that it can be removed. Based on that when I
> >>>>>squashed his patches and ported them on top of John's request to fence
> >>>>>conversion it ended up something like the below (manually edited a
> >>>>>bit to
> >>>>>be less noisy and some prep patches omitted):
> >>>>>
> >>>>>This implements the ioctl based uapi and indeed seqnos are not
> >>>>>actually
> >>>>>used in waits. So is this insufficient for some reason? (Other that it
> >>>>>does not implement the input fence side of things.)
> >>>>Yeah android syncpt on top of struct fence embedded int i915 request is
> >>>>what I'd have expected.
> >>>The thing I'm not happy with in that plan is that it leaves the kernel
> >>>driver at the mercy of user land applications. If we return a fence
> >>>object
> >>>to user land via a file descriptor (or indeed any other mechanism)
> >>>then that
> >>>fence object must be locked until user land closes the file. If the
> >>>fence
> >>>object is the one embedded within our request structure then that
> >>>means user
> >>>land is effectively locking our request structure. Given that more
> >>>and more
> >>>stuff is being attached to the request, that could be a fair bit of
> >>>memory
> >>>tied up that we can do nothing about. E.g. if a rogue/buggy application
> >>>requests a fence be returned for every batch buffer submitted but never
> >>>closes them. Whereas, if we go the route of a separate fence object
> >>>specifically for user land then they can leak them like a sieve and
> >>>we won't
> >>>really care so much.
> >>Userspace can exhaust kernel allocations, that's nothing new. And if we
> >>keep it userspace simply needs to leak a few more fence fds than if
> >>there's a bit more data attached to it.
> >>
> >>The solution to this problem is to have a mem cgroup limit set. No
> >>need to
> >>complicate our kernel code.
> >
> >There is still the extra complication that request unreferencing cannot
> >require any kind of mutex lock if we are allowing it to happen from
> >outside of the driver. That means the unreference callback must move the
> >request to a 'please clean me later' list, schedule a worker thread to
> >run, and thus do the clean up asynchronously.
> 
> For this particular issue my solution was to extend the sync_fence
> constructor to take a mutex and store it inside the object. Then at
> destruction time, which happens at sync_fd->f_ops->release() time, it is
> just a matter of calling kref_put_mutex instead of kref_put.
> 
> Seemed to work under some quick testing but that is as much as I did back
> then.

The problem is that it doesn't scale since everyone wants some other kind
of mutex to serialize the final kref_put. If something is supposed to be
cross-subsystem/driver (which is the case for fences) then we really can't
do that kind of leaky locking design. Imo we should have a kref_put_mutex
considered harmful sign somewhere ...

If you have weak references somewhere and need to prevent the object from
disappearing untimely while chasing that weak reference then imo the
better design pattern is to use kref_get_unless_zero. If you need the
serialization the mutex provides for some other reason (someone is only
holding the mutex instead of grabbing a proper reference when they really
should grab one) then your refcounting scheme probably needs another kind
of fixup patch.
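
For reference, a generic sketch of that kref_get_unless_zero() pattern
(deliberately not i915 code; 'entry' and 'cache' are made-up names): the
lookup side holds whatever lock protects the weak pointer and only keeps the
object if the get succeeds.

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>

/*
 * The list holds only weak pointers: it does not take a reference of
 * its own on the entries.
 */
struct entry {
        struct kref kref;
        struct list_head link;
};

static DEFINE_SPINLOCK(cache_lock);
static LIST_HEAD(cache);

static struct entry *entry_lookup(void)
{
        struct entry *e, *found = NULL;

        spin_lock(&cache_lock);
        list_for_each_entry(e, &cache, link) {
                /* Skip objects whose last strong reference is already gone. */
                if (kref_get_unless_zero(&e->kref)) {
                        found = e;
                        break;
                }
        }
        spin_unlock(&cache_lock);

        return found;   /* caller now owns a strong reference, or NULL */
}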
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 15:12                       ` Daniel Vetter
@ 2015-07-06 15:21                         ` Tvrtko Ursulin
  2015-07-06 15:37                           ` Daniel Vetter
  0 siblings, 1 reply; 22+ messages in thread
From: Tvrtko Ursulin @ 2015-07-06 15:21 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel-GFX


On 07/06/2015 04:12 PM, Daniel Vetter wrote:
> On Mon, Jul 06, 2015 at 03:46:49PM +0100, Tvrtko Ursulin wrote:
>> On 07/06/2015 03:26 PM, John Harrison wrote:
>>> On 06/07/2015 14:59, Daniel Vetter wrote:
>>>> On Mon, Jul 06, 2015 at 01:58:25PM +0100, John Harrison wrote:
>>>>> On 06/07/2015 10:29, Daniel Vetter wrote:
>>>>>> On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
>>>>>>> On 07/02/2015 04:55 PM, Chris Wilson wrote:
>>>>>>>> It would be nice if we could reuse one seqno both for
>>>>>>>> internal/external
>>>>>>>> fences. If you need to expose a fence ordering within a timeline
>>>>>>>> that is
>>>>>>>> based on the creation stamp rather than execution stamp, it seems
>>>>>>>> like
>>>>>>>> we could just add such a stamp when creating the sync_pt and not
>>>>>>>> worry
>>>>>>>> about its relationship to the execution seqno.
>>>>>>>>
>>>>>>>> Doing so does expose that requests are reordered to userspace
>>>>>>>> since the
>>>>>>>> signalling timeline is not the same as userspace's ordered
>>>>>>>> timeline. Not
>>>>>>>> sure if that is a problem or not.
>>>>>>>>
>>>>>>>> Afaict the sync uapi is based on waiting for all of a set of
>>>>>>>> fences to
>>>>>>>> retire. It doesn't seem to rely on fence ordering (that is knowing
>>>>>>>> that
>>>>>>>> fence A will signal before fence B so it need only wait on fence B).
>>>>>>>>
>>>>>>>> Here's hoping that we can have both simplicity and efficiency...
>>>>>>> Jumping in with not even perfect understanding of everything here -
>>>>>>> but
>>>>>>> timeline business has always been confusing me. There is nothing in
>>>>>>> the
>>>>>>> uapi which needs it afaics and iirc there was some discussion at
>>>>>>> the time
>>>>>>> Jesse floated his patches that it can be removed. Based on that when I
>>>>>>> squashed his patches and ported them on top of John's request to fence
>>>>>>> conversion it ended up something like the below (manually edited a
>>>>>>> bit to
>>>>>>> be less noisy and some prep patches omitted):
>>>>>>>
>>>>>>> This implements the ioctl based uapi and indeed seqnos are not
>>>>>>> actually
>>>>>>> used in waits. So is this insufficient for some reason? (Other than that it
>>>>>>> does not implement the input fence side of things.)
>>>>>> Yeah android syncpt on top of struct fence embedded in i915 request is
>>>>>> what I'd have expected.
>>>>> The thing I'm not happy with in that plan is that it leaves the kernel
>>>>> driver at the mercy of user land applications. If we return a fence
>>>>> object
>>>>> to user land via a file descriptor (or indeed any other mechanism)
>>>>> then that
>>>>> fence object must be locked until user land closes the file. If the
>>>>> fence
>>>>> object is the one embedded within our request structure then that
>>>>> means user
>>>>> land is effectively locking our request structure. Given that more
>>>>> and more
>>>>> stuff is being attached to the request, that could be a fair bit of
>>>>> memory
>>>>> tied up that we can do nothing about. E.g. if a rogue/buggy application
>>>>> requests a fence be returned for every batch buffer submitted but never
>>>>> closes them. Whereas, if we go the route of a separate fence object
>>>>> specifically for user land then they can leak them like a sieve and
>>>>> we won't
>>>>> really care so much.
>>>> Userspace can exhaust kernel allocations, that's nothing new. And if we
>>>> keep it userspace simply needs to leak a few more fence fds than if
>>>> there's a bit more data attached to it.
>>>>
>>>> The solution to this problem is to have a mem cgroup limit set. No
>>>> need to
>>>> complicate our kernel code.
>>>
>>> There is still the extra complication that request unreferencing cannot
>>> require any kind of mutex lock if we are allowing it to happen from
>>> outside of the driver. That means the unreference callback must move the
>>> request to a 'please clean me later' list, schedule a worker thread to
>>> run, and thus do the clean up asynchronously.
>>
>> For this particular issue my solution was to extend the sync_fence
>> constructor to take a mutex and store it inside the object. Then at
>> destruction time, which happens at sync_fd->f_ops->release() time, it is
>> just a matter of calling kref_put_mutex instead of kref_put.
>>
>> Seemed to work under some quick testing but that is as much as I did back
>> then.
>
> The problem is that it doesn't scale since everyone wants some other kind
> of mutex to serialize the final kref_put. If something is supposed to be
> cross-subsystem/driver (which is the case for fences) then we really can't
> do that kind of leaky locking design. Imo we should have a kref_put_mutex
> considered harmful sign somewhere ...

I get the argument about everyone wanting to add their own mutex not 
scaling, but I don't see how that ties in with the leaky comment? Also, a 
mutex is a pretty standard thing, especially since kref_put_mutex 
exists. :D If you look at it from that angle it kind of just exposes to 
the super class what the base class can do.
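
Just to illustrate what I mean, the scheme was roughly the following 
(hand-written sketch with made-up names, not the actual sync_fence or 
i915 code):

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

/* illustrative sketch only: made-up names, not the real sync/i915 code */
struct my_fence {
	struct kref kref;
	struct mutex *lock;	/* supplied by the creator at construction time */
	struct list_head link;	/* driver-side bookkeeping, protected by *lock */
};

static void my_fence_release(struct kref *kref)
{
	struct my_fence *f = container_of(kref, struct my_fence, kref);

	/* kref_put_mutex() calls us with f->lock already held */
	list_del(&f->link);
	mutex_unlock(f->lock);
	kfree(f);
}

static void my_fence_put(struct my_fence *f)
{
	/* only takes the mutex if this put drops the last reference */
	kref_put_mutex(&f->kref, my_fence_release, f->lock);
}

So sync_fd->f_ops->release() would end up in something like my_fence_put() 
here, with the driver passing in whatever mutex it needs for its teardown.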

> If you have weak references somewhere and need to prevent the object from
> disappearing untimely while chasing that weak reference then imo the
> better design pattern is to use kref_get_unless_zero. If you need the
> serialization the mutex provides for some other reason (someone is only
> holding the mutex instead of grabbing a proper reference when they really
> should grab one) then your refcounting scheme probably needs another kind
> of fixup patch.

I don't see how weak references can work, since if the request goes away 
the information is lost, unless it is stored somewhere else.

Regards,

Tvrtko


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 15:21                         ` Tvrtko Ursulin
@ 2015-07-06 15:37                           ` Daniel Vetter
  2015-07-06 16:34                             ` Tvrtko Ursulin
  0 siblings, 1 reply; 22+ messages in thread
From: Daniel Vetter @ 2015-07-06 15:37 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-GFX

On Mon, Jul 06, 2015 at 04:21:28PM +0100, Tvrtko Ursulin wrote:
> 
> On 07/06/2015 04:12 PM, Daniel Vetter wrote:
> >On Mon, Jul 06, 2015 at 03:46:49PM +0100, Tvrtko Ursulin wrote:
> >>On 07/06/2015 03:26 PM, John Harrison wrote:
> >>>On 06/07/2015 14:59, Daniel Vetter wrote:
> >>>>On Mon, Jul 06, 2015 at 01:58:25PM +0100, John Harrison wrote:
> >>>>>On 06/07/2015 10:29, Daniel Vetter wrote:
> >>>>>>On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
> >>>>>>>On 07/02/2015 04:55 PM, Chris Wilson wrote:
> >>>>>>>>It would be nice if we could reuse one seqno both for
> >>>>>>>>internal/external
> >>>>>>>>fences. If you need to expose a fence ordering within a timeline
> >>>>>>>>that is
> >>>>>>>>based on the creation stamp rather than execution stamp, it seems
> >>>>>>>>like
> >>>>>>>>we could just add such a stamp when creating the sync_pt and not
> >>>>>>>>worry
> >>>>>>>>about its relationship to the execution seqno.
> >>>>>>>>
> >>>>>>>>Doing so does expose that requests are reordered to userspace
> >>>>>>>>since the
> >>>>>>>>signalling timeline is not the same as userspace's ordered
> >>>>>>>>timeline. Not
> >>>>>>>>sure if that is a problem or not.
> >>>>>>>>
> >>>>>>>>Afaict the sync uapi is based on waiting for all of a set of
> >>>>>>>>fences to
> >>>>>>>>retire. It doesn't seem to rely on fence ordering (that is knowing
> >>>>>>>>that
> >>>>>>>>fence A will signal before fence B so it need only wait on fence B).
> >>>>>>>>
> >>>>>>>>Here's hoping that we can have both simplicity and efficiency...
> >>>>>>>Jumping in with not even perfect understanding of everything here -
> >>>>>>>but
> >>>>>>>timeline business has always been confusing me. There is nothing in
> >>>>>>>the
> >>>>>>>uapi which needs it afaics and iirc there was some discussion at
> >>>>>>>the time
> >>>>>>>Jesse floated his patches that it can be removed. Based on that when I
> >>>>>>>squashed his patches and ported them on top of John's request to fence
> >>>>>>>conversion it ended up something like the below (manually edited a
> >>>>>>>bit to
> >>>>>>>be less noisy and some prep patches omitted):
> >>>>>>>
> >>>>>>>This implements the ioctl based uapi and indeed seqnos are not
> >>>>>>>actually
> >>>>>>>used in waits. So is this insufficient for some reason? (Other than that it
> >>>>>>>does not implement the input fence side of things.)
> >>>>>>Yeah android syncpt on top of struct fence embedded in i915 request is
> >>>>>>what I'd have expected.
> >>>>>The thing I'm not happy with in that plan is that it leaves the kernel
> >>>>>driver at the mercy of user land applications. If we return a fence
> >>>>>object
> >>>>>to user land via a file descriptor (or indeed any other mechanism)
> >>>>>then that
> >>>>>fence object must be locked until user land closes the file. If the
> >>>>>fence
> >>>>>object is the one embedded within our request structure then that
> >>>>>means user
> >>>>>land is effectively locking our request structure. Given that more
> >>>>>and more
> >>>>>stuff is being attached to the request, that could be a fair bit of
> >>>>>memory
> >>>>>tied up that we can do nothing about. E.g. if a rogue/buggy application
> >>>>>requests a fence be returned for every batch buffer submitted but never
> >>>>>closes them. Whereas, if we go the route of a separate fence object
> >>>>>specifically for user land then they can leak them like a sieve and
> >>>>>we won't
> >>>>>really care so much.
> >>>>Userspace can exhaust kernel allocations, that's nothing new. And if we
> >>>>keep it userspace simply needs to leak a few more fence fds than if
> >>>>there's a bit more data attached to it.
> >>>>
> >>>>The solution to this problem is to have a mem cgroup limit set. No
> >>>>need to
> >>>>complicate our kernel code.
> >>>
> >>>There is still the extra complication that request unreferencing cannot
> >>>require any kind of mutex lock if we are allowing it to happen from
> >>>outside of the driver. That means the unreference callback must move the
> >>>request to a 'please clean me later' list, schedule a worker thread to
> >>>run, and thus do the clean up asynchronously.
> >>
> >>For this particular issue my solution was to extend the sync_fence
> >>constructor to take a mutex and store it inside the object. Then at
> >>destruction time, which happens at sync_fd->f_ops->release() time, it is
> >>just a matter of calling kref_put_mutex instead of kref_put.
> >>
> >>Seemed to work under some quick testing but that is as much as I did back
> >>then.
> >
> >The problem is that it doesn't scale since everyone wants some other kind
> >of mutex to serialize the final kref_put. If something is supposed to be
> >cross-subsystem/driver (which is the case for fences) then we really can't
> >do that kind of leaky locking design. Imo we should have a kref_put_mutex
> >considered harmful sign somewhere ...
> 
> I get the argument about everyone wanting to add their own mutex not
> scaling, but I don't see how that ties in with the leaky comment? Also, a
> mutex is a pretty standard thing, especially since kref_put_mutex
> exists. :D If you look at it from that angle it kind of just exposes to
> the super class what the base class can do.

Leaky not as in leaking memory, but as in a leaky abstraction - we impose
i915 locking internals onto users of the fence interface (somewhat, at least).
> 
> >If you have weak references somewhere and need to prevent the object from
> >disappearing untimely while chasing that weak reference then imo the
> >better design pattern is to use kref_get_unless_zero. If you need the
> >serialization the mutex provides for some other reason (someone is only
> >holding the mutex instead of grabbing a proper reference when they really
> >should grab one) then your refcounting scheme probably needs another kind
> >of fixup patch.
> 
> I don't see how weak references can work, since if the request goes away
> the information is lost, unless it is stored somewhere else.

So a weak reference is a pointer which doesn't hold a reference to the
object controlled with kref, but is protected by some lock. At the latest
when the object is destroyed we need to clear that pointer in the free
callback of kref_put (and so also grab the lock that protects that
pointer). The problem is that anyone else chasing that weak reference
might race with the final kref_put and increase the refcount from 0 to 1,
which isn't good.

There are two ways to do that:
- kref_put_mutex on the release side.
- on the acquire side, do a kref_get_unless_zero _while_ holding that lock.

Two upsides of the latter approach:
- You can have an unrestricted amount of weak references, each protected
  with their own lock. kref_put_mutex only works for up to one.
- It doesn't serialize the final kref_put with the lock - doing that
  allows folks to rely on the mutex instead of a proper refcount to make
  the obj stick around, which is imo a design antipattern that I have
  suffered through trying to clean up again and again.
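
A rough sketch of what that looks like, with completely made-up names 
(illustration only, using a spinlock here but any lock protecting the weak 
pointer works; this is not actual i915 code):

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* illustrative sketch only: made-up structures, not real kernel code */
struct my_request {
	struct kref kref;
	struct my_tracker *tracker;
};

struct my_tracker {
	spinlock_t weak_lock;
	struct my_request *weak_ptr;	/* weak reference, holds no refcount */
};

/* acquire side: chase the weak reference and upgrade it to a real one */
static struct my_request *my_tracker_get_request(struct my_tracker *t)
{
	struct my_request *rq;

	spin_lock(&t->weak_lock);
	rq = t->weak_ptr;
	/* might race with the final kref_put, so only grab it if still alive */
	if (rq && !kref_get_unless_zero(&rq->kref))
		rq = NULL;
	spin_unlock(&t->weak_lock);

	return rq;	/* caller now owns a full reference, or got NULL */
}

/* release side: a plain kref_put(&rq->kref, my_request_release) suffices */
static void my_request_release(struct kref *kref)
{
	struct my_request *rq = container_of(kref, struct my_request, kref);
	struct my_tracker *t = rq->tracker;

	/* clear the weak pointer under its lock before freeing */
	spin_lock(&t->weak_lock);
	if (t->weak_ptr == rq)
		t->weak_ptr = NULL;
	spin_unlock(&t->weak_lock);

	kfree(rq);
}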

But that's really a tangent to the discussion here, I have no idea whether
this applies here since I didn't read your patch in detail.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 15:37                           ` Daniel Vetter
@ 2015-07-06 16:34                             ` Tvrtko Ursulin
  2015-07-06 17:58                               ` Daniel Vetter
  0 siblings, 1 reply; 22+ messages in thread
From: Tvrtko Ursulin @ 2015-07-06 16:34 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel-GFX


On 07/06/2015 04:37 PM, Daniel Vetter wrote:
> On Mon, Jul 06, 2015 at 04:21:28PM +0100, Tvrtko Ursulin wrote:
>>
>> On 07/06/2015 04:12 PM, Daniel Vetter wrote:
>>> On Mon, Jul 06, 2015 at 03:46:49PM +0100, Tvrtko Ursulin wrote:
>>
>>> If you have weak references somewhere and need to prevent the object from
>>> disappearing untimely while chasing that weak reference then imo the
>>> better design pattern is to use kref_get_unless_zero. If you need the
>>> serialization the mutex provides for some other reason (someone is only
>>> holding the mutex instead of grabbing a proper reference when they really
>>> should grab one) then your refcounting scheme probably needs another kind
>>> of fixup patch.
>>
>> I don't see how weak references can work, since if the request goes away
>> the information is lost, unless it is stored somewhere else.
>
> So a weak reference is a pointer which doesn't hold a reference to the
> object controlled with kref, but is protected by some lock. At the latest
> when the object is destroyed we need to clear that pointer in the free
> callback of kref_put (and so also grab the lock that protects that
> pointer). The problem is that anyone else chasing that weak reference
> might race with the final kref_put and increase the refcount from 0 to
> 1, which isn't good.
>
> There are two ways to do that:
> - kref_put_mutex on the release side.
> - on the acquire side, do a kref_get_unless_zero _while_ holding that lock.
>
> Two upsides of the latter approach:
> - You can have an unrestricted amount of weak references, each protected
>    with their own lock. kref_put_mutex only works for up to one.
> - It doesn't serialize the final kref_put with the lock - doing that
>    allows folks to rely on the mutex instead of a proper refcount to make
>    the obj stick around, which is imo a design antipattern that I have
>    suffered through trying to clean up again and again.
>
> But that's really a tangent to the discussion here, I have no idea whether
> this applies here since I didn't read your patch in detail.

I think it is not about my patch but about whether the Android native sync 
implementation handles losing the weak reference on a fence.

I don't think it does, and I am not sure how easy or hard it would be to 
change it right now. (The whole area traditionally confuses me since there 
are too many things called sync-something and fence-something.)

I would need to handle it to be able to at least report the status of a 
fence to userspace and gracefully fail other operations. And that is 
assuming it wouldn't break existing userspace.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 16:34                             ` Tvrtko Ursulin
@ 2015-07-06 17:58                               ` Daniel Vetter
  0 siblings, 0 replies; 22+ messages in thread
From: Daniel Vetter @ 2015-07-06 17:58 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-GFX

On Mon, Jul 06, 2015 at 05:34:22PM +0100, Tvrtko Ursulin wrote:
> 
> On 07/06/2015 04:37 PM, Daniel Vetter wrote:
> >On Mon, Jul 06, 2015 at 04:21:28PM +0100, Tvrtko Ursulin wrote:
> >>
> >>On 07/06/2015 04:12 PM, Daniel Vetter wrote:
> >>>On Mon, Jul 06, 2015 at 03:46:49PM +0100, Tvrtko Ursulin wrote:
> >>
> >>>If you have weak references somewhere and need to prevent the object from
> >>>disappearing untimely while chasing that weak reference then imo the
> >>>better design pattern is to use kref_get_unless_zero. If you need the
> >>>serialization the mutex provides for some other reason (someone is only
> >>>holding the mutex instead of grabbing a proper reference when they really
> >>>should grab one) then your refcounting scheme probably needs another kind
> >>>of fixup patch.
> >>
> >>I don't see how weak references can work, since if the request goes away
> >>the information is lost, unless it is stored somewhere else.
> >
> >So a weak reference is a pointer which doesn't hold a reference to the
> >object controlled with kref, but is protected by some lock. At the latest
> >when the object is destroyed we need to clear that pointer in the free
> >callback of kref_put (and so also grab the lock that protects that
> >pointer). The problem is that anyone else chasing that weak reference
> >might race with the final kref_put and increase the refcount from 0 to
> >1, which isn't good.
> >
> >There are two ways to do that:
> >- kref_put_mutex on the release side.
> >- on the acquire side, do a kref_get_unless_zero _while_ holding that lock.
> >
> >Two upsides of the latter approach:
> >- You can have an unrestricted amount of weak references, each protected
> >   with their own lock. kref_put_mutex only works for up to one.
> >- It doesn't serialize the final kref_put with the lock - doing that
> >   allows folks to rely on the mutex instead of a proper refcount to make
> >   the obj stick around, which is imo a design antipattern that I have
> >   suffered through trying to clean up again and again.
> >
> >But that's really a tangent to the discussion here, I have no idea whether
> >this applies here since I didn't read your patch in detail.
> 
> I think it is not about my patch but whether the Android native sync
> implementation handles losing the weak reference on a fence.
> 
> I don't think it does and I am not sure how easy or hard would it be to
> change it right now. (The whole area traditionally confuses me since there
> are too many things called sync something and fence something.)
> 
> I would need to handle it to be able at least report the status of a fence
> to userspace and gracefully fail other operations. And that is assuming that
> wouldn't break existing userspace.

For reporting status it should grab a strong reference. I thought this was
about i915-internal lists and stuff we use to keep track of our own
requests, for which we'd need struct_mutex. struct_mutex won't protect any
of the syncpt internal state. I guess I'm rather confused now as to what
this is about ...
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-06 12:58               ` John Harrison
  2015-07-06 13:59                 ` Daniel Vetter
@ 2015-07-07  9:15                 ` Tvrtko Ursulin
  2015-07-29 21:19                   ` Jesse Barnes
  1 sibling, 1 reply; 22+ messages in thread
From: Tvrtko Ursulin @ 2015-07-07  9:15 UTC (permalink / raw)
  To: John Harrison, Daniel Vetter; +Cc: Intel-GFX


On 07/06/2015 01:58 PM, John Harrison wrote:
> On 06/07/2015 10:29, Daniel Vetter wrote:
>> On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
>>> On 07/02/2015 04:55 PM, Chris Wilson wrote:
>>>> It would be nice if we could reuse one seqno both for internal/external
>>>> fences. If you need to expose a fence ordering within a timeline
>>>> that is
>>>> based on the creation stamp rather than execution stamp, it seems like
>>>> we could just add such a stamp when creating the sync_pt and not worry
>>>> about its relationship to the execution seqno.
>>>>
>>>> Doing so does expose that requests are reordered to userspace since the
>>>> signalling timeline is not the same as userspace's ordered timeline.
>>>> Not
>>>> sure if that is a problem or not.
>>>>
>>>> Afaict the sync uapi is based on waiting for all of a set of fences to
>>>> retire. It doesn't seem to rely on fence ordering (that is knowing that
>>>> fence A will signal before fence B so it need only wait on fence B).
>>>>
>>>> Here's hoping that we can have both simplicity and efficiency...
>>> Jumping in with not even perfect understanding of everything here - but
>>> timeline business has always been confusing me. There is nothing in the
>>> uapi which needs it afaics and iirc there was some discussion at the
>>> time
>>> Jesse floated his patches that it can be removed. Based on that when I
>>> squashed his patches and ported them on top of John's request to fence
>>> conversion it ended up something like the below (manually edited a
>>> bit to
>>> be less noisy and some prep patches omitted):
>>>
>>> This implements the ioctl based uapi and indeed seqnos are not actually
>>> used in waits. So is this insufficient for some reason? (Other than that it
>>> does not implement the input fence side of things.)
>> Yeah android syncpt on top of struct fence embedded in i915 request is
>> what I'd have expected.
> The thing I'm not happy with in that plan is that it leaves the kernel
> driver at the mercy of user land applications. If we return a fence
> object to user land via a file descriptor (or indeed any other
> mechanism) then that fence object must be locked until user land closes
> the file. If the fence object is the one embedded within our request
> structure then that means user land is effectively locking our request
> structure. Given that more and more stuff is being attached to the
> request, that could be a fair bit of memory tied up that we can do
> nothing about. E.g. if a rogue/buggy application requests a fence be
> returned for every batch buffer submitted but never closes them.
> Whereas, if we go the route of a separate fence object specifically for
> user land then they can leak them like a sieve and we won't really care
> so much.

I am starting to agree gradually with this view. Given all the 
complications, referencing requests for exporting via fds feels quite 
heavy-weight, with potentially unbound dependencies and more trickiness 
in the future, even if we agreed on referencing and locking details.

Seqnos per context sounds like a significantly more light-weight and 
decoupled implementation.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-07  9:15                 ` Tvrtko Ursulin
@ 2015-07-29 21:19                   ` Jesse Barnes
  2015-07-30 11:36                     ` John Harrison
  0 siblings, 1 reply; 22+ messages in thread
From: Jesse Barnes @ 2015-07-29 21:19 UTC (permalink / raw)
  To: Tvrtko Ursulin, John Harrison, Daniel Vetter; +Cc: Intel-GFX

On 07/07/2015 02:15 AM, Tvrtko Ursulin wrote:
> 
> On 07/06/2015 01:58 PM, John Harrison wrote:
>> On 06/07/2015 10:29, Daniel Vetter wrote:
>>> On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
>>>> On 07/02/2015 04:55 PM, Chris Wilson wrote:
>>>>> It would be nice if we could reuse one seqno both for internal/external
>>>>> fences. If you need to expose a fence ordering within a timeline
>>>>> that is
>>>>> based on the creation stamp rather than execution stamp, it seems like
>>>>> we could just add such a stamp when creating the sync_pt and not worry
>>>>> about its relationship to the execution seqno.
>>>>>
>>>>> Doing so does expose that requests are reordered to userspace since the
>>>>> signalling timeline is not the same as userspace's ordered timeline.
>>>>> Not
>>>>> sure if that is a problem or not.
>>>>>
>>>>> Afaict the sync uapi is based on waiting for all of a set of fences to
>>>>> retire. It doesn't seem to rely on fence ordering (that is knowing that
>>>>> fence A will signal before fence B so it need only wait on fence B).
>>>>>
>>>>> Here's hoping that we can have both simplicity and efficiency...
>>>> Jumping in with not even perfect understanding of everything here - but
>>>> timeline business has always been confusing me. There is nothing in the
>>>> uapi which needs it afaics and iirc there was some discussion at the
>>>> time
>>>> Jesse floated his patches that it can be removed. Based on that when I
>>>> squashed his patches and ported them on top of John's request to fence
>>>> conversion it ended up something like the below (manually edited a
>>>> bit to
>>>> be less noisy and some prep patches omitted):
>>>>
>>>> This implements the ioctl based uapi and indeed seqnos are not actually
>>>> used in waits. So is this insufficient for some reason? (Other than that it
>>>> does not implement the input fence side of things.)
>>> Yeah android syncpt on top of struct fence embedded in i915 request is
>>> what I'd have expected.
>> The thing I'm not happy with in that plan is that it leaves the kernel
>> driver at the mercy of user land applications. If we return a fence
>> object to user land via a file descriptor (or indeed any other
>> mechanism) then that fence object must be locked until user land closes
>> the file. If the fence object is the one embedded within our request
>> structure then that means user land is effectively locking our request
>> structure. Given that more and more stuff is being attached to the
>> request, that could be a fair bit of memory tied up that we can do
>> nothing about. E.g. if a rogue/buggy application requests a fence be
>> returned for every batch buffer submitted but never closes them.
>> Whereas, if we go the route of a separate fence object specifically for
>> user land then they can leak them like a sieve and we won't really care
>> so much.
> 
> I am starting to agree gradually with this view. Given all the
> complications, referencing requests for exporting via fds feels quite
> heavy-weight, with potentially unbound dependencies and more
> trickiness in the future, even if we agreed on referencing and locking
> details.
> 
> Seqnos per context sounds like a significantly more light-weight and
> decoupled implementation.

I think this is the right long term direction as well; conceptually the
per-context seqnos make the most sense in light of scheduling, and they
let us keep things simple for sync pts as well.  Only question is, who
is signed up to make it all work?

Jesse

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] drm/i915: Add sync framework support to execbuff IOCTL
  2015-07-29 21:19                   ` Jesse Barnes
@ 2015-07-30 11:36                     ` John Harrison
  0 siblings, 0 replies; 22+ messages in thread
From: John Harrison @ 2015-07-30 11:36 UTC (permalink / raw)
  To: Jesse Barnes, Tvrtko Ursulin, Daniel Vetter; +Cc: Intel-GFX

On 29/07/2015 22:19, Jesse Barnes wrote:
> On 07/07/2015 02:15 AM, Tvrtko Ursulin wrote:
>> On 07/06/2015 01:58 PM, John Harrison wrote:
>>> On 06/07/2015 10:29, Daniel Vetter wrote:
>>>> On Fri, Jul 03, 2015 at 12:17:33PM +0100, Tvrtko Ursulin wrote:
>>>>> On 07/02/2015 04:55 PM, Chris Wilson wrote:
>>>>>> It would be nice if we could reuse one seqno both for internal/external
>>>>>> fences. If you need to expose a fence ordering within a timeline
>>>>>> that is
>>>>>> based on the creation stamp rather than execution stamp, it seems like
>>>>>> we could just add such a stamp when creating the sync_pt and not worry
>>>>>> about its relationship to the execution seqno.
>>>>>>
>>>>>> Doing so does expose that requests are reordered to userspace since the
>>>>>> signalling timeline is not the same as userspace's ordered timeline.
>>>>>> Not
>>>>>> sure if that is a problem or not.
>>>>>>
>>>>>> Afaict the sync uapi is based on waiting for all of a set of fences to
>>>>>> retire. It doesn't seem to rely on fence ordering (that is knowing that
>>>>>> fence A will signal before fence B so it need only wait on fence B).
>>>>>>
>>>>>> Here's hoping that we can have both simplicity and efficiency...
>>>>> Jumping in with not even perfect understanding of everything here - but
>>>>> timeline business has always been confusing me. There is nothing in the
>>>>> uapi which needs it afaics and iirc there was some discussion at the
>>>>> time
>>>>> Jesse floated his patches that it can be removed. Based on that when I
>>>>> squashed his patches and ported them on top of John's request to fence
>>>>> conversion it ended up something like the below (manually edited a
>>>>> bit to
>>>>> be less noisy and some prep patches omitted):
>>>>>
>>>>> This implements the ioctl based uapi and indeed seqnos are not actually
>>>>> used in waits. So is this insufficient for some reason? (Other than that it
>>>>> does not implement the input fence side of things.)
>>>> Yeah android syncpt on top of struct fence embedded in i915 request is
>>>> what I'd have expected.
>>> The thing I'm not happy with in that plan is that it leaves the kernel
>>> driver at the mercy of user land applications. If we return a fence
>>> object to user land via a file descriptor (or indeed any other
>>> mechanism) then that fence object must be locked until user land closes
>>> the file. If the fence object is the one embedded within our request
>>> structure then that means user land is effectively locking our request
>>> structure. Given that more and more stuff is being attached to the
>>> request, that could be a fair bit of memory tied up that we can do
>>> nothing about. E.g. if a rogue/buggy application requests a fence be
>>> returned for every batch buffer submitted but never closes them.
>>> Whereas, if we go the route of a separate fence object specifically for
>>> user land then they can leak them like a sieve and we won't really care
>>> so much.
>> I am starting to agree gradually with this view. Given all the
>> complications, referencing requests for exporting via fds feels quite
>> heavy-weight, with potentially unbound dependencies and more
>> trickiness in the future, even if we agreed on referencing and locking
>> details.
>>
>> Seqnos per context sounds like a significantly more light-weight and
>> decoupled implementation.
> I think this is the right long term direction as well; conceptually the
> per-context seqnos make the most sense in light of scheduling, and they
> let us keep things simple for sync pts as well.  Only question is, who
> is signed up to make it all work?
>
> Jesse
>

That's the version I had originally: a separate fence object using a 
per-context, per-ring timeline that is safe to export to user land. 
However, Daniel Vetter was very strongly convinced that using a single 
shared fence object both internally and externally was the better solution.

My current implementation has a per-context, per-ring timeline which is 
used to give the fence a definitely in-order and sensible seqno. It is 
just not the same seqno that goes through the hardware. At least, not 
yet! Although that change could be quite significant.
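
Roughly, the shape of that timeline is the following (illustrative sketch 
only; the names and fields are made up and not the actual intel_sync.c 
code):

#include <linux/spinlock.h>
#include <linux/types.h>

/* illustrative sketch only: one of these per context, per ring */
struct ctx_ring_timeline {
	spinlock_t lock;
	u32 next_seqno;	/* fence-ordering seqno, not the hw execution seqno */
};

static u32 ctx_ring_timeline_advance(struct ctx_ring_timeline *tl)
{
	u32 seqno;

	spin_lock(&tl->lock);
	seqno = ++tl->next_seqno;	/* monotonic within this context/ring only */
	spin_unlock(&tl->lock);

	return seqno;
}

Each sync point created for a batch buffer would take its value from its 
context's timeline via something like ctx_ring_timeline_advance(), so the 
fence gets a definitely in-order seqno per context/ring without being tied 
to the seqno that actually goes through the hardware.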

John.

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2015-07-30 11:36 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-02 11:09 [RFC] drm/i915: Add sync framework support to execbuff IOCTL John.C.Harrison
2015-07-02 11:54 ` Chris Wilson
2015-07-02 12:02   ` Chris Wilson
2015-07-02 13:01   ` John Harrison
2015-07-02 13:22     ` Chris Wilson
2015-07-02 15:43       ` John Harrison
2015-07-02 15:55         ` Chris Wilson
2015-07-03 11:17           ` Tvrtko Ursulin
2015-07-06  9:29             ` Daniel Vetter
2015-07-06 12:58               ` John Harrison
2015-07-06 13:59                 ` Daniel Vetter
2015-07-06 14:26                   ` John Harrison
2015-07-06 14:41                     ` Daniel Vetter
2015-07-06 14:46                     ` Tvrtko Ursulin
2015-07-06 15:12                       ` Daniel Vetter
2015-07-06 15:21                         ` Tvrtko Ursulin
2015-07-06 15:37                           ` Daniel Vetter
2015-07-06 16:34                             ` Tvrtko Ursulin
2015-07-06 17:58                               ` Daniel Vetter
2015-07-07  9:15                 ` Tvrtko Ursulin
2015-07-29 21:19                   ` Jesse Barnes
2015-07-30 11:36                     ` John Harrison
