* Make execbuf fast[er]
@ 2017-02-23 16:18 Chris Wilson
  2017-02-23 16:18 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
                   ` (14 more replies)
  0 siblings, 15 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

Many eons ago, we added ppgtt support. Amid the rejoicing was a bitter
pill: it was slow, much slower, due to driver overhead in looking up the
vma. In part this was because obj_to_vma was a linear walk, but there
was also the cost in execbuf of going from handle to obj to vma, and we
have talked for a long time about finding a faster method, so why not
use a ht to go from handle to vma directly. That's the headline; the
majority of the code deals with little fixes to avoid relocations and to
reduce the cost of processing them (and the cost of avoiding them).
-Chris
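
As a rough sketch of the headline idea (hypothetical code, not lifted
from the series; names such as handle_lut and lut_entry are invented,
and where the table lives, per context or per file, is left open here),
a small hash table keyed on the execbuf handle takes the lookup straight
to the vma:

	#include <linux/hashtable.h>
	#include <linux/slab.h>

	struct lut_entry {
		struct hlist_node node;
		u32 handle;
		struct i915_vma *vma;
	};

	struct handle_lut {
		DECLARE_HASHTABLE(ht, 8); /* 256 buckets; hash_init() before use */
	};

	static void lut_add(struct handle_lut *lut, u32 handle,
			    struct i915_vma *vma)
	{
		struct lut_entry *e = kmalloc(sizeof(*e), GFP_KERNEL);

		if (!e) /* sketch: caller would handle -ENOMEM properly */
			return;

		e->handle = handle;
		e->vma = vma;
		hash_add(lut->ht, &e->node, handle);
	}

	static struct i915_vma *lut_lookup(struct handle_lut *lut, u32 handle)
	{
		struct lut_entry *e;

		/* handle -> vma directly, skipping the handle -> obj -> vma walk */
		hash_for_each_possible(lut->ht, e, node, handle)
			if (e->handle == handle)
				return e->vma;

		return NULL;
	}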


* [PATCH 01/15] drm/i915: Copy user requested buffers into the error state
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-28  6:11   ` Ben Widawsky
  2017-02-28 14:17   ` Joonas Lahtinen
  2017-02-23 16:18 ` [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new Chris Wilson
                   ` (13 subsequent siblings)
  14 siblings, 2 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

Introduce a new execobject.flag (EXEC_OBJECT_CAPTURE) that userspace may
use to indicate that it wants the contents of this buffer preserved in
the error state (/sys/class/drm/cardN/error) following a GPU hang
involving this batch.

Use this at your discretion: the contents of the error state, although
compressed, are allocated with GFP_ATOMIC (i.e. limited) and kept for
all eternity (until the error state is destroyed).
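
For illustration, a minimal userspace sketch of how the flag would be
used (assuming an open i915 DRM fd and an already-filled
drm_i915_gem_exec_object2; only EXEC_OBJECT_CAPTURE and
I915_PARAM_HAS_EXEC_CAPTURE come from this patch, the helper names are
made up):

	#include <stdbool.h>
	#include <xf86drm.h>
	#include <drm/i915_drm.h>

	static bool has_exec_capture(int fd)
	{
		int value = 0;
		struct drm_i915_getparam gp = {
			.param = I915_PARAM_HAS_EXEC_CAPTURE,
			.value = &value,
		};

		if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
			return false;

		return value > 0;
	}

	static void request_capture(int fd, struct drm_i915_gem_exec_object2 *obj)
	{
		/* Ask for this buffer to be dumped into /sys/class/drm/cardN/error
		 * if a GPU hang involves the batch it is part of. */
		if (has_exec_capture(fd))
			obj->flags |= EXEC_OBJECT_CAPTURE;
	}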

Based on an earlier patch by Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Matt Turner <mattst88@gmail.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_drv.h            |  3 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 +++++++++
 drivers/gpu/drm/i915/i915_gem_request.c    | 16 ++++++++++++
 drivers/gpu/drm/i915/i915_gem_request.h    | 11 ++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c      | 40 +++++++++++++++++++++++++++++-
 include/uapi/drm/i915_drm.h                | 15 ++++++++++-
 7 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 409fc32ce2bd..842c62b96a83 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -353,6 +353,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_ASYNC:
 	case I915_PARAM_HAS_EXEC_FENCE:
 	case I915_PARAM_HAS_EXEC_FENCE_DMABUF:
+	case I915_PARAM_HAS_EXEC_CAPTURE:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 440a4725b87f..2cc0253d6ef7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1018,6 +1018,9 @@ struct i915_gpu_state {
 			u32 *pages[0];
 		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
+		struct drm_i915_error_object **user_bo;
+		long user_bo_count;
+
 		struct drm_i915_error_object *wa_ctx;
 
 		struct drm_i915_error_request {
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 3f2796131410..e8ffe0c9a20e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1113,6 +1113,18 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
+		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
+			struct i915_gem_capture_list *capture;
+
+			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
+			if (unlikely(!capture))
+				return -ENOMEM;
+
+			capture->next = req->capture_list;
+			capture->vma = vma;
+			req->capture_list = capture;
+		}
+
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 			continue;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index ad9d4ce07fb6..3a159cac2172 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -286,6 +286,19 @@ void i915_gem_retire_noop(struct i915_gem_active *active,
 	/* Space left intentionally blank */
 }
 
+static void request_free_capture_list(struct drm_i915_gem_request *request)
+{
+	struct i915_gem_capture_list *capture;
+
+	capture = request->capture_list;
+	while (capture) {
+		struct i915_gem_capture_list *next = capture->next;
+
+		kfree(capture);
+		capture = next;
+	}
+}
+
 static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -320,6 +333,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	}
 	unreserve_seqno(request->engine);
 
+	request_free_capture_list(request);
+
 	/* Walk through the active list, calling retire on each. This allows
 	 * objects to track their GPU activity and mark themselves as idle
 	 * when their *last* active request is completed (updating state
@@ -615,6 +630,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	req->global_seqno = 0;
 	req->file_priv = NULL;
 	req->batch = NULL;
+	req->capture_list = NULL;
 
 	/*
 	 * Reserve space in the ring buffer for all the commands required to
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 0efee879df23..cc24a6c72748 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -73,6 +73,11 @@ struct i915_priotree {
 #define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)
 };
 
+struct i915_gem_capture_list {
+	struct i915_gem_capture_list *next;
+	struct i915_vma *vma;
+};
+
 /**
  * Request queue structure.
  *
@@ -167,6 +172,12 @@ struct drm_i915_gem_request {
 	 * error state dump only).
 	 */
 	struct i915_vma *batch;
+	/** Additional buffers requested by userspace to be captured upon
+	 * a GPU hang. The vma/obj on this list are protected by their
+	 * active reference - all objects on this list must also be
+	 * on the active_list (of their final request).
+	 */
+	struct i915_gem_capture_list *capture_list;
 	struct list_head active_list;
 
 	/** Time at which this request was emitted, in jiffies. */
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 2b1d15668192..76855e1d8795 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -709,6 +709,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			print_error_obj(m, dev_priv->engine[i], NULL, obj);
 		}
 
+		for (j = 0; j < ee->user_bo_count; j++)
+			print_error_obj(m, dev_priv->engine[i],
+					"user", ee->user_bo[j]);
+
 		if (ee->num_requests) {
 			err_printf(m, "%s --- %d requests\n",
 				   dev_priv->engine[i]->name,
@@ -822,11 +826,15 @@ void __i915_gpu_state_free(struct kref *error_ref)
 {
 	struct i915_gpu_state *error =
 		container_of(error_ref, typeof(*error), ref);
-	int i;
+	long i, j;
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		struct drm_i915_error_engine *ee = &error->engine[i];
 
+		for (j = 0; j < ee->user_bo_count; j++)
+			i915_error_object_free(ee->user_bo[j]);
+		kfree(ee->user_bo);
+
 		i915_error_object_free(ee->batchbuffer);
 		i915_error_object_free(ee->wa_batchbuffer);
 		i915_error_object_free(ee->ringbuffer);
@@ -1343,6 +1351,35 @@ static void record_context(struct drm_i915_error_context *e,
 	e->active = ctx->active_count;
 }
 
+static void request_record_user_bo(struct drm_i915_gem_request *request,
+				   struct drm_i915_error_engine *ee)
+{
+	struct i915_gem_capture_list *c;
+	struct drm_i915_error_object **bo;
+	long count;
+
+	count = 0;
+	for (c = request->capture_list; c; c = c->next)
+		count++;
+
+	bo = NULL;
+	if (count)
+		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
+	if (!bo)
+		return;
+
+	count = 0;
+	for (c = request->capture_list; c; c = c->next) {
+		bo[count] = i915_error_object_create(request->i915, c->vma);
+		if (!bo[count])
+			break;
+		count++;
+	}
+
+	ee->user_bo = bo;
+	ee->user_bo_count = count;
+}
+
 static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				  struct i915_gpu_state *error)
 {
@@ -1389,6 +1426,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				ee->wa_batchbuffer =
 					i915_error_object_create(dev_priv,
 								 engine->scratch);
+			request_record_user_bo(request, ee);
 
 			ee->ctx =
 				i915_error_object_create(dev_priv,
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index ebc7641b5252..9eda849df680 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -418,6 +418,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_FENCE_DMABUF 45
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to capture
+ * user specified buffers for post-mortem debugging of GPU hangs. See
+ * EXEC_OBJECT_CAPTURE.
+ */
+#define I915_PARAM_HAS_EXEC_CAPTURE	 46
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -779,8 +785,15 @@ struct drm_i915_gem_exec_object2 {
  * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
  */
 #define EXEC_OBJECT_ASYNC		(1<<6)
+/* Request that the contents of this execobject be copied into the error
+ * state upon a GPU hang involving this batch for post-mortem debugging.
+ * These buffers are recorded in no particular order as "user" in
+ * /sys/class/drm/cardN/error. Query I915_PARAM_HAS_EXEC_CAPTURE to see
+ * if the kernel supports this flag.
+ */
+#define EXEC_OBJECT_CAPTURE		(1<<7)
 /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_CAPTURE<<1)
 	__u64 flags;
 
 	union {
-- 
2.11.0


* [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
  2017-02-23 16:18 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 03/15] drm/i915: Drop spinlocks around adding to the client request list Chris Wilson
                   ` (12 subsequent siblings)
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

Since obj->active_count is only updated upon retirement, if we see an
active object in the batch pool, double check that it is still active
before deciding to allocate a new object.
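
Restated in isolation (this mirrors the hunk below rather than adding
anything new), the reuse decision in i915_gem_batch_pool_get() becomes:

	list_for_each_entry(obj, list, batch_pool_link) {
		if (i915_gem_object_is_active(obj)) {
			/* Batches are in strict LRU order: if the oldest is
			 * still busy on the GPU, so is everything after it. */
			if (!reservation_object_test_signaled_rcu(obj->resv, true))
				break;

			/* Fences have signalled but retirement has not run
			 * yet; run it now so this object can be reused. */
			i915_gem_retire_requests(pool->engine->i915);
		}

		if (obj->base.size >= size)
			goto found;	/* reuse rather than allocate */
	}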

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_batch_pool.c | 37 ++++++++++++++----------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_batch_pool.c b/drivers/gpu/drm/i915/i915_gem_batch_pool.c
index 99ceae7855f8..41aa598c4f3b 100644
--- a/drivers/gpu/drm/i915/i915_gem_batch_pool.c
+++ b/drivers/gpu/drm/i915/i915_gem_batch_pool.c
@@ -96,8 +96,7 @@ struct drm_i915_gem_object *
 i915_gem_batch_pool_get(struct i915_gem_batch_pool *pool,
 			size_t size)
 {
-	struct drm_i915_gem_object *obj = NULL;
-	struct drm_i915_gem_object *tmp;
+	struct drm_i915_gem_object *obj;
 	struct list_head *list;
 	int n, ret;
 
@@ -112,31 +111,29 @@ i915_gem_batch_pool_get(struct i915_gem_batch_pool *pool,
 		n = ARRAY_SIZE(pool->cache_list) - 1;
 	list = &pool->cache_list[n];
 
-	list_for_each_entry(tmp, list, batch_pool_link) {
+	list_for_each_entry(obj, list, batch_pool_link) {
 		/* The batches are strictly LRU ordered */
-		if (i915_gem_object_is_active(tmp))
-			break;
+		if (i915_gem_object_is_active(obj)) {
+			if (!reservation_object_test_signaled_rcu(obj->resv,
+								  true))
+				break;
 
-		GEM_BUG_ON(!reservation_object_test_signaled_rcu(tmp->resv,
-								 true));
+			i915_gem_retire_requests(pool->engine->i915);
+			GEM_BUG_ON(i915_gem_object_is_active(obj));
+		}
 
-		if (tmp->base.size >= size) {
-			/* Clear the set of shared fences early */
-			reservation_object_lock(tmp->resv, NULL);
-			reservation_object_add_excl_fence(tmp->resv, NULL);
-			reservation_object_unlock(tmp->resv);
+		GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv,
+								 true));
 
-			obj = tmp;
-			break;
-		}
+		if (obj->base.size >= size)
+			goto found;
 	}
 
-	if (obj == NULL) {
-		obj = i915_gem_object_create_internal(pool->engine->i915, size);
-		if (IS_ERR(obj))
-			return obj;
-	}
+	obj = i915_gem_object_create_internal(pool->engine->i915, size);
+	if (IS_ERR(obj))
+		return obj;
 
+found:
 	ret = i915_gem_object_pin_pages(obj);
 	if (ret)
 		return ERR_PTR(ret);
-- 
2.11.0


* [PATCH 03/15] drm/i915: Drop spinlocks around adding to the client request list
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
  2017-02-23 16:18 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
  2017-02-23 16:18 ` [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-24 12:05   ` Mika Kuoppala
  2017-02-23 16:18 ` [PATCH 04/15] drm/i915: Amalgamate execbuffer parameter structures Chris Wilson
                   ` (11 subsequent siblings)
  14 siblings, 1 reply; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

We can add to the tail of the client request list without the spinlock,
as the only other user is the throttle ioctl, which iterates forwards
over the list. It only needs protection against deletion of a request
as it reads it; it simply won't see a new request added to the end of
the list, or it would be too early and be rejected. We can further
reduce the number of spinlock acquisitions required when throttling by
removing stale requests from the client_list as we throttle.
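
As a condensed view of the resulting scheme (mirroring the hunks below,
nothing here is new code): the producer appends to the list without
taking the spinlock, while the throttle walk, which already holds
file_priv->mm.lock, prunes completed requests as it goes:

	/* Producer: no spinlock needed; the throttle ioctl only requires
	 * that a request is not deleted underneath it while it reads. */
	static void add_to_client(struct drm_i915_gem_request *req,
				  struct drm_file *file)
	{
		req->file_priv = file->driver_priv;
		list_add_tail(&req->client_link, &req->file_priv->mm.request_list);
	}

	/* Throttle walk, under file_priv->mm.lock: every request older than
	 * the one we end up waiting on is unlinked, shortening later walks. */
	list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
		if (time_after_eq(request->emitted_jiffies, recent_enough))
			break;

		if (target) {
			list_del(&target->client_link);
			target->file_priv = NULL;
		}

		target = request;
	}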

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |  2 +-
 drivers/gpu/drm/i915/i915_gem.c            | 14 ++++++------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 ++++++++----
 drivers/gpu/drm/i915/i915_gem_request.c    | 34 ++++++------------------------
 drivers/gpu/drm/i915/i915_gem_request.h    |  4 +---
 5 files changed, 23 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1a28b5279bec..ddae8e442176 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -506,7 +506,7 @@ static int i915_gem_object_info(struct seq_file *m, void *data)
 		mutex_lock(&dev->struct_mutex);
 		request = list_first_entry_or_null(&file_priv->mm.request_list,
 						   struct drm_i915_gem_request,
-						   client_list);
+						   client_link);
 		rcu_read_lock();
 		task = pid_task(request && request->ctx->pid ?
 				request->ctx->pid : file->pid,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index de1fc98e041d..92ab989bb05f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3667,16 +3667,14 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 		return -EIO;
 
 	spin_lock(&file_priv->mm.lock);
-	list_for_each_entry(request, &file_priv->mm.request_list, client_list) {
+	list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
 		if (time_after_eq(request->emitted_jiffies, recent_enough))
 			break;
 
-		/*
-		 * Note that the request might not have been submitted yet.
-		 * In which case emitted_jiffies will be zero.
-		 */
-		if (!request->emitted_jiffies)
-			continue;
+		if (target) {
+			list_del(&target->client_link);
+			target->file_priv = NULL;
+		}
 
 		target = request;
 	}
@@ -4735,7 +4733,7 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
 	 * file_priv.
 	 */
 	spin_lock(&file_priv->mm.lock);
-	list_for_each_entry(request, &file_priv->mm.request_list, client_list)
+	list_for_each_entry(request, &file_priv->mm.request_list, client_link)
 		request->file_priv = NULL;
 	spin_unlock(&file_priv->mm.lock);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e8ffe0c9a20e..2b570d0b2392 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1420,6 +1420,14 @@ i915_gem_execbuffer_parse(struct intel_engine_cs *engine,
 	return vma;
 }
 
+static void
+add_to_client(struct drm_i915_gem_request *req,
+	      struct drm_file *file)
+{
+	req->file_priv = file->driver_priv;
+	list_add_tail(&req->client_link, &req->file_priv->mm.request_list);
+}
+
 static int
 execbuf_submit(struct i915_execbuffer_params *params,
 	       struct drm_i915_gem_execbuffer2 *args,
@@ -1507,6 +1515,7 @@ execbuf_submit(struct i915_execbuffer_params *params,
 		return ret;
 
 	i915_gem_execbuffer_move_to_active(vmas, params->request);
+	add_to_client(params->request, params->file);
 
 	return 0;
 }
@@ -1886,10 +1895,6 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 */
 	params->request->batch = params->batch;
 
-	ret = i915_gem_request_add_to_client(params->request, file);
-	if (ret)
-		goto err_request;
-
 	/*
 	 * Save assorted stuff away to pass through to *_submission().
 	 * NB: This data should be 'persistent' and not local as it will
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 3a159cac2172..5bca3e25bf61 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -82,42 +82,20 @@ const struct dma_fence_ops i915_fence_ops = {
 	.release = i915_fence_release,
 };
 
-int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
-				   struct drm_file *file)
-{
-	struct drm_i915_private *dev_private;
-	struct drm_i915_file_private *file_priv;
-
-	WARN_ON(!req || !file || req->file_priv);
-
-	if (!req || !file)
-		return -EINVAL;
-
-	if (req->file_priv)
-		return -EINVAL;
-
-	dev_private = req->i915;
-	file_priv = file->driver_priv;
-
-	spin_lock(&file_priv->mm.lock);
-	req->file_priv = file_priv;
-	list_add_tail(&req->client_list, &file_priv->mm.request_list);
-	spin_unlock(&file_priv->mm.lock);
-
-	return 0;
-}
-
 static inline void
 i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 {
-	struct drm_i915_file_private *file_priv = request->file_priv;
+	struct drm_i915_file_private *file_priv;
 
+	file_priv = request->file_priv;
 	if (!file_priv)
 		return;
 
 	spin_lock(&file_priv->mm.lock);
-	list_del(&request->client_list);
-	request->file_priv = NULL;
+	if (request->file_priv) {
+		list_del(&request->client_link);
+		request->file_priv = NULL;
+	}
 	spin_unlock(&file_priv->mm.lock);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index cc24a6c72748..1edc0fa7794c 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -191,7 +191,7 @@ struct drm_i915_gem_request {
 
 	struct drm_i915_file_private *file_priv;
 	/** file_priv list entry for this request */
-	struct list_head client_list;
+	struct list_head client_link;
 };
 
 extern const struct dma_fence_ops i915_fence_ops;
@@ -204,8 +204,6 @@ static inline bool dma_fence_is_i915(const struct dma_fence *fence)
 struct drm_i915_gem_request * __must_check
 i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       struct i915_gem_context *ctx);
-int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
-				   struct drm_file *file);
 void i915_gem_request_retire_upto(struct drm_i915_gem_request *req);
 
 static inline struct drm_i915_gem_request *
-- 
2.11.0


* [PATCH 04/15] drm/i915: Amalgamate execbuffer parameter structures
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (2 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 03/15] drm/i915: Drop spinlocks around adding to the client request list Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 05/15] drm/i915: Use vma->exec_entry as our double-entry placeholder Chris Wilson
                   ` (10 subsequent siblings)
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

Combine the two slightly overlapping parameter structures we pass around
the execbuffer routines into one.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 560 ++++++++++++-----------------
 1 file changed, 238 insertions(+), 322 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 2b570d0b2392..0f6c7acbc063 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -51,70 +51,74 @@
 
 #define BATCH_OFFSET_BIAS (256*1024)
 
-struct i915_execbuffer_params {
-	struct drm_device               *dev;
-	struct drm_file                 *file;
-	struct i915_vma			*batch;
-	u32				dispatch_flags;
-	u32				args_batch_start_offset;
-	struct intel_engine_cs          *engine;
-	struct i915_gem_context         *ctx;
-	struct drm_i915_gem_request     *request;
-};
-
-struct eb_vmas {
+struct i915_execbuffer {
 	struct drm_i915_private *i915;
+	struct drm_file *file;
+	struct drm_i915_gem_execbuffer2 *args;
+	struct drm_i915_gem_exec_object2 *exec;
+	struct intel_engine_cs *engine;
+	struct i915_gem_context *ctx;
+	struct i915_address_space *vm;
+	struct i915_vma *batch;
+	struct drm_i915_gem_request *request;
+	u32 batch_start_offset;
+	u32 batch_len;
+	unsigned int dispatch_flags;
+	struct drm_i915_gem_exec_object2 shadow_exec_entry;
+	bool need_relocs;
 	struct list_head vmas;
+	struct reloc_cache {
+		struct drm_mm_node node;
+		unsigned long vaddr;
+		unsigned int page;
+		bool use_64bit_reloc;
+	} reloc_cache;
 	int and;
 	union {
-		struct i915_vma *lut[0];
-		struct hlist_head buckets[0];
+		struct i915_vma **lut;
+		struct hlist_head *buckets;
 	};
 };
 
-static struct eb_vmas *
-eb_create(struct drm_i915_private *i915,
-	  struct drm_i915_gem_execbuffer2 *args)
+static int
+eb_create(struct i915_execbuffer *eb)
 {
-	struct eb_vmas *eb = NULL;
-
-	if (args->flags & I915_EXEC_HANDLE_LUT) {
-		unsigned size = args->buffer_count;
+	eb->lut = NULL;
+	if (eb->args->flags & I915_EXEC_HANDLE_LUT) {
+		unsigned int size = eb->args->buffer_count;
 		size *= sizeof(struct i915_vma *);
-		size += sizeof(struct eb_vmas);
-		eb = kmalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+		eb->lut = kmalloc(size,
+				  GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
 	}
 
-	if (eb == NULL) {
-		unsigned size = args->buffer_count;
-		unsigned count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
+	if (!eb->lut) {
+		unsigned int size = eb->args->buffer_count;
+		unsigned int count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
 		BUILD_BUG_ON_NOT_POWER_OF_2(PAGE_SIZE / sizeof(struct hlist_head));
 		while (count > 2*size)
 			count >>= 1;
-		eb = kzalloc(count*sizeof(struct hlist_head) +
-			     sizeof(struct eb_vmas),
-			     GFP_TEMPORARY);
-		if (eb == NULL)
-			return eb;
+		eb->lut = kzalloc(count*sizeof(struct hlist_head),
+				  GFP_TEMPORARY);
+		if (!eb->lut)
+			return -ENOMEM;
 
 		eb->and = count - 1;
 	} else
-		eb->and = -args->buffer_count;
+		eb->and = -eb->args->buffer_count;
 
-	eb->i915 = i915;
 	INIT_LIST_HEAD(&eb->vmas);
-	return eb;
+	return 0;
 }
 
 static void
-eb_reset(struct eb_vmas *eb)
+eb_reset(struct i915_execbuffer *eb)
 {
 	if (eb->and >= 0)
 		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
 }
 
 static struct i915_vma *
-eb_get_batch(struct eb_vmas *eb)
+eb_get_batch(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_list);
 
@@ -134,34 +138,30 @@ eb_get_batch(struct eb_vmas *eb)
 }
 
 static int
-eb_lookup_vmas(struct eb_vmas *eb,
-	       struct drm_i915_gem_exec_object2 *exec,
-	       const struct drm_i915_gem_execbuffer2 *args,
-	       struct i915_address_space *vm,
-	       struct drm_file *file)
+eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 	struct drm_i915_gem_object *obj;
 	struct list_head objects;
 	int i, ret;
 
 	INIT_LIST_HEAD(&objects);
-	spin_lock(&file->table_lock);
+	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
-	for (i = 0; i < args->buffer_count; i++) {
-		obj = to_intel_bo(idr_find(&file->object_idr, exec[i].handle));
+	for (i = 0; i < eb->args->buffer_count; i++) {
+		obj = to_intel_bo(idr_find(&eb->file->object_idr, eb->exec[i].handle));
 		if (obj == NULL) {
-			spin_unlock(&file->table_lock);
+			spin_unlock(&eb->file->table_lock);
 			DRM_DEBUG("Invalid object handle %d at index %d\n",
-				   exec[i].handle, i);
+				   eb->exec[i].handle, i);
 			ret = -ENOENT;
 			goto err;
 		}
 
 		if (!list_empty(&obj->obj_exec_link)) {
-			spin_unlock(&file->table_lock);
+			spin_unlock(&eb->file->table_lock);
 			DRM_DEBUG("Object %p [handle %d, index %d] appears more than once in object list\n",
-				   obj, exec[i].handle, i);
+				   obj, eb->exec[i].handle, i);
 			ret = -EINVAL;
 			goto err;
 		}
@@ -169,7 +169,7 @@ eb_lookup_vmas(struct eb_vmas *eb,
 		i915_gem_object_get(obj);
 		list_add_tail(&obj->obj_exec_link, &objects);
 	}
-	spin_unlock(&file->table_lock);
+	spin_unlock(&eb->file->table_lock);
 
 	i = 0;
 	while (!list_empty(&objects)) {
@@ -187,7 +187,7 @@ eb_lookup_vmas(struct eb_vmas *eb,
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
-		vma = i915_vma_instance(obj, vm, NULL);
+		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
 			ret = PTR_ERR(vma);
@@ -198,11 +198,13 @@ eb_lookup_vmas(struct eb_vmas *eb,
 		list_add_tail(&vma->exec_list, &eb->vmas);
 		list_del_init(&obj->obj_exec_link);
 
-		vma->exec_entry = &exec[i];
+		vma->exec_entry = &eb->exec[i];
 		if (eb->and < 0) {
 			eb->lut[i] = vma;
 		} else {
-			uint32_t handle = args->flags & I915_EXEC_HANDLE_LUT ? i : exec[i].handle;
+			u32 handle =
+				eb->args->flags & I915_EXEC_HANDLE_LUT ?
+				i : eb->exec[i].handle;
 			vma->exec_handle = handle;
 			hlist_add_head(&vma->exec_node,
 				       &eb->buckets[handle & eb->and]);
@@ -229,7 +231,7 @@ eb_lookup_vmas(struct eb_vmas *eb,
 	return ret;
 }
 
-static struct i915_vma *eb_get_vma(struct eb_vmas *eb, unsigned long handle)
+static struct i915_vma *eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 {
 	if (eb->and < 0) {
 		if (handle >= -eb->and)
@@ -249,7 +251,7 @@ static struct i915_vma *eb_get_vma(struct eb_vmas *eb, unsigned long handle)
 }
 
 static void
-i915_gem_execbuffer_unreserve_vma(struct i915_vma *vma)
+eb_unreserve_vma(struct i915_vma *vma)
 {
 	struct drm_i915_gem_exec_object2 *entry;
 
@@ -267,8 +269,10 @@ i915_gem_execbuffer_unreserve_vma(struct i915_vma *vma)
 	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
 }
 
-static void eb_destroy(struct eb_vmas *eb)
+static void eb_destroy(struct i915_execbuffer *eb)
 {
+	i915_gem_context_put(eb->ctx);
+
 	while (!list_empty(&eb->vmas)) {
 		struct i915_vma *vma;
 
@@ -276,11 +280,10 @@ static void eb_destroy(struct eb_vmas *eb)
 				       struct i915_vma,
 				       exec_list);
 		list_del_init(&vma->exec_list);
-		i915_gem_execbuffer_unreserve_vma(vma);
+		eb_unreserve_vma(vma);
 		vma->exec_entry = NULL;
 		i915_vma_put(vma);
 	}
-	kfree(eb);
 }
 
 static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
@@ -321,20 +324,11 @@ relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
 	return gen8_canonical_addr((int)reloc->delta + target_offset);
 }
 
-struct reloc_cache {
-	struct drm_i915_private *i915;
-	struct drm_mm_node node;
-	unsigned long vaddr;
-	unsigned int page;
-	bool use_64bit_reloc;
-};
-
 static void reloc_cache_init(struct reloc_cache *cache,
 			     struct drm_i915_private *i915)
 {
 	cache->page = -1;
 	cache->vaddr = 0;
-	cache->i915 = i915;
 	/* Must be a variable in the struct to allow GCC to unroll. */
 	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
 	cache->node.allocated = false;
@@ -352,7 +346,14 @@ static inline unsigned int unmask_flags(unsigned long p)
 
 #define KMAP 0x4 /* after CLFLUSH_FLAGS */
 
-static void reloc_cache_fini(struct reloc_cache *cache)
+static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
+{
+	struct drm_i915_private *i915 =
+		container_of(cache, struct i915_execbuffer, reloc_cache)->i915;
+	return &i915->ggtt;
+}
+
+static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
@@ -370,7 +371,7 @@ static void reloc_cache_fini(struct reloc_cache *cache)
 		wmb();
 		io_mapping_unmap_atomic((void __iomem *)vaddr);
 		if (cache->node.allocated) {
-			struct i915_ggtt *ggtt = &cache->i915->ggtt;
+			struct i915_ggtt *ggtt = cache_to_ggtt(cache);
 
 			ggtt->base.clear_range(&ggtt->base,
 					       cache->node.start,
@@ -380,6 +381,9 @@ static void reloc_cache_fini(struct reloc_cache *cache)
 			i915_vma_unpin((struct i915_vma *)cache->node.mm);
 		}
 	}
+
+	cache->vaddr = 0;
+	cache->page = -1;
 }
 
 static void *reloc_kmap(struct drm_i915_gem_object *obj,
@@ -418,7 +422,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
 			 struct reloc_cache *cache,
 			 int page)
 {
-	struct i915_ggtt *ggtt = &cache->i915->ggtt;
+	struct i915_ggtt *ggtt = cache_to_ggtt(cache);
 	unsigned long offset;
 	void *vaddr;
 
@@ -468,7 +472,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
 		offset += page << PAGE_SHIFT;
 	}
 
-	vaddr = (void __force *) io_mapping_map_atomic_wc(&cache->i915->ggtt.mappable, offset);
+	vaddr = (void __force *) io_mapping_map_atomic_wc(&ggtt->mappable, offset);
 	cache->page = page;
 	cache->vaddr = (unsigned long)vaddr;
 
@@ -547,12 +551,10 @@ relocate_entry(struct drm_i915_gem_object *obj,
 }
 
 static int
-i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
-				   struct eb_vmas *eb,
-				   struct drm_i915_gem_relocation_entry *reloc,
-				   struct reloc_cache *cache)
+eb_relocate_entry(struct drm_i915_gem_object *obj,
+		  struct i915_execbuffer *eb,
+		  struct drm_i915_gem_relocation_entry *reloc)
 {
-	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 	struct drm_gem_object *target_obj;
 	struct drm_i915_gem_object *target_i915_obj;
 	struct i915_vma *target_vma;
@@ -571,8 +573,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
 	 * pipe_control writes because the gpu doesn't properly redirect them
 	 * through the ppgtt for non_secure batchbuffers. */
-	if (unlikely(IS_GEN6(dev_priv) &&
-	    reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
+	if (unlikely(IS_GEN6(eb->i915) &&
+		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
 		ret = i915_vma_bind(target_vma, target_i915_obj->cache_level,
 				    PIN_GLOBAL);
 		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
@@ -613,7 +615,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 
 	/* Check that the relocation address is valid... */
 	if (unlikely(reloc->offset >
-		     obj->base.size - (cache->use_64bit_reloc ? 8 : 4))) {
+		     obj->base.size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
 		DRM_DEBUG("Relocation beyond object bounds: "
 			  "obj %p target %d offset %d size %d.\n",
 			  obj, reloc->target_handle,
@@ -629,7 +631,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		return -EINVAL;
 	}
 
-	ret = relocate_entry(obj, reloc, cache, target_offset);
+	ret = relocate_entry(obj, reloc, &eb->reloc_cache, target_offset);
 	if (ret)
 		return ret;
 
@@ -638,19 +640,15 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	return 0;
 }
 
-static int
-i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
-				 struct eb_vmas *eb)
+static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
 	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
 	struct drm_i915_gem_relocation_entry __user *user_relocs;
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	struct reloc_cache cache;
 	int remain, ret = 0;
 
 	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
-	reloc_cache_init(&cache, eb->i915);
 
 	remain = entry->relocation_count;
 	while (remain) {
@@ -679,7 +677,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 		do {
 			u64 offset = r->presumed_offset;
 
-			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, &cache);
+			ret = eb_relocate_entry(vma->obj, eb, r);
 			if (ret)
 				goto out;
 
@@ -711,39 +709,35 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 	}
 
 out:
-	reloc_cache_fini(&cache);
+	reloc_cache_reset(&eb->reloc_cache);
 	return ret;
 #undef N_RELOC
 }
 
 static int
-i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
-				      struct eb_vmas *eb,
-				      struct drm_i915_gem_relocation_entry *relocs)
+eb_relocate_vma_slow(struct i915_vma *vma,
+		     struct i915_execbuffer *eb,
+		     struct drm_i915_gem_relocation_entry *relocs)
 {
 	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	struct reloc_cache cache;
 	int i, ret = 0;
 
-	reloc_cache_init(&cache, eb->i915);
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache);
+		ret = eb_relocate_entry(vma->obj, eb, &relocs[i]);
 		if (ret)
 			break;
 	}
-	reloc_cache_fini(&cache);
-
+	reloc_cache_reset(&eb->reloc_cache);
 	return ret;
 }
 
-static int
-i915_gem_execbuffer_relocate(struct eb_vmas *eb)
+static int eb_relocate(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 	int ret = 0;
 
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		ret = i915_gem_execbuffer_relocate_vma(vma, eb);
+		ret = eb_relocate_vma(vma, eb);
 		if (ret)
 			break;
 	}
@@ -758,9 +752,9 @@ static bool only_mappable_for_reloc(unsigned int flags)
 }
 
 static int
-i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
-				struct intel_engine_cs *engine,
-				bool *need_reloc)
+eb_reserve_vma(struct i915_vma *vma,
+	       struct intel_engine_cs *engine,
+	       bool *need_reloc)
 {
 	struct drm_i915_gem_object *obj = vma->obj;
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
@@ -879,33 +873,26 @@ eb_vma_misplaced(struct i915_vma *vma)
 	return false;
 }
 
-static int
-i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
-			    struct list_head *vmas,
-			    struct i915_gem_context *ctx,
-			    bool *need_relocs)
+static int eb_reserve(struct i915_execbuffer *eb)
 {
+	const bool has_fenced_gpu_access = INTEL_GEN(eb->i915) < 4;
 	struct drm_i915_gem_object *obj;
 	struct i915_vma *vma;
-	struct i915_address_space *vm;
 	struct list_head ordered_vmas;
 	struct list_head pinned_vmas;
-	bool has_fenced_gpu_access = INTEL_GEN(engine->i915) < 4;
 	int retry;
 
-	vm = list_first_entry(vmas, struct i915_vma, exec_list)->vm;
-
 	INIT_LIST_HEAD(&ordered_vmas);
 	INIT_LIST_HEAD(&pinned_vmas);
-	while (!list_empty(vmas)) {
+	while (!list_empty(&eb->vmas)) {
 		struct drm_i915_gem_exec_object2 *entry;
 		bool need_fence, need_mappable;
 
-		vma = list_first_entry(vmas, struct i915_vma, exec_list);
+		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
 		obj = vma->obj;
 		entry = vma->exec_entry;
 
-		if (ctx->flags & CONTEXT_NO_ZEROMAP)
+		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
 			entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
 
 		if (!has_fenced_gpu_access)
@@ -926,8 +913,8 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
 		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
 		obj->base.pending_write_domain = 0;
 	}
-	list_splice(&ordered_vmas, vmas);
-	list_splice(&pinned_vmas, vmas);
+	list_splice(&ordered_vmas, &eb->vmas);
+	list_splice(&pinned_vmas, &eb->vmas);
 
 	/* Attempt to pin all of the buffers into the GTT.
 	 * This is done in 3 phases:
@@ -946,27 +933,24 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
 		int ret = 0;
 
 		/* Unbind any ill-fitting objects or pin. */
-		list_for_each_entry(vma, vmas, exec_list) {
+		list_for_each_entry(vma, &eb->vmas, exec_list) {
 			if (!drm_mm_node_allocated(&vma->node))
 				continue;
 
 			if (eb_vma_misplaced(vma))
 				ret = i915_vma_unbind(vma);
 			else
-				ret = i915_gem_execbuffer_reserve_vma(vma,
-								      engine,
-								      need_relocs);
+				ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
 			if (ret)
 				goto err;
 		}
 
 		/* Bind fresh objects */
-		list_for_each_entry(vma, vmas, exec_list) {
+		list_for_each_entry(vma, &eb->vmas, exec_list) {
 			if (drm_mm_node_allocated(&vma->node))
 				continue;
 
-			ret = i915_gem_execbuffer_reserve_vma(vma, engine,
-							      need_relocs);
+			ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
 			if (ret)
 				goto err;
 		}
@@ -976,39 +960,30 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
 			return ret;
 
 		/* Decrement pin count for bound objects */
-		list_for_each_entry(vma, vmas, exec_list)
-			i915_gem_execbuffer_unreserve_vma(vma);
+		list_for_each_entry(vma, &eb->vmas, exec_list)
+			eb_unreserve_vma(vma);
 
-		ret = i915_gem_evict_vm(vm, true);
+		ret = i915_gem_evict_vm(eb->vm, true);
 		if (ret)
 			return ret;
 	} while (1);
 }
 
 static int
-i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
-				  struct drm_i915_gem_execbuffer2 *args,
-				  struct drm_file *file,
-				  struct intel_engine_cs *engine,
-				  struct eb_vmas *eb,
-				  struct drm_i915_gem_exec_object2 *exec,
-				  struct i915_gem_context *ctx)
+eb_relocate_slow(struct i915_execbuffer *eb)
 {
+	const unsigned int count = eb->args->buffer_count;
+	struct drm_device *dev = &eb->i915->drm;
 	struct drm_i915_gem_relocation_entry *reloc;
-	struct i915_address_space *vm;
 	struct i915_vma *vma;
-	bool need_relocs;
 	int *reloc_offset;
 	int i, total, ret;
-	unsigned count = args->buffer_count;
-
-	vm = list_first_entry(&eb->vmas, struct i915_vma, exec_list)->vm;
 
 	/* We may process another execbuffer during the unlock... */
 	while (!list_empty(&eb->vmas)) {
 		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
 		list_del_init(&vma->exec_list);
-		i915_gem_execbuffer_unreserve_vma(vma);
+		eb_unreserve_vma(vma);
 		i915_vma_put(vma);
 	}
 
@@ -1016,7 +991,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 
 	total = 0;
 	for (i = 0; i < count; i++)
-		total += exec[i].relocation_count;
+		total += eb->exec[i].relocation_count;
 
 	reloc_offset = drm_malloc_ab(count, sizeof(*reloc_offset));
 	reloc = drm_malloc_ab(total, sizeof(*reloc));
@@ -1033,10 +1008,10 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		u64 invalid_offset = (u64)-1;
 		int j;
 
-		user_relocs = u64_to_user_ptr(exec[i].relocs_ptr);
+		user_relocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
 
 		if (copy_from_user(reloc+total, user_relocs,
-				   exec[i].relocation_count * sizeof(*reloc))) {
+				   eb->exec[i].relocation_count * sizeof(*reloc))) {
 			ret = -EFAULT;
 			mutex_lock(&dev->struct_mutex);
 			goto err;
@@ -1051,7 +1026,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		 * happened we would make the mistake of assuming that the
 		 * relocations were valid.
 		 */
-		for (j = 0; j < exec[i].relocation_count; j++) {
+		for (j = 0; j < eb->exec[i].relocation_count; j++) {
 			if (__copy_to_user(&user_relocs[j].presumed_offset,
 					   &invalid_offset,
 					   sizeof(invalid_offset))) {
@@ -1062,7 +1037,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		}
 
 		reloc_offset[i] = total;
-		total += exec[i].relocation_count;
+		total += eb->exec[i].relocation_count;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
@@ -1073,20 +1048,18 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 
 	/* reacquire the objects */
 	eb_reset(eb);
-	ret = eb_lookup_vmas(eb, exec, args, vm, file);
+	ret = eb_lookup_vmas(eb);
 	if (ret)
 		goto err;
 
-	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
-	ret = i915_gem_execbuffer_reserve(engine, &eb->vmas, ctx,
-					  &need_relocs);
+	ret = eb_reserve(eb);
 	if (ret)
 		goto err;
 
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		int offset = vma->exec_entry - exec;
-		ret = i915_gem_execbuffer_relocate_vma_slow(vma, eb,
-							    reloc + reloc_offset[offset]);
+		int idx = vma->exec_entry - eb->exec;
+
+		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[idx]);
 		if (ret)
 			goto err;
 	}
@@ -1104,13 +1077,12 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 }
 
 static int
-i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
-				struct list_head *vmas)
+eb_move_to_gpu(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 	int ret;
 
-	list_for_each_entry(vma, vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
@@ -1120,9 +1092,9 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 			if (unlikely(!capture))
 				return -ENOMEM;
 
-			capture->next = req->capture_list;
+			capture->next = eb->request->capture_list;
 			capture->vma = vma;
-			req->capture_list = capture;
+			eb->request->capture_list = capture;
 		}
 
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
@@ -1134,16 +1106,16 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		}
 
 		ret = i915_gem_request_await_object
-			(req, obj, obj->base.pending_write_domain);
+			(eb->request, obj, obj->base.pending_write_domain);
 		if (ret)
 			return ret;
 	}
 
 	/* Unconditionally flush any chipset caches (for streaming writes). */
-	i915_gem_chipset_flush(req->engine->i915);
+	i915_gem_chipset_flush(eb->i915);
 
 	/* Unconditionally invalidate GPU caches and TLBs. */
-	return req->engine->emit_flush(req, EMIT_INVALIDATE);
+	return eb->engine->emit_flush(eb->request, EMIT_INVALIDATE);
 }
 
 static bool
@@ -1246,22 +1218,24 @@ validate_exec_list(struct drm_device *dev,
 	return 0;
 }
 
-static struct i915_gem_context *
-i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
-			  struct intel_engine_cs *engine, const u32 ctx_id)
+static int eb_select_context(struct i915_execbuffer *eb)
 {
+	unsigned int ctx_id = i915_execbuffer2_get_context_id(*eb->args);
 	struct i915_gem_context *ctx;
 
-	ctx = i915_gem_context_lookup(file->driver_priv, ctx_id);
-	if (IS_ERR(ctx))
-		return ctx;
+	ctx = i915_gem_context_lookup(eb->file->driver_priv, ctx_id);
+	if (unlikely(IS_ERR(ctx)))
+		return PTR_ERR(ctx);
 
-	if (i915_gem_context_is_banned(ctx)) {
+	if (unlikely(i915_gem_context_is_banned(ctx))) {
 		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
-		return ERR_PTR(-EIO);
+		return -EIO;
 	}
 
-	return ctx;
+	eb->ctx = i915_gem_context_get(ctx);
+	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
+
+	return 0;
 }
 
 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
@@ -1326,12 +1300,11 @@ static void eb_export_fence(struct drm_i915_gem_object *obj,
 }
 
 static void
-i915_gem_execbuffer_move_to_active(struct list_head *vmas,
-				   struct drm_i915_gem_request *req)
+eb_move_to_active(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		obj->base.write_domain = obj->base.pending_write_domain;
@@ -1341,8 +1314,8 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 			obj->base.pending_read_domains |= obj->base.read_domains;
 		obj->base.read_domains = obj->base.pending_read_domains;
 
-		i915_vma_move_to_active(vma, req, vma->exec_entry->flags);
-		eb_export_fence(obj, req, vma->exec_entry->flags);
+		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
+		eb_export_fence(obj, eb->request, vma->exec_entry->flags);
 	}
 }
 
@@ -1372,29 +1345,22 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 	return 0;
 }
 
-static struct i915_vma *
-i915_gem_execbuffer_parse(struct intel_engine_cs *engine,
-			  struct drm_i915_gem_exec_object2 *shadow_exec_entry,
-			  struct drm_i915_gem_object *batch_obj,
-			  struct eb_vmas *eb,
-			  u32 batch_start_offset,
-			  u32 batch_len,
-			  bool is_master)
+static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 {
 	struct drm_i915_gem_object *shadow_batch_obj;
 	struct i915_vma *vma;
 	int ret;
 
-	shadow_batch_obj = i915_gem_batch_pool_get(&engine->batch_pool,
-						   PAGE_ALIGN(batch_len));
+	shadow_batch_obj = i915_gem_batch_pool_get(&eb->engine->batch_pool,
+						   PAGE_ALIGN(eb->batch_len));
 	if (IS_ERR(shadow_batch_obj))
 		return ERR_CAST(shadow_batch_obj);
 
-	ret = intel_engine_cmd_parser(engine,
-				      batch_obj,
+	ret = intel_engine_cmd_parser(eb->engine,
+				      eb->batch->obj,
 				      shadow_batch_obj,
-				      batch_start_offset,
-				      batch_len,
+				      eb->batch_start_offset,
+				      eb->batch_len,
 				      is_master);
 	if (ret) {
 		if (ret == -EACCES) /* unhandled chained batch */
@@ -1408,9 +1374,8 @@ i915_gem_execbuffer_parse(struct intel_engine_cs *engine,
 	if (IS_ERR(vma))
 		goto out;
 
-	memset(shadow_exec_entry, 0, sizeof(*shadow_exec_entry));
-
-	vma->exec_entry = shadow_exec_entry;
+	vma->exec_entry =
+		memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
 	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
 	i915_gem_object_get(shadow_batch_obj);
 	list_add_tail(&vma->exec_list, &eb->vmas);
@@ -1429,49 +1394,45 @@ add_to_client(struct drm_i915_gem_request *req,
 }
 
 static int
-execbuf_submit(struct i915_execbuffer_params *params,
-	       struct drm_i915_gem_execbuffer2 *args,
-	       struct list_head *vmas)
+execbuf_submit(struct i915_execbuffer *eb)
 {
-	struct drm_i915_private *dev_priv = params->request->i915;
-	u64 exec_start, exec_len;
 	int instp_mode;
 	u32 instp_mask, *cs;
 	int ret;
 
-	ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas);
+	ret = eb_move_to_gpu(eb);
 	if (ret)
 		return ret;
 
-	ret = i915_switch_context(params->request);
+	ret = i915_switch_context(eb->request);
 	if (ret)
 		return ret;
 
-	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
+	instp_mode = eb->args->flags & I915_EXEC_CONSTANTS_MASK;
 	instp_mask = I915_EXEC_CONSTANTS_MASK;
 	switch (instp_mode) {
 	case I915_EXEC_CONSTANTS_REL_GENERAL:
 	case I915_EXEC_CONSTANTS_ABSOLUTE:
 	case I915_EXEC_CONSTANTS_REL_SURFACE:
-		if (instp_mode != 0 && params->engine->id != RCS) {
+		if (instp_mode != 0 && eb->engine->id != RCS) {
 			DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
 			return -EINVAL;
 		}
 
-		if (instp_mode != dev_priv->relative_constants_mode) {
-			if (INTEL_INFO(dev_priv)->gen < 4) {
+		if (instp_mode != eb->i915->relative_constants_mode) {
+			if (INTEL_INFO(eb->i915)->gen < 4) {
 				DRM_DEBUG("no rel constants on pre-gen4\n");
 				return -EINVAL;
 			}
 
-			if (INTEL_INFO(dev_priv)->gen > 5 &&
+			if (INTEL_INFO(eb->i915)->gen > 5 &&
 			    instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
 				DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
 				return -EINVAL;
 			}
 
 			/* The HW changed the meaning on this bit on gen6 */
-			if (INTEL_INFO(dev_priv)->gen >= 6)
+			if (INTEL_INFO(eb->i915)->gen >= 6)
 				instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
 		}
 		break;
@@ -1480,9 +1441,9 @@ execbuf_submit(struct i915_execbuffer_params *params,
 		return -EINVAL;
 	}
 
-	if (params->engine->id == RCS &&
-	    instp_mode != dev_priv->relative_constants_mode) {
-		cs = intel_ring_begin(params->request, 4);
+	if (eb->engine->id == RCS &&
+	    instp_mode != eb->i915->relative_constants_mode) {
+		cs = intel_ring_begin(eb->request, 4);
 		if (IS_ERR(cs))
 			return PTR_ERR(cs);
 
@@ -1490,32 +1451,27 @@ execbuf_submit(struct i915_execbuffer_params *params,
 		*cs++ = MI_LOAD_REGISTER_IMM(1);
 		*cs++ = i915_mmio_reg_offset(INSTPM);
 		*cs++ = instp_mask << 16 | instp_mode;
-		intel_ring_advance(params->request, cs);
+		intel_ring_advance(eb->request, cs);
 
-		dev_priv->relative_constants_mode = instp_mode;
+		eb->i915->relative_constants_mode = instp_mode;
 	}
 
-	if (args->flags & I915_EXEC_GEN7_SOL_RESET) {
-		ret = i915_reset_gen7_sol_offsets(params->request);
+	if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
+		ret = i915_reset_gen7_sol_offsets(eb->request);
 		if (ret)
 			return ret;
 	}
 
-	exec_len   = args->batch_len;
-	exec_start = params->batch->node.start +
-		     params->args_batch_start_offset;
-
-	if (exec_len == 0)
-		exec_len = params->batch->size - params->args_batch_start_offset;
-
-	ret = params->engine->emit_bb_start(params->request,
-					    exec_start, exec_len,
-					    params->dispatch_flags);
+	ret = eb->engine->emit_bb_start(eb->request,
+					eb->batch->node.start +
+					eb->batch_start_offset,
+					eb->batch_len,
+					eb->dispatch_flags);
 	if (ret)
 		return ret;
 
-	i915_gem_execbuffer_move_to_active(vmas, params->request);
-	add_to_client(params->request, params->file);
+	eb_move_to_active(eb);
+	add_to_client(eb->request, eb->file);
 
 	return 0;
 }
@@ -1640,27 +1596,16 @@ dma_buf_get_fence(int fd, unsigned int flags)
 }
 
 static int
-i915_gem_do_execbuffer(struct drm_device *dev, void *data,
+i915_gem_do_execbuffer(struct drm_device *dev,
 		       struct drm_file *file,
 		       struct drm_i915_gem_execbuffer2 *args,
 		       struct drm_i915_gem_exec_object2 *exec)
 {
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	struct i915_ggtt *ggtt = &dev_priv->ggtt;
-	struct eb_vmas *eb;
-	struct drm_i915_gem_exec_object2 shadow_exec_entry;
-	struct intel_engine_cs *engine;
-	struct i915_gem_context *ctx;
-	struct i915_address_space *vm;
-	struct i915_execbuffer_params params_master; /* XXX: will be removed later */
-	struct i915_execbuffer_params *params = &params_master;
-	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
-	u32 dispatch_flags;
+	struct i915_execbuffer eb;
 	struct dma_fence *in_fence = NULL;
 	struct sync_file *out_fence = NULL;
 	int out_fence_fd = -1;
 	int ret;
-	bool need_relocs;
 
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
@@ -1669,37 +1614,42 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		return ret;
 
-	dispatch_flags = 0;
+	eb.i915 = to_i915(dev);
+	eb.file = file;
+	eb.args = args;
+	eb.exec = exec;
+	eb.need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+	reloc_cache_init(&eb.reloc_cache, eb.i915);
+
+	eb.batch_start_offset = args->batch_start_offset;
+	eb.batch_len = args->batch_len;
+
+	eb.dispatch_flags = 0;
 	if (args->flags & I915_EXEC_SECURE) {
 		if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN))
 		    return -EPERM;
 
-		dispatch_flags |= I915_DISPATCH_SECURE;
+		eb.dispatch_flags |= I915_DISPATCH_SECURE;
 	}
 	if (args->flags & I915_EXEC_IS_PINNED)
-		dispatch_flags |= I915_DISPATCH_PINNED;
+		eb.dispatch_flags |= I915_DISPATCH_PINNED;
 
-	engine = eb_select_engine(dev_priv, file, args);
-	if (!engine)
+	eb.engine = eb_select_engine(eb.i915, file, args);
+	if (!eb.engine)
 		return -EINVAL;
 
-	if (args->buffer_count < 1) {
-		DRM_DEBUG("execbuf with %d buffers\n", args->buffer_count);
-		return -EINVAL;
-	}
-
 	if (args->flags & I915_EXEC_RESOURCE_STREAMER) {
-		if (!HAS_RESOURCE_STREAMER(dev_priv)) {
+		if (!HAS_RESOURCE_STREAMER(eb.i915)) {
 			DRM_DEBUG("RS is only allowed for Haswell, Gen8 and above\n");
 			return -EINVAL;
 		}
-		if (engine->id != RCS) {
+		if (eb.engine->id != RCS) {
 			DRM_DEBUG("RS is not available on %s\n",
-				 engine->name);
+				 eb.engine->name);
 			return -EINVAL;
 		}
 
-		dispatch_flags |= I915_DISPATCH_RS;
+		eb.dispatch_flags |= I915_DISPATCH_RS;
 	}
 
 	if (args->flags & I915_EXEC_FENCE_IN) {
@@ -1734,59 +1684,44 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * wakeref that we hold until the GPU has been idle for at least
 	 * 100ms.
 	 */
-	intel_runtime_pm_get(dev_priv);
+	intel_runtime_pm_get(eb.i915);
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		goto pre_mutex_err;
 
-	ctx = i915_gem_validate_context(dev, file, engine, ctx_id);
-	if (IS_ERR(ctx)) {
+	ret = eb_select_context(&eb);
+	if (ret) {
 		mutex_unlock(&dev->struct_mutex);
-		ret = PTR_ERR(ctx);
 		goto pre_mutex_err;
 	}
 
-	i915_gem_context_get(ctx);
-
-	if (ctx->ppgtt)
-		vm = &ctx->ppgtt->base;
-	else
-		vm = &ggtt->base;
-
-	memset(&params_master, 0x00, sizeof(params_master));
-
-	eb = eb_create(dev_priv, args);
-	if (eb == NULL) {
-		i915_gem_context_put(ctx);
+	if (eb_create(&eb)) {
+		i915_gem_context_put(eb.ctx);
 		mutex_unlock(&dev->struct_mutex);
 		ret = -ENOMEM;
 		goto pre_mutex_err;
 	}
 
 	/* Look up object handles */
-	ret = eb_lookup_vmas(eb, exec, args, vm, file);
+	ret = eb_lookup_vmas(&eb);
 	if (ret)
 		goto err;
 
 	/* take note of the batch buffer before we might reorder the lists */
-	params->batch = eb_get_batch(eb);
+	eb.batch = eb_get_batch(&eb);
 
 	/* Move the objects en-masse into the GTT, evicting if necessary. */
-	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
-	ret = i915_gem_execbuffer_reserve(engine, &eb->vmas, ctx,
-					  &need_relocs);
+	ret = eb_reserve(&eb);
 	if (ret)
 		goto err;
 
 	/* The objects are in their final locations, apply the relocations. */
-	if (need_relocs)
-		ret = i915_gem_execbuffer_relocate(eb);
+	if (eb.need_relocs)
+		ret = eb_relocate(&eb);
 	if (ret) {
 		if (ret == -EFAULT) {
-			ret = i915_gem_execbuffer_relocate_slow(dev, args, file,
-								engine,
-								eb, exec, ctx);
+			ret = eb_relocate_slow(&eb);
 			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 		}
 		if (ret)
@@ -1794,28 +1729,22 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	}
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (params->batch->obj->base.pending_write_domain) {
+	if (eb.batch->obj->base.pending_write_domain) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
 		goto err;
 	}
-	if (args->batch_start_offset > params->batch->size ||
-	    args->batch_len > params->batch->size - args->batch_start_offset) {
+	if (eb.batch_start_offset > eb.batch->size ||
+	    eb.batch_len > eb.batch->size - eb.batch_start_offset) {
 		DRM_DEBUG("Attempting to use out-of-bounds batch\n");
 		ret = -EINVAL;
 		goto err;
 	}
 
-	params->args_batch_start_offset = args->batch_start_offset;
-	if (engine->needs_cmd_parser && args->batch_len) {
+	if (eb.engine->needs_cmd_parser && eb.batch_len) {
 		struct i915_vma *vma;
 
-		vma = i915_gem_execbuffer_parse(engine, &shadow_exec_entry,
-						params->batch->obj,
-						eb,
-						args->batch_start_offset,
-						args->batch_len,
-						drm_is_current_master(file));
+		vma = eb_parse(&eb, drm_is_current_master(file));
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
 			goto err;
@@ -1831,19 +1760,21 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 			 * specifically don't want that set on batches the
 			 * command parser has accepted.
 			 */
-			dispatch_flags |= I915_DISPATCH_SECURE;
-			params->args_batch_start_offset = 0;
-			params->batch = vma;
+			eb.dispatch_flags |= I915_DISPATCH_SECURE;
+			eb.batch_start_offset = 0;
+			eb.batch = vma;
 		}
 	}
 
-	params->batch->obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
+	eb.batch->obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
+	if (eb.batch_len == 0)
+		eb.batch_len = eb.batch->size - eb.batch_start_offset;
 
 	/* snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
 	 * batch" bit. Hence we need to pin secure batches into the global gtt.
 	 * hsw should have this fixed, but bdw mucks it up again. */
-	if (dispatch_flags & I915_DISPATCH_SECURE) {
-		struct drm_i915_gem_object *obj = params->batch->obj;
+	if (eb.dispatch_flags & I915_DISPATCH_SECURE) {
+		struct drm_i915_gem_object *obj = eb.batch->obj;
 		struct i915_vma *vma;
 
 		/*
@@ -1862,25 +1793,24 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 			goto err;
 		}
 
-		params->batch = vma;
+		eb.batch = vma;
 	}
 
 	/* Allocate a request for this batch buffer nice and early. */
-	params->request = i915_gem_request_alloc(engine, ctx);
-	if (IS_ERR(params->request)) {
-		ret = PTR_ERR(params->request);
+	eb.request = i915_gem_request_alloc(eb.engine, eb.ctx);
+	if (IS_ERR(eb.request)) {
+		ret = PTR_ERR(eb.request);
 		goto err_batch_unpin;
 	}
 
 	if (in_fence) {
-		ret = i915_gem_request_await_dma_fence(params->request,
-						       in_fence);
+		ret = i915_gem_request_await_dma_fence(eb.request, in_fence);
 		if (ret < 0)
 			goto err_request;
 	}
 
 	if (out_fence_fd != -1) {
-		out_fence = sync_file_create(&params->request->fence);
+		out_fence = sync_file_create(&eb.request->fence);
 		if (!out_fence) {
 			ret = -ENOMEM;
 			goto err_request;
@@ -1893,25 +1823,13 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * inactive_list and lose its active reference. Hence we do not need
 	 * to explicitly hold another reference here.
 	 */
-	params->request->batch = params->batch;
-
-	/*
-	 * Save assorted stuff away to pass through to *_submission().
-	 * NB: This data should be 'persistent' and not local as it will
-	 * kept around beyond the duration of the IOCTL once the GPU
-	 * scheduler arrives.
-	 */
-	params->dev                     = dev;
-	params->file                    = file;
-	params->engine                    = engine;
-	params->dispatch_flags          = dispatch_flags;
-	params->ctx                     = ctx;
-
-	trace_i915_gem_request_queue(params->request, dispatch_flags);
+	eb.request->batch = eb.batch;
 
-	ret = execbuf_submit(params, args, &eb->vmas);
+	trace_i915_gem_request_queue(eb.request, eb.dispatch_flags);
+	ret = execbuf_submit(&eb);
 err_request:
-	__i915_add_request(params->request, ret == 0);
+	__i915_add_request(eb.request, ret == 0);
+
 	if (out_fence) {
 		if (ret == 0) {
 			fd_install(out_fence_fd, out_fence->file);
@@ -1930,19 +1848,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * needs to be adjusted to also track the ggtt batch vma properly as
 	 * active.
 	 */
-	if (dispatch_flags & I915_DISPATCH_SECURE)
-		i915_vma_unpin(params->batch);
+	if (eb.dispatch_flags & I915_DISPATCH_SECURE)
+		i915_vma_unpin(eb.batch);
 err:
 	/* the request owns the ref now */
-	i915_gem_context_put(ctx);
-	eb_destroy(eb);
-
+	eb_destroy(&eb);
 	mutex_unlock(&dev->struct_mutex);
 
 pre_mutex_err:
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
-	intel_runtime_pm_put(dev_priv);
+	intel_runtime_pm_put(eb.i915);
 	if (out_fence_fd != -1)
 		put_unused_fd(out_fence_fd);
 err_in_fence:
@@ -2013,7 +1929,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 	exec2.flags = I915_EXEC_RENDER;
 	i915_execbuffer2_set_context_id(exec2, 0);
 
-	ret = i915_gem_do_execbuffer(dev, data, file, &exec2, exec2_list);
+	ret = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list);
 	if (!ret) {
 		struct drm_i915_gem_exec_object __user *user_exec_list =
 			u64_to_user_ptr(args->buffers_ptr);
@@ -2072,7 +1988,7 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EFAULT;
 	}
 
-	ret = i915_gem_do_execbuffer(dev, data, file, args, exec2_list);
+	ret = i915_gem_do_execbuffer(dev, file, args, exec2_list);
 	if (!ret) {
 		/* Copy the new buffer offsets back to the user's exec list. */
 		struct drm_i915_gem_exec_object2 __user *user_exec_list =
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 05/15] drm/i915: Use vma->exec_entry as our double-entry placeholder
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (3 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 04/15] drm/i915: Amalgamate execbuffer parameter structures Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 06/15] drm/i915: Split vma exec_link/evict_link Chris Wilson
                   ` (9 subsequent siblings)
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

This has the benefit of not requiring us to manipulate the
vma->exec_link list when tearing down the execbuffer, and provides a
marginally cheaper test for detecting the user error.
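
To illustrate the idea outside the driver, a minimal userspace sketch
(the struct and function names below are invented for the example and
are not the driver's): membership is marked by a back-pointer, so both
the duplicate test and teardown become a NULL check rather than list
surgery.

#include <stdio.h>

struct exec_entry { unsigned int handle; };

struct object {
	unsigned int handle;
	struct exec_entry *exec_entry; /* NULL when not part of an execbuf */
};

/* Returns 0 on success, -1 if the object was already added (user error). */
static int add_to_execbuf(struct object *obj, struct exec_entry *entry)
{
	if (obj->exec_entry) /* cheap duplicate test, no list walk needed */
		return -1;

	obj->exec_entry = entry;
	return 0;
}

static void release_from_execbuf(struct object *obj)
{
	obj->exec_entry = NULL; /* teardown is just clearing the marker */
}

int main(void)
{
	struct exec_entry e = { .handle = 1 };
	struct object obj = { .handle = 1, .exec_entry = NULL };

	printf("first add: %d\n", add_to_execbuf(&obj, &e));  /* 0 */
	printf("second add: %d\n", add_to_execbuf(&obj, &e)); /* -1 */
	release_from_execbuf(&obj);
	return 0;
}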

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_evict.c      | 17 ++-----
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 77 ++++++++++++++++--------------
 drivers/gpu/drm/i915/i915_vma.c            |  1 -
 3 files changed, 44 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index a0de5734f7d0..4753c3f46f7e 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -59,9 +59,6 @@ mark_free(struct drm_mm_scan *scan,
 	if (i915_vma_is_pinned(vma))
 		return false;
 
-	if (WARN_ON(!list_empty(&vma->exec_list)))
-		return false;
-
 	if (flags & PIN_NONFAULT && !list_empty(&vma->obj->userfault_link))
 		return false;
 
@@ -160,8 +157,6 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
 		ret = drm_mm_scan_remove_block(&scan, &vma->node);
 		BUG_ON(ret);
-
-		INIT_LIST_HEAD(&vma->exec_list);
 	}
 
 	/* Can we unpin some objects such as idle hw contents,
@@ -210,17 +205,12 @@ i915_gem_evict_something(struct i915_address_space *vm,
 		if (drm_mm_scan_remove_block(&scan, &vma->node))
 			__i915_vma_pin(vma);
 		else
-			list_del_init(&vma->exec_list);
+			list_del(&vma->exec_list);
 	}
 
 	/* Unbinding will emit any required flushes */
 	ret = 0;
-	while (!list_empty(&eviction_list)) {
-		vma = list_first_entry(&eviction_list,
-				       struct i915_vma,
-				       exec_list);
-
-		list_del_init(&vma->exec_list);
+	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
@@ -316,7 +306,7 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 		}
 
 		/* Overlap of objects in the same batch? */
-		if (i915_vma_is_pinned(vma) || !list_empty(&vma->exec_list)) {
+		if (i915_vma_is_pinned(vma)) {
 			ret = -ENOSPC;
 			if (vma->exec_entry &&
 			    vma->exec_entry->flags & EXEC_OBJECT_PINNED)
@@ -337,7 +327,6 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 	}
 
 	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
-		list_del_init(&vma->exec_list);
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0f6c7acbc063..9c1dacabe7ef 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -106,13 +106,40 @@ eb_create(struct i915_execbuffer *eb)
 	} else
 		eb->and = -eb->args->buffer_count;
 
-	INIT_LIST_HEAD(&eb->vmas);
 	return 0;
 }
 
+static inline void
+__eb_unreserve_vma(struct i915_vma *vma,
+		   const struct drm_i915_gem_exec_object2 *entry)
+{
+	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
+		i915_vma_unpin_fence(vma);
+
+	if (entry->flags & __EXEC_OBJECT_HAS_PIN)
+		__i915_vma_unpin(vma);
+}
+
+static void
+eb_unreserve_vma(struct i915_vma *vma)
+{
+	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+
+	__eb_unreserve_vma(vma, entry);
+	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
+}
+
 static void
 eb_reset(struct i915_execbuffer *eb)
 {
+	struct i915_vma *vma;
+
+	list_for_each_entry(vma, &eb->vmas, exec_list) {
+		eb_unreserve_vma(vma);
+		i915_vma_put(vma);
+		vma->exec_entry = NULL;
+	}
+
 	if (eb->and >= 0)
 		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
 }
@@ -144,6 +171,8 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 	struct list_head objects;
 	int i, ret;
 
+	INIT_LIST_HEAD(&eb->vmas);
+
 	INIT_LIST_HEAD(&objects);
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
@@ -250,40 +279,23 @@ static struct i915_vma *eb_get_vma(struct i915_execbuffer *eb, unsigned long han
 	}
 }
 
-static void
-eb_unreserve_vma(struct i915_vma *vma)
-{
-	struct drm_i915_gem_exec_object2 *entry;
-
-	if (!drm_mm_node_allocated(&vma->node))
-		return;
-
-	entry = vma->exec_entry;
-
-	if (entry->flags & __EXEC_OBJECT_HAS_FENCE)
-		i915_vma_unpin_fence(vma);
-
-	if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-		__i915_vma_unpin(vma);
-
-	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
-}
-
 static void eb_destroy(struct i915_execbuffer *eb)
 {
-	i915_gem_context_put(eb->ctx);
+	struct i915_vma *vma;
 
-	while (!list_empty(&eb->vmas)) {
-		struct i915_vma *vma;
+	list_for_each_entry(vma, &eb->vmas, exec_list) {
+		if (!vma->exec_entry)
+			continue;
 
-		vma = list_first_entry(&eb->vmas,
-				       struct i915_vma,
-				       exec_list);
-		list_del_init(&vma->exec_list);
-		eb_unreserve_vma(vma);
+		__eb_unreserve_vma(vma, vma->exec_entry);
 		vma->exec_entry = NULL;
 		i915_vma_put(vma);
 	}
+
+	i915_gem_context_put(eb->ctx);
+
+	if (eb->buckets)
+		kfree(eb->buckets);
 }
 
 static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
@@ -980,13 +992,7 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	int i, total, ret;
 
 	/* We may process another execbuffer during the unlock... */
-	while (!list_empty(&eb->vmas)) {
-		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
-		list_del_init(&vma->exec_list);
-		eb_unreserve_vma(vma);
-		i915_vma_put(vma);
-	}
-
+	eb_reset(eb);
 	mutex_unlock(&dev->struct_mutex);
 
 	total = 0;
@@ -1047,7 +1053,6 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	}
 
 	/* reacquire the objects */
-	eb_reset(eb);
 	ret = eb_lookup_vmas(eb);
 	if (ret)
 		goto err;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index c1abfe7b48ea..fab3fa2062c5 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -85,7 +85,6 @@ vma_create(struct drm_i915_gem_object *obj,
 	if (vma == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&vma->exec_list);
 	for (i = 0; i < ARRAY_SIZE(vma->last_read); i++)
 		init_request_active(&vma->last_read[i], i915_vma_retire);
 	init_request_active(&vma->last_fence, NULL);
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 06/15] drm/i915: Split vma exec_link/evict_link
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (4 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 05/15] drm/i915: Use vma->exec_entry as our double-entry placeholder Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-24 12:20   ` Mika Kuoppala
  2017-02-23 16:18 ` [PATCH 07/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf Chris Wilson
                   ` (8 subsequent siblings)
  14 siblings, 1 reply; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

Currently the vma has one link member that is used both for holding its
place in the execbuf reservation list and for its place on any eviction
list. This dual use is quite tricky and error prone.
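
For readers outside the driver, a small userspace sketch of why two
independent link members help (the list implementation below is a
hand-rolled stand-in for the kernel's list_head and the struct is
invented for the example):

#include <stdio.h>
#include <stddef.h>

/* Minimal intrusive list, standing in for the kernel's struct list_head. */
struct node { struct node *next, *prev; };

static void list_init(struct node *h) { h->next = h->prev = h; }

static void list_add_tail(struct node *n, struct node *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/*
 * One object, two independent link members: it can sit on the execbuf
 * reservation list and on an eviction list at the same time. With a
 * single shared link member, the second list_add_tail() would clobber
 * the first list's linkage.
 */
struct fake_vma {
	int id;
	struct node exec_link;  /* place in the execbuf reservation list */
	struct node evict_link; /* place in an eviction list */
};

int main(void)
{
	struct node exec_list, evict_list, *it;
	struct fake_vma v = { .id = 42 };

	list_init(&exec_list);
	list_init(&evict_list);

	list_add_tail(&v.exec_link, &exec_list);
	list_add_tail(&v.evict_link, &evict_list);

	for (it = exec_list.next; it != &exec_list; it = it->next)
		printf("exec list: vma %d\n",
		       container_of(it, struct fake_vma, exec_link)->id);
	for (it = evict_list.next; it != &evict_list; it = it->next)
		printf("evict list: vma %d\n",
		       container_of(it, struct fake_vma, evict_link)->id);
	return 0;
}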

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_evict.c      | 14 ++++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 32 +++++++++++++++---------------
 drivers/gpu/drm/i915/i915_vma.h            |  7 +++++--
 3 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 4753c3f46f7e..2a6eb2ceff79 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -62,7 +62,7 @@ mark_free(struct drm_mm_scan *scan,
 	if (flags & PIN_NONFAULT && !list_empty(&vma->obj->userfault_link))
 		return false;
 
-	list_add(&vma->exec_list, unwind);
+	list_add(&vma->evict_link, unwind);
 	return drm_mm_scan_add_block(scan, &vma->node);
 }
 
@@ -154,7 +154,7 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	} while (*++phase);
 
 	/* Nothing found, clean up and bail out! */
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		ret = drm_mm_scan_remove_block(&scan, &vma->node);
 		BUG_ON(ret);
 	}
@@ -201,16 +201,16 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	 * calling unbind (which may remove the active reference
 	 * of any of our objects, thus corrupting the list).
 	 */
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		if (drm_mm_scan_remove_block(&scan, &vma->node))
 			__i915_vma_pin(vma);
 		else
-			list_del(&vma->exec_list);
+			list_del(&vma->evict_link);
 	}
 
 	/* Unbinding will emit any required flushes */
 	ret = 0;
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
@@ -323,10 +323,10 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 		 * reference) another in our eviction list.
 		 */
 		__i915_vma_pin(vma);
-		list_add(&vma->exec_list, &eviction_list);
+		list_add(&vma->evict_link, &eviction_list);
 	}
 
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 9c1dacabe7ef..c229d69b8757 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -134,7 +134,7 @@ eb_reset(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		eb_unreserve_vma(vma);
 		i915_vma_put(vma);
 		vma->exec_entry = NULL;
@@ -147,7 +147,7 @@ eb_reset(struct i915_execbuffer *eb)
 static struct i915_vma *
 eb_get_batch(struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_list);
+	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_link);
 
 	/*
 	 * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -224,7 +224,7 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 		}
 
 		/* Transfer ownership from the objects list to the vmas list. */
-		list_add_tail(&vma->exec_list, &eb->vmas);
+		list_add_tail(&vma->exec_link, &eb->vmas);
 		list_del_init(&obj->obj_exec_link);
 
 		vma->exec_entry = &eb->exec[i];
@@ -283,7 +283,7 @@ static void eb_destroy(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		if (!vma->exec_entry)
 			continue;
 
@@ -748,7 +748,7 @@ static int eb_relocate(struct i915_execbuffer *eb)
 	struct i915_vma *vma;
 	int ret = 0;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		ret = eb_relocate_vma(vma, eb);
 		if (ret)
 			break;
@@ -900,7 +900,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		struct drm_i915_gem_exec_object2 *entry;
 		bool need_fence, need_mappable;
 
-		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
+		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_link);
 		obj = vma->obj;
 		entry = vma->exec_entry;
 
@@ -915,12 +915,12 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		need_mappable = need_fence || need_reloc_mappable(vma);
 
 		if (entry->flags & EXEC_OBJECT_PINNED)
-			list_move_tail(&vma->exec_list, &pinned_vmas);
+			list_move_tail(&vma->exec_link, &pinned_vmas);
 		else if (need_mappable) {
 			entry->flags |= __EXEC_OBJECT_NEEDS_MAP;
-			list_move(&vma->exec_list, &ordered_vmas);
+			list_move(&vma->exec_link, &ordered_vmas);
 		} else
-			list_move_tail(&vma->exec_list, &ordered_vmas);
+			list_move_tail(&vma->exec_link, &ordered_vmas);
 
 		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
 		obj->base.pending_write_domain = 0;
@@ -945,7 +945,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		int ret = 0;
 
 		/* Unbind any ill-fitting objects or pin. */
-		list_for_each_entry(vma, &eb->vmas, exec_list) {
+		list_for_each_entry(vma, &eb->vmas, exec_link) {
 			if (!drm_mm_node_allocated(&vma->node))
 				continue;
 
@@ -958,7 +958,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		}
 
 		/* Bind fresh objects */
-		list_for_each_entry(vma, &eb->vmas, exec_list) {
+		list_for_each_entry(vma, &eb->vmas, exec_link) {
 			if (drm_mm_node_allocated(&vma->node))
 				continue;
 
@@ -972,7 +972,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 			return ret;
 
 		/* Decrement pin count for bound objects */
-		list_for_each_entry(vma, &eb->vmas, exec_list)
+		list_for_each_entry(vma, &eb->vmas, exec_link)
 			eb_unreserve_vma(vma);
 
 		ret = i915_gem_evict_vm(eb->vm, true);
@@ -1061,7 +1061,7 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	if (ret)
 		goto err;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		int idx = vma->exec_entry - eb->exec;
 
 		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[idx]);
@@ -1087,7 +1087,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 	struct i915_vma *vma;
 	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
@@ -1309,7 +1309,7 @@ eb_move_to_active(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		obj->base.write_domain = obj->base.pending_write_domain;
@@ -1383,7 +1383,7 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 		memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
 	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
 	i915_gem_object_get(shadow_batch_obj);
-	list_add_tail(&vma->exec_list, &eb->vmas);
+	list_add_tail(&vma->exec_link, &eb->vmas);
 
 out:
 	i915_gem_object_unpin_pages(shadow_batch_obj);
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 2e03f81dddbe..4d827300d1a8 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -100,8 +100,11 @@ struct i915_vma {
 	struct list_head obj_link; /* Link in the object's VMA list */
 	struct rb_node obj_node;
 
-	/** This vma's place in the batchbuffer or on the eviction list */
-	struct list_head exec_list;
+	/** This vma's place in the execbuf reservation list */
+	struct list_head exec_link;
+
+	/** This vma's place in the eviction list */
+	struct list_head evict_link;
 
 	/**
 	 * Used for performing relocations during execbuffer insertion.
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 07/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (5 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 06/15] drm/i915: Split vma exec_link/evict_link Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-24 12:32   ` Mika Kuoppala
  2017-02-23 16:18 ` [PATCH 08/15] drm/i915: Store a direct lookup from object handle to vma Chris Wilson
                   ` (7 subsequent siblings)
  14 siblings, 1 reply; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

i915_gem_stolen_list_info() sneakily takes advantage of the
obj->obj_exec_link to save itself from having to allocate. Enough of the
subterfuge: just allocate an array of pointers and sort that instead of
the list.
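
The same pattern in plain userspace C, with qsort() standing in for the
kernel's sort(); the object layout is invented for the example. The
part worth noting is that the comparator is handed pointers to the
array elements, i.e. pointers to pointers:

#include <stdio.h>
#include <stdlib.h>

struct object {
	const char *name;
	unsigned long stolen_start;	/* sort key, like obj->stolen->start */
};

/* Each argument points at an element of the array, i.e. at a pointer. */
static int rank_by_stolen(const void *A, const void *B)
{
	const struct object *a = *(const struct object * const *)A;
	const struct object *b = *(const struct object * const *)B;

	if (a->stolen_start < b->stolen_start)
		return -1;
	if (a->stolen_start > b->stolen_start)
		return 1;
	return 0;
}

int main(void)
{
	struct object x = { "x", 300 }, y = { "y", 100 }, z = { "z", 200 };
	struct object *objects[] = { &x, &y, &z };
	size_t i, count = sizeof(objects) / sizeof(objects[0]);

	qsort(objects, count, sizeof(objects[0]), rank_by_stolen);

	for (i = 0; i < count; i++)
		printf("%s @ %lu\n", objects[i]->name, objects[i]->stolen_start);
	return 0;
}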

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 52 ++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index ddae8e442176..75efa1ae234e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -27,7 +27,7 @@
  */
 
 #include <linux/debugfs.h>
-#include <linux/list_sort.h>
+#include <linux/sort.h>
 #include "intel_drv.h"
 
 static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
@@ -230,13 +230,12 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
 		seq_printf(m, " (frontbuffer: 0x%03x)", frontbuffer_bits);
 }
 
-static int obj_rank_by_stolen(void *priv,
-			      struct list_head *A, struct list_head *B)
+static int obj_rank_by_stolen(const void *A, const void *B)
 {
-	struct drm_i915_gem_object *a =
-		container_of(A, struct drm_i915_gem_object, obj_exec_link);
-	struct drm_i915_gem_object *b =
-		container_of(B, struct drm_i915_gem_object, obj_exec_link);
+	const struct drm_i915_gem_object *a =
+		*(const struct drm_i915_gem_object **)A;
+	const struct drm_i915_gem_object *b =
+		*(const struct drm_i915_gem_object **)B;
 
 	if (a->stolen->start < b->stolen->start)
 		return -1;
@@ -249,49 +248,54 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data)
 {
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
 	struct drm_device *dev = &dev_priv->drm;
+	struct drm_i915_gem_object **objects;
 	struct drm_i915_gem_object *obj;
 	u64 total_obj_size, total_gtt_size;
-	LIST_HEAD(stolen);
-	int count, ret;
+	unsigned long count, n;
+	int ret;
 
 	ret = mutex_lock_interruptible(&dev->struct_mutex);
 	if (ret)
 		return ret;
 
+	objects = drm_malloc_ab(dev_priv->mm.object_count, sizeof(*objects));
+	if (!objects) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
 	total_obj_size = total_gtt_size = count = 0;
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
 		if (obj->stolen == NULL)
 			continue;
 
-		list_add(&obj->obj_exec_link, &stolen);
-
+		objects[count++] = obj;
 		total_obj_size += obj->base.size;
 		total_gtt_size += i915_gem_obj_total_ggtt_size(obj);
-		count++;
 	}
 	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) {
 		if (obj->stolen == NULL)
 			continue;
 
-		list_add(&obj->obj_exec_link, &stolen);
-
+		objects[count++] = obj;
 		total_obj_size += obj->base.size;
-		count++;
 	}
-	list_sort(NULL, &stolen, obj_rank_by_stolen);
+
+	sort(objects, count, sizeof(*objects), obj_rank_by_stolen, NULL);
+
 	seq_puts(m, "Stolen:\n");
-	while (!list_empty(&stolen)) {
-		obj = list_first_entry(&stolen, typeof(*obj), obj_exec_link);
+	for (n = 0; n < count; n++) {
 		seq_puts(m, "   ");
-		describe_obj(m, obj);
+		describe_obj(m, objects[n]);
 		seq_putc(m, '\n');
-		list_del_init(&obj->obj_exec_link);
 	}
-	mutex_unlock(&dev->struct_mutex);
-
-	seq_printf(m, "Total %d objects, %llu bytes, %llu GTT size\n",
+	seq_printf(m, "Total %lu objects, %llu bytes, %llu GTT size\n",
 		   count, total_obj_size, total_gtt_size);
-	return 0;
+
+	drm_free_large(objects);
+out_unlock:
+	mutex_unlock(&dev->struct_mutex);
+	return ret;
 }
 
 struct file_stats {
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 08/15] drm/i915: Store a direct lookup from object handle to vma
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (6 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 07/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 09/15] drm/i915: Pass vma to relocate entry Chris Wilson
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

The advent of full-ppgtt led to an extra indirection between the object
and its binding. That extra indirection has a noticeable impact on how
fast we can convert from the user handles to our internal vma for
execbuffer. In order to bypass the extra indirection, we use a
resizable hashtable to jump from the object to the per-ctx vma.
rhashtable was considered but we don't need the online resizing feature
and the extra complexity proved to undermine its usefulness. Instead, we
simply reallocate the hashtable on demand in a background task and
serialize it before iterating.

In non-full-ppgtt modes, multiple files and multiple contexts can share
the same vma. This leads to having multiple possible handle->vma links,
so we only use the first to establish the fast path. The majority of
buffers are not shared and so we should still be able to realise
speedups with multiple clients.
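
As a rough userspace sketch of the shape of this (a power-of-two bucket
array, a multiply-and-shift hash of the 32-bit handle, and a rehash
once the table gets crowded). The real patch hangs the table off the
context, uses hlist and hash_32() and resizes from a worker; none of
that is reproduced here and all the names below are invented:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct vma {
	uint32_t handle;	/* per-context object handle */
	struct vma *next;	/* bucket chain, like an hlist_node */
};

struct vma_table {
	struct vma **buckets;
	unsigned int bits;	/* table has 1 << bits buckets */
	unsigned int count;
};

/* Multiply-and-shift hash of a 32-bit value into 'bits' bits. */
static unsigned int hash32(uint32_t v, unsigned int bits)
{
	return (v * 0x9e370001u) >> (32 - bits);
}

static void table_init(struct vma_table *t, unsigned int bits)
{
	t->bits = bits;
	t->count = 0;
	t->buckets = calloc(1u << bits, sizeof(*t->buckets)); /* checks elided */
}

static void table_insert(struct vma_table *t, struct vma *vma)
{
	struct vma **head = &t->buckets[hash32(vma->handle, t->bits)];

	vma->next = *head;
	*head = vma;
	t->count++;
}

static struct vma *table_lookup(struct vma_table *t, uint32_t handle)
{
	struct vma *vma = t->buckets[hash32(handle, t->bits)];

	for (; vma; vma = vma->next)
		if (vma->handle == handle)
			return vma;
	return NULL;
}

/* Grow by rebuilding: allocate a bigger bucket array and rehash every entry. */
static void table_grow(struct vma_table *t)
{
	struct vma_table big;
	unsigned int i;

	table_init(&big, t->bits + 1);
	for (i = 0; i < (1u << t->bits); i++) {
		struct vma *vma = t->buckets[i], *next;

		for (; vma; vma = next) {
			next = vma->next;
			table_insert(&big, vma);
		}
	}
	free(t->buckets);
	*t = big;
}

int main(void)
{
	struct vma_table t;
	struct vma a = { .handle = 1 }, b = { .handle = 2 };

	table_init(&t, 1);
	table_insert(&t, &a);
	table_insert(&t, &b);
	if (4 * t.count > 3 * (1u << t.bits))	/* load-factor test, grow side */
		table_grow(&t);
	printf("handle 2 -> vma %p\n", (void *)table_lookup(&t, 2));
	return 0;
}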

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |   5 +
 drivers/gpu/drm/i915/i915_drv.h               |   2 +-
 drivers/gpu/drm/i915/i915_gem.c               |   5 +-
 drivers/gpu/drm/i915/i915_gem_context.c       |  73 ++++++++
 drivers/gpu/drm/i915/i915_gem_context.h       |   8 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c    | 245 +++++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_gtt.h           |   1 +
 drivers/gpu/drm/i915/i915_gem_object.h        |   4 +-
 drivers/gpu/drm/i915/i915_vma.c               |  20 +++
 drivers/gpu/drm/i915/i915_vma.h               |   8 +-
 drivers/gpu/drm/i915/selftests/mock_context.c |  11 +-
 11 files changed, 271 insertions(+), 111 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 75efa1ae234e..972aef13587e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2004,6 +2004,11 @@ static int i915_context_status(struct seq_file *m, void *unused)
 			seq_putc(m, '\n');
 		}
 
+		seq_printf(m, "\tvma hashtable size=%u (actual %u), count=%u\n",
+			   ctx->vma.ht_size,
+			   1 << ctx->vma.ht_bits,
+			   ctx->vma.ht_count);
+
 		seq_putc(m, '\n');
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2cc0253d6ef7..00f8bcec5d71 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -37,7 +37,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/backlight.h>
-#include <linux/hashtable.h>
+#include <linux/hash.h>
 #include <linux/intel-iommu.h>
 #include <linux/kref.h>
 #include <linux/pm_qos.h>
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 92ab989bb05f..4974b150bc3a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3022,6 +3022,10 @@ void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
 		if (vma->vm->file == fpriv)
 			i915_vma_close(vma);
 
+	vma = obj->vma_hashed;
+	if (vma && vma->ctx->file_priv == fpriv)
+		i915_vma_unlink_ctx(vma);
+
 	if (i915_gem_object_is_active(obj) &&
 	    !i915_gem_object_has_active_reference(obj)) {
 		i915_gem_object_set_active_reference(obj);
@@ -3970,7 +3974,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
 	INIT_LIST_HEAD(&obj->global_link);
 	INIT_LIST_HEAD(&obj->userfault_link);
-	INIT_LIST_HEAD(&obj->obj_exec_link);
 	INIT_LIST_HEAD(&obj->vma_list);
 	INIT_LIST_HEAD(&obj->batch_pool_link);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 41d4fa569bcf..1764c70e6460 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -85,6 +85,7 @@
  *
  */
 
+#include <linux/log2.h>
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
@@ -92,6 +93,9 @@
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
 
+/* Initial size (as log2) to preallocate the handle->object hashtable */
+#define VMA_HT_BITS 2u /* 4 x 2 pointers, 64 bytes minimum */
+
 static int get_context_size(struct drm_i915_private *dev_priv)
 {
 	int ret;
@@ -119,6 +123,64 @@ static int get_context_size(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
+static void resize_vma_ht(struct work_struct *work)
+{
+	struct i915_gem_context *ctx =
+		container_of(work, typeof(*ctx), vma.resize);
+	unsigned int size, bits, new_bits, i;
+	struct hlist_head *new_ht;
+
+	bits = 1 + ilog2(4*ctx->vma.ht_count/3);
+	new_bits = min_t(unsigned int,
+			 max(bits, VMA_HT_BITS),
+			 sizeof(unsigned int)*8);
+	if (new_bits == ctx->vma.ht_bits)
+		goto out;
+
+	new_ht = kzalloc(sizeof(*new_ht)<<new_bits, GFP_KERNEL | __GFP_NOWARN);
+	if (!new_ht)
+		new_ht = vzalloc(sizeof(*new_ht)<<new_bits);
+	if (!new_ht)
+		/* pretend the resize succeeded and stop calling us for a bit! */
+		goto out;
+
+	size = 1 << ctx->vma.ht_bits;
+	for (i = 0; i < size; i++) {
+		struct i915_vma *vma;
+		struct hlist_node *tmp;
+
+		hlist_for_each_entry_safe(vma, tmp, &ctx->vma.ht[i], ctx_node)
+			hlist_add_head(&vma->ctx_node,
+				       &new_ht[hash_32(vma->ctx_handle,
+						       new_bits)]);
+	}
+	kvfree(ctx->vma.ht);
+	ctx->vma.ht = new_ht;
+	ctx->vma.ht_bits = new_bits;
+	smp_wmb();
+out:
+	ctx->vma.ht_size = 1 << bits;
+}
+
+static void decouple_vma(struct i915_gem_context *ctx)
+{
+	unsigned int i, size;
+
+	if (ctx->vma.ht_size & 1)
+		cancel_work_sync(&ctx->vma.resize);
+
+	size = 1 << ctx->vma.ht_bits;
+	for (i = 0; i < size; i++) {
+		struct i915_vma *vma;
+
+		hlist_for_each_entry(vma, &ctx->vma.ht[i], ctx_node) {
+			vma->obj->vma_hashed = NULL;
+			vma->ctx = NULL;
+		}
+	}
+	kvfree(ctx->vma.ht);
+}
+
 void i915_gem_context_free(struct kref *ctx_ref)
 {
 	struct i915_gem_context *ctx = container_of(ctx_ref, typeof(*ctx), ref);
@@ -128,6 +190,7 @@ void i915_gem_context_free(struct kref *ctx_ref)
 	trace_i915_context_free(ctx);
 	GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
 
+	decouple_vma(ctx);
 	i915_ppgtt_put(ctx->ppgtt);
 
 	for (i = 0; i < I915_NUM_ENGINES; i++) {
@@ -145,6 +208,7 @@ void i915_gem_context_free(struct kref *ctx_ref)
 
 	kfree(ctx->name);
 	put_pid(ctx->pid);
+
 	list_del(&ctx->link);
 
 	ida_simple_remove(&ctx->i915->context_hw_ida, ctx->hw_id);
@@ -266,6 +330,15 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	list_add_tail(&ctx->link, &dev_priv->context_list);
 	ctx->i915 = dev_priv;
 
+	ctx->vma.ht_bits = VMA_HT_BITS;
+	ctx->vma.ht_size = 1 << ctx->vma.ht_bits;
+	ctx->vma.ht = kzalloc(sizeof(*ctx->vma.ht)*ctx->vma.ht_size,
+			      GFP_KERNEL);
+	if (!ctx->vma.ht)
+		goto err_out;
+
+	INIT_WORK(&ctx->vma.resize, resize_vma_ht);
+
 	if (dev_priv->hw_context_size) {
 		struct drm_i915_gem_object *obj;
 		struct i915_vma *vma;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 81268c9770a6..ecdf3e92dac2 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -143,6 +143,14 @@ struct i915_gem_context {
 	/** ggtt_offset_bias: placement restriction for context objects */
 	u32 ggtt_offset_bias;
 
+	struct {
+		struct work_struct resize;
+		struct hlist_head *ht;
+		unsigned int ht_bits;
+		unsigned int ht_size;
+		unsigned int ht_count;
+	} vma;
+
 	/** engine: per-engine logical HW state */
 	struct intel_context {
 		struct i915_vma *state;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index c229d69b8757..4a05f0fe65e3 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -73,38 +73,33 @@ struct i915_execbuffer {
 		unsigned int page;
 		bool use_64bit_reloc;
 	} reloc_cache;
-	int and;
-	union {
-		struct i915_vma **lut;
-		struct hlist_head *buckets;
-	};
+	int lut_mask;
+	struct hlist_head *buckets;
 };
 
 static int
 eb_create(struct i915_execbuffer *eb)
 {
-	eb->lut = NULL;
-	if (eb->args->flags & I915_EXEC_HANDLE_LUT) {
-		unsigned int size = eb->args->buffer_count;
-		size *= sizeof(struct i915_vma *);
-		eb->lut = kmalloc(size,
-				  GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
-	}
-
-	if (!eb->lut) {
-		unsigned int size = eb->args->buffer_count;
-		unsigned int count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
-		BUILD_BUG_ON_NOT_POWER_OF_2(PAGE_SIZE / sizeof(struct hlist_head));
-		while (count > 2*size)
-			count >>= 1;
-		eb->lut = kzalloc(count*sizeof(struct hlist_head),
-				  GFP_TEMPORARY);
-		if (!eb->lut)
-			return -ENOMEM;
-
-		eb->and = count - 1;
+	if ((eb->args->flags & I915_EXEC_HANDLE_LUT) == 0) {
+		unsigned int size = 1 + ilog2(eb->args->buffer_count);
+
+		do {
+			eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
+					     GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+			if (eb->buckets)
+				break;
+		} while (--size);
+
+		if (unlikely(!eb->buckets)) {
+			eb->buckets = kzalloc(sizeof(struct hlist_head),
+					      GFP_TEMPORARY);
+			if (unlikely(!eb->buckets))
+				return -ENOMEM;
+		}
+
+		eb->lut_mask = size;
 	} else
-		eb->and = -eb->args->buffer_count;
+		eb->lut_mask = -eb->args->buffer_count;
 
 	return 0;
 }
@@ -140,73 +135,104 @@ eb_reset(struct i915_execbuffer *eb)
 		vma->exec_entry = NULL;
 	}
 
-	if (eb->and >= 0)
-		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
+	if (eb->lut_mask >= 0)
+		memset(eb->buckets, 0,
+		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
 }
 
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
+#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+
+static bool
+eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
 {
-	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_link);
+	if (unlikely(vma->exec_entry)) {
+		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+			  eb->exec[i].handle, i);
+		return false;
+	}
+	list_add_tail(&vma->exec_link, &eb->vmas);
 
-	/*
-	 * SNA is doing fancy tricks with compressing batch buffers, which leads
-	 * to negative relocation deltas. Usually that works out ok since the
-	 * relocate address is still positive, except when the batch is placed
-	 * very low in the GTT. Ensure this doesn't happen.
-	 *
-	 * Note that actual hangs have only been observed on gen7, but for
-	 * paranoia do it everywhere.
-	 */
-	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
-		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	vma->exec_entry = &eb->exec[i];
+	if (eb->lut_mask >= 0) {
+		vma->exec_handle = eb->exec[i].handle;
+		hlist_add_head(&vma->exec_node,
+			       &eb->buckets[hash_32(vma->exec_handle,
+						    eb->lut_mask)]);
+	}
 
-	return vma;
+	i915_vma_get(vma);
+	eb->exec[i].rsvd2 = (uintptr_t)vma;
+	return true;
+}
+
+static inline struct hlist_head *ht_head(struct i915_gem_context *ctx,
+					 u32 handle)
+{
+	return &ctx->vma.ht[hash_32(handle, ctx->vma.ht_bits)];
 }
 
 static int
 eb_lookup_vmas(struct i915_execbuffer *eb)
 {
-	struct drm_i915_gem_object *obj;
-	struct list_head objects;
-	int i, ret;
+	const int count = eb->args->buffer_count;
+	struct i915_vma *vma;
+	int slow_pass = -1;
+	int i;
 
 	INIT_LIST_HEAD(&eb->vmas);
 
-	INIT_LIST_HEAD(&objects);
+	if (unlikely(eb->ctx->vma.ht_size & 1))
+		flush_work(&eb->ctx->vma.resize);
+	for (i = 0; i < count; i++) {
+		eb->exec[i].rsvd2 = 0;
+
+		hlist_for_each_entry(vma,
+				     ht_head(eb->ctx, eb->exec[i].handle),
+				     ctx_node) {
+			if (vma->ctx_handle != eb->exec[i].handle)
+				continue;
+
+			if (!eb_add_vma(eb, vma, i))
+				return -EINVAL;
+
+			goto next_vma;
+		}
+
+		if (slow_pass < 0)
+			slow_pass = i;
+next_vma: ;
+	}
+
+	if (slow_pass < 0)
+		return 0;
+
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
-	for (i = 0; i < eb->args->buffer_count; i++) {
-		obj = to_intel_bo(idr_find(&eb->file->object_idr, eb->exec[i].handle));
-		if (obj == NULL) {
-			spin_unlock(&eb->file->table_lock);
-			DRM_DEBUG("Invalid object handle %d at index %d\n",
-				   eb->exec[i].handle, i);
-			ret = -ENOENT;
-			goto err;
-		}
+	for (i = slow_pass; i < count; i++) {
+		struct drm_i915_gem_object *obj;
 
-		if (!list_empty(&obj->obj_exec_link)) {
+		if (eb->exec[i].rsvd2)
+			continue;
+
+		obj = to_intel_bo(idr_find(&eb->file->object_idr,
+					   eb->exec[i].handle));
+		if (unlikely(!obj)) {
 			spin_unlock(&eb->file->table_lock);
-			DRM_DEBUG("Object %p [handle %d, index %d] appears more than once in object list\n",
-				   obj, eb->exec[i].handle, i);
-			ret = -EINVAL;
-			goto err;
+			DRM_DEBUG("Invalid object handle %d at index %d\n",
+				  eb->exec[i].handle, i);
+			return -ENOENT;
 		}
 
-		i915_gem_object_get(obj);
-		list_add_tail(&obj->obj_exec_link, &objects);
+		eb->exec[i].rsvd2 = 1 | (uintptr_t)obj;
 	}
 	spin_unlock(&eb->file->table_lock);
 
-	i = 0;
-	while (!list_empty(&objects)) {
-		struct i915_vma *vma;
+	for (i = slow_pass; i < count; i++) {
+		struct drm_i915_gem_object *obj;
 
-		obj = list_first_entry(&objects,
-				       struct drm_i915_gem_object,
-				       obj_exec_link);
+		if ((eb->exec[i].rsvd2 & 1) == 0)
+			continue;
 
 		/*
 		 * NOTE: We can leak any vmas created here when something fails
@@ -216,61 +242,72 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
+		obj = to_ptr(struct drm_i915_gem_object, eb->exec[i].rsvd2 & ~1);
 		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
-			ret = PTR_ERR(vma);
-			goto err;
+			return PTR_ERR(vma);
 		}
 
-		/* Transfer ownership from the objects list to the vmas list. */
-		list_add_tail(&vma->exec_link, &eb->vmas);
-		list_del_init(&obj->obj_exec_link);
-
-		vma->exec_entry = &eb->exec[i];
-		if (eb->and < 0) {
-			eb->lut[i] = vma;
-		} else {
-			u32 handle =
-				eb->args->flags & I915_EXEC_HANDLE_LUT ?
-				i : eb->exec[i].handle;
-			vma->exec_handle = handle;
-			hlist_add_head(&vma->exec_node,
-				       &eb->buckets[handle & eb->and]);
+		/* First come, first served */
+		if (!vma->ctx) {
+			vma->ctx = eb->ctx;
+			vma->ctx_handle = eb->exec[i].handle;
+			hlist_add_head(&vma->ctx_node,
+				       ht_head(eb->ctx, eb->exec[i].handle));
+			eb->ctx->vma.ht_count++;
+			if (i915_vma_is_ggtt(vma)) {
+				GEM_BUG_ON(obj->vma_hashed);
+				obj->vma_hashed = vma;
+			}
 		}
-		++i;
+
+		if (!eb_add_vma(eb, vma, i))
+			return -EINVAL;
+	}
+	if (4*eb->ctx->vma.ht_count > 3*eb->ctx->vma.ht_size ||
+	    4*eb->ctx->vma.ht_count < eb->ctx->vma.ht_size) {
+		eb->ctx->vma.ht_size |= 1;
+		queue_work(system_highpri_wq, &eb->ctx->vma.resize);
 	}
 
 	return 0;
+}
 
+static struct i915_vma *
+eb_get_batch(struct i915_execbuffer *eb)
+{
+	struct i915_vma *vma;
+
+	vma = to_ptr(struct i915_vma, eb->exec[eb->args->buffer_count-1].rsvd2);
 
-err:
-	while (!list_empty(&objects)) {
-		obj = list_first_entry(&objects,
-				       struct drm_i915_gem_object,
-				       obj_exec_link);
-		list_del_init(&obj->obj_exec_link);
-		i915_gem_object_put(obj);
-	}
 	/*
-	 * Objects already transfered to the vmas list will be unreferenced by
-	 * eb_destroy.
+	 * SNA is doing fancy tricks with compressing batch buffers, which leads
+	 * to negative relocation deltas. Usually that works out ok since the
+	 * relocate address is still positive, except when the batch is placed
+	 * very low in the GTT. Ensure this doesn't happen.
+	 *
+	 * Note that actual hangs have only been observed on gen7, but for
+	 * paranoia do it everywhere.
 	 */
+	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
+		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
 
-	return ret;
+	return vma;
 }
 
-static struct i915_vma *eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
+static struct i915_vma *
+eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 {
-	if (eb->and < 0) {
-		if (handle >= -eb->and)
+	if (eb->lut_mask < 0) {
+		if (handle >= -eb->lut_mask)
 			return NULL;
-		return eb->lut[handle];
+		return to_ptr(struct i915_vma, eb->exec[handle].rsvd2);
 	} else {
 		struct hlist_head *head;
 		struct i915_vma *vma;
 
-		head = &eb->buckets[handle & eb->and];
+		head = &eb->buckets[hash_32(handle, eb->lut_mask)];
 		hlist_for_each_entry(vma, head, exec_node) {
 			if (vma->exec_handle == handle)
 				return vma;
@@ -294,7 +331,7 @@ static void eb_destroy(struct i915_execbuffer *eb)
 
 	i915_gem_context_put(eb->ctx);
 
-	if (eb->buckets)
+	if (eb->lut_mask >= 0)
 		kfree(eb->buckets);
 }
 
@@ -911,7 +948,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 			entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
 		need_fence =
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
-			i915_gem_object_is_tiled(obj);
+			i915_gem_object_is_tiled(vma->obj);
 		need_mappable = need_fence || need_reloc_mappable(vma);
 
 		if (entry->flags & EXEC_OBJECT_PINNED)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index f7d4e194a227..069fc4e1be2a 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -198,6 +198,7 @@ struct i915_ggtt_view {
 	};
 };
 
+struct i915_gem_context;
 enum i915_cache_level;
 
 struct i915_vma;
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index d6a77da81c9d..c9c9a6cf8bb1 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -68,6 +68,7 @@ struct drm_i915_gem_object {
 	/** List of VMAs backed by this object */
 	struct list_head vma_list;
 	struct rb_root vma_tree;
+	struct i915_vma *vma_hashed;
 
 	/** Stolen memory for this object, instead of being backed by shmem. */
 	struct drm_mm_node *stolen;
@@ -82,9 +83,6 @@ struct drm_i915_gem_object {
 	 */
 	struct list_head userfault_link;
 
-	/** Used in execbuf to temporarily hold a ref */
-	struct list_head obj_exec_link;
-
 	struct list_head batch_pool_link;
 	I915_SELFTEST_DECLARE(struct list_head st_link);
 
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index fab3fa2062c5..77003eec0725 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -556,11 +556,31 @@ void i915_vma_destroy(struct i915_vma *vma)
 	kmem_cache_free(to_i915(vma->obj->base.dev)->vmas, vma);
 }
 
+void i915_vma_unlink_ctx(struct i915_vma *vma)
+{
+	struct i915_gem_context *ctx = vma->ctx;
+
+	if (ctx->vma.ht_size & 1) {
+		cancel_work_sync(&ctx->vma.resize);
+		ctx->vma.ht_size &= ~1;
+	}
+
+	__hlist_del(&vma->ctx_node);
+	ctx->vma.ht_count--;
+
+	if (i915_vma_is_ggtt(vma))
+		vma->obj->vma_hashed = NULL;
+	vma->ctx = NULL;
+}
+
 void i915_vma_close(struct i915_vma *vma)
 {
 	GEM_BUG_ON(i915_vma_is_closed(vma));
 	vma->flags |= I915_VMA_CLOSED;
 
+	if (vma->ctx)
+		i915_vma_unlink_ctx(vma);
+
 	list_del(&vma->obj_link);
 	rb_erase(&vma->obj_node, &vma->obj->vma_tree);
 
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 4d827300d1a8..88543fafcffc 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -99,6 +99,7 @@ struct i915_vma {
 
 	struct list_head obj_link; /* Link in the object's VMA list */
 	struct rb_node obj_node;
+	struct hlist_node obj_hash;
 
 	/** This vma's place in the execbuf reservation list */
 	struct list_head exec_link;
@@ -110,8 +111,12 @@ struct i915_vma {
 	 * Used for performing relocations during execbuffer insertion.
 	 */
 	struct hlist_node exec_node;
-	unsigned long exec_handle;
 	struct drm_i915_gem_exec_object2 *exec_entry;
+	u32 exec_handle;
+
+	struct i915_gem_context *ctx;
+	struct hlist_node ctx_node;
+	u32 ctx_handle;
 };
 
 struct i915_vma *
@@ -235,6 +240,7 @@ bool i915_vma_misplaced(const struct i915_vma *vma,
 			u64 size, u64 alignment, u64 flags);
 void __i915_vma_set_map_and_fenceable(struct i915_vma *vma);
 int __must_check i915_vma_unbind(struct i915_vma *vma);
+void i915_vma_unlink_ctx(struct i915_vma *vma);
 void i915_vma_close(struct i915_vma *vma);
 void i915_vma_destroy(struct i915_vma *vma);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index 8d3a90c3f8ac..4e4615d5e003 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -40,10 +40,17 @@ mock_context(struct drm_i915_private *i915,
 	INIT_LIST_HEAD(&ctx->link);
 	ctx->i915 = i915;
 
+	ctx->vma.ht_bits = VMA_HT_BITS;
+	ctx->vma.ht_size = 1 << ctx->vma.ht_bits;
+	ctx->vma.ht = kzalloc(sizeof(*ctx->vma.ht)*ctx->vma.ht_size,
+			      GFP_KERNEL);
+	if (!ctx->vma.ht)
+		goto err_free;
+
 	ret = ida_simple_get(&i915->context_hw_ida,
 			     0, MAX_CONTEXT_HW_ID, GFP_KERNEL);
 	if (ret < 0)
-		goto err_free;
+		goto err_vma_ht;
 	ctx->hw_id = ret;
 
 	if (name) {
@@ -58,6 +65,8 @@ mock_context(struct drm_i915_private *i915,
 
 	return ctx;
 
+err_vma_ht:
+	kvfree(ctx->vma.ht);
 err_free:
 	kfree(ctx);
 	return NULL;
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 09/15] drm/i915: Pass vma to relocate entry
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (7 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 08/15] drm/i915: Store a direct lookup from object handle to vma Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 10/15] drm/i915: Eliminate lots of iterations over the execobjects array Chris Wilson
                   ` (5 subsequent siblings)
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

We can simplify our tracking of pending writes in an execbuf to a
single bit in vma->exec_entry->flags, but that requires the relocation
function to know the object's vma. Pass it along.
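
Purely as an illustration of the bookkeeping (everything below is
invented for the example; the real bit lives in the execbuffer2 uapi):
the write marker is just a bit in the per-entry flags word, set while
processing relocations, with nothing stashed on the object itself.

#include <stdio.h>
#include <stdint.h>

/* Illustrative value; see EXEC_OBJECT_WRITE in i915_drm.h for the real bit. */
#define EXEC_OBJECT_WRITE (1u << 2)

struct exec_entry {
	uint32_t handle;
	uint64_t flags;
};

/* A relocation with a write domain marks the *target* entry as written. */
static void note_reloc(struct exec_entry *target, uint32_t write_domain)
{
	if (write_domain)
		target->flags |= EXEC_OBJECT_WRITE;
}

int main(void)
{
	struct exec_entry target = { .handle = 7, .flags = 0 };

	note_reloc(&target, 0x2 /* some write domain */);
	printf("target %u written: %s\n", target.handle,
	       (target.flags & EXEC_OBJECT_WRITE) ? "yes" : "no");
	return 0;
}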

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 100 ++++++++++++-----------------
 1 file changed, 41 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 4a05f0fe65e3..77ca777a7da1 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -600,42 +600,25 @@ relocate_entry(struct drm_i915_gem_object *obj,
 }
 
 static int
-eb_relocate_entry(struct drm_i915_gem_object *obj,
+eb_relocate_entry(struct i915_vma *vma,
 		  struct i915_execbuffer *eb,
 		  struct drm_i915_gem_relocation_entry *reloc)
 {
-	struct drm_gem_object *target_obj;
-	struct drm_i915_gem_object *target_i915_obj;
-	struct i915_vma *target_vma;
-	uint64_t target_offset;
+	struct i915_vma *target;
+	u64 target_offset;
 	int ret;
 
 	/* we've already hold a reference to all valid objects */
-	target_vma = eb_get_vma(eb, reloc->target_handle);
-	if (unlikely(target_vma == NULL))
+	target = eb_get_vma(eb, reloc->target_handle);
+	if (unlikely(!target))
 		return -ENOENT;
-	target_i915_obj = target_vma->obj;
-	target_obj = &target_vma->obj->base;
-
-	target_offset = gen8_canonical_addr(target_vma->node.start);
-
-	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
-	 * pipe_control writes because the gpu doesn't properly redirect them
-	 * through the ppgtt for non_secure batchbuffers. */
-	if (unlikely(IS_GEN6(eb->i915) &&
-		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
-		ret = i915_vma_bind(target_vma, target_i915_obj->cache_level,
-				    PIN_GLOBAL);
-		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
-			return ret;
-	}
 
 	/* Validate that the target is in a valid r/w GPU domain */
 	if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) {
 		DRM_DEBUG("reloc with multiple write domains: "
-			  "obj %p target %d offset %d "
+			  "target %d offset %d "
 			  "read %08x write %08x",
-			  obj, reloc->target_handle,
+			  reloc->target_handle,
 			  (int) reloc->offset,
 			  reloc->read_domains,
 			  reloc->write_domain);
@@ -644,43 +627,56 @@ eb_relocate_entry(struct drm_i915_gem_object *obj,
 	if (unlikely((reloc->write_domain | reloc->read_domains)
 		     & ~I915_GEM_GPU_DOMAINS)) {
 		DRM_DEBUG("reloc with read/write non-GPU domains: "
-			  "obj %p target %d offset %d "
+			  "target %d offset %d "
 			  "read %08x write %08x",
-			  obj, reloc->target_handle,
+			  reloc->target_handle,
 			  (int) reloc->offset,
 			  reloc->read_domains,
 			  reloc->write_domain);
 		return -EINVAL;
 	}
 
-	target_obj->pending_read_domains |= reloc->read_domains;
-	target_obj->pending_write_domain |= reloc->write_domain;
+	if (reloc->write_domain)
+		target->exec_entry->flags |= EXEC_OBJECT_WRITE;
+
+	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
+	 * pipe_control writes because the gpu doesn't properly redirect them
+	 * through the ppgtt for non_secure batchbuffers.
+	 */
+	if (unlikely(IS_GEN6(eb->i915) &&
+		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
+		ret = i915_vma_bind(target, target->obj->cache_level,
+				    PIN_GLOBAL);
+		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
+			return ret;
+	}
 
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
+	target_offset = gen8_canonical_addr(target->node.start);
 	if (target_offset == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
 	if (unlikely(reloc->offset >
-		     obj->base.size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
+		     vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
 		DRM_DEBUG("Relocation beyond object bounds: "
-			  "obj %p target %d offset %d size %d.\n",
-			  obj, reloc->target_handle,
-			  (int) reloc->offset,
-			  (int) obj->base.size);
+			  "target %d offset %d size %d.\n",
+			  reloc->target_handle,
+			  (int)reloc->offset,
+			  (int)vma->size);
 		return -EINVAL;
 	}
 	if (unlikely(reloc->offset & 3)) {
 		DRM_DEBUG("Relocation not 4-byte aligned: "
-			  "obj %p target %d offset %d.\n",
-			  obj, reloc->target_handle,
-			  (int) reloc->offset);
+			  "target %d offset %d.\n",
+			  reloc->target_handle,
+			  (int)reloc->offset);
 		return -EINVAL;
 	}
 
-	ret = relocate_entry(obj, reloc, &eb->reloc_cache, target_offset);
+	ret = relocate_entry(vma->obj, reloc, &eb->reloc_cache, target_offset);
 	if (ret)
 		return ret;
 
@@ -726,7 +722,7 @@ static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
 		do {
 			u64 offset = r->presumed_offset;
 
-			ret = eb_relocate_entry(vma->obj, eb, r);
+			ret = eb_relocate_entry(vma, eb, r);
 			if (ret)
 				goto out;
 
@@ -772,7 +768,7 @@ eb_relocate_vma_slow(struct i915_vma *vma,
 	int i, ret = 0;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = eb_relocate_entry(vma->obj, eb, &relocs[i]);
+		ret = eb_relocate_entry(vma, eb, &relocs[i]);
 		if (ret)
 			break;
 	}
@@ -805,7 +801,6 @@ eb_reserve_vma(struct i915_vma *vma,
 	       struct intel_engine_cs *engine,
 	       bool *need_reloc)
 {
-	struct drm_i915_gem_object *obj = vma->obj;
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	uint64_t flags;
 	int ret;
@@ -859,11 +854,6 @@ eb_reserve_vma(struct i915_vma *vma,
 		*need_reloc = true;
 	}
 
-	if (entry->flags & EXEC_OBJECT_WRITE) {
-		obj->base.pending_read_domains = I915_GEM_DOMAIN_RENDER;
-		obj->base.pending_write_domain = I915_GEM_DOMAIN_RENDER;
-	}
-
 	return 0;
 }
 
@@ -925,7 +915,6 @@ eb_vma_misplaced(struct i915_vma *vma)
 static int eb_reserve(struct i915_execbuffer *eb)
 {
 	const bool has_fenced_gpu_access = INTEL_GEN(eb->i915) < 4;
-	struct drm_i915_gem_object *obj;
 	struct i915_vma *vma;
 	struct list_head ordered_vmas;
 	struct list_head pinned_vmas;
@@ -938,7 +927,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		bool need_fence, need_mappable;
 
 		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_link);
-		obj = vma->obj;
 		entry = vma->exec_entry;
 
 		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
@@ -958,9 +946,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 			list_move(&vma->exec_link, &ordered_vmas);
 		} else
 			list_move_tail(&vma->exec_link, &ordered_vmas);
-
-		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
-		obj->base.pending_write_domain = 0;
 	}
 	list_splice(&ordered_vmas, &eb->vmas);
 	list_splice(&pinned_vmas, &eb->vmas);
@@ -1148,7 +1133,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 		}
 
 		ret = i915_gem_request_await_object
-			(eb->request, obj, obj->base.pending_write_domain);
+			(eb->request, obj, vma->exec_entry->flags & EXEC_OBJECT_WRITE);
 		if (ret)
 			return ret;
 	}
@@ -1349,12 +1334,10 @@ eb_move_to_active(struct i915_execbuffer *eb)
 	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
-		obj->base.write_domain = obj->base.pending_write_domain;
-		if (obj->base.write_domain)
-			vma->exec_entry->flags |= EXEC_OBJECT_WRITE;
-		else
-			obj->base.pending_read_domains |= obj->base.read_domains;
-		obj->base.read_domains = obj->base.pending_read_domains;
+		obj->base.write_domain = 0;
+		if (vma->exec_entry->flags & EXEC_OBJECT_WRITE)
+			obj->base.read_domains = 0;
+		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
 
 		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
 		eb_export_fence(obj, eb->request, vma->exec_entry->flags);
@@ -1771,7 +1754,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	}
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (eb.batch->obj->base.pending_write_domain) {
+	if (eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
 		goto err;
@@ -1808,7 +1791,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		}
 	}
 
-	eb.batch->obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
 	if (eb.batch_len == 0)
 		eb.batch_len = eb.batch->size - eb.batch_start_offset;
 
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 10/15] drm/i915: Eliminate lots of iterations over the execobjects array
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (8 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 09/15] drm/i915: Pass vma to relocate entry Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 11/15] drm/i915: First try the previous execbuffer location Chris Wilson
                   ` (4 subsequent siblings)
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

The major scaling bottleneck in execbuffer is the processing of the
execobjects. Creating an auxiliary list is inefficient when compared to
using the execobject array we already have allocated.

Reservation is then split into phases. As we look up the VMA, we try to
bind it back into its active location. Only if that fails do we add it
to the unbound list for phase 2. In phase 2, we try to add all those
objects that could not fit into their previous location, with a fallback
to retrying all objects and evicting the VM in case of severe
fragmentation. (This is the same as before, except that phase 1 is now
done inline with looking up the VMA to avoid an iteration over the
execobject array. In the ideal case, we eliminate the separate
reservation phase.) During the reservation phase, we only evict from the
VM between passes (rather than, as currently, whenever we try to fit a
new VMA). In testing with Unreal Engine's Atlantis demo, which stresses
the eviction logic on gen7 class hardware, this speeds up the framerate
by a factor of 2.
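
As a rough illustration of that control flow only, here is a userspace
toy sketch; toy_pin_in_place(), toy_bind_anywhere() and toy_evict_vm()
are hypothetical stand-ins for i915_vma_pin() and i915_gem_evict_vm(),
and only the shape of the two phases is modelled, not the real
placement constraints:

/* Toy sketch of the two-phase reservation (illustration only). */
#include <stdbool.h>
#include <stdlib.h>

struct toy_vma {
	unsigned long offset;	/* last known GPU address, 0 if never bound */
	bool bound;
};

static bool toy_pin_in_place(struct toy_vma *v)
{
	return v->offset && rand() % 4;	/* pretend the old slot is usually free */
}

static bool toy_bind_anywhere(struct toy_vma *v)
{
	(void)v;
	return rand() % 8;		/* pretend -ENOSPC is rare */
}

static void toy_evict_vm(void)
{
	/* unbind everything in the VM (stand-in for i915_gem_evict_vm()) */
}

static int toy_reserve(struct toy_vma *objs, int count)
{
	int unbound[64], n = 0, pass, i;

	/* Phase 1, done inline with the lookup: keep objects where they were. */
	for (i = 0; i < count; i++) {
		if (toy_pin_in_place(&objs[i]))
			objs[i].bound = true;
		else
			unbound[n++] = i;	/* defer to phase 2 */
	}

	/* Phase 2: bind only the leftovers, evicting the VM between passes
	 * instead of evicting as each individual VMA fails to fit.
	 */
	for (pass = 0; pass < 2; pass++) {
		int still = 0;

		for (i = 0; i < n; i++) {
			if (toy_bind_anywhere(&objs[unbound[i]]))
				objs[unbound[i]].bound = true;
			else
				unbound[still++] = unbound[i];
		}
		if (!still)
			return 0;

		n = still;
		toy_evict_vm();		/* severe fragmentation: clear and retry */
	}
	return -1;			/* -ENOSPC in the real driver */
}

int main(void)
{
	struct toy_vma objs[8] = { { .offset = 4096 } };

	return toy_reserve(objs, 8) ? 1 : 0;
}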

The second loop amalgamation is between move_to_gpu and move_to_active.
As we always submit the request, even if incomplete, we can use the
current request to track the active VMAs as we perform the flushes and
synchronisation required.
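
Sketched loosely in userspace terms (toy_flush() and toy_await() below
are made-up stand-ins for the clflush handling and
i915_gem_request_await_object(); this only illustrates the merged loop,
not the driver code):

/* Toy sketch: waits and move-to-active done in one walk over the objects. */
#include <stdio.h>

struct toy_request { int seqno; };
struct toy_object { int active_on; };	/* seqno of the last request using it */

static void toy_flush(struct toy_object *o)
{
	(void)o;	/* clflush, framebuffer invalidate, etc. */
}

static int toy_await(struct toy_request *rq, struct toy_object *o)
{
	(void)rq; (void)o;	/* order this request after previous users */
	return 0;
}

static int toy_submit(struct toy_request *rq, struct toy_object *objs, int count)
{
	int i, err;

	for (i = 0; i < count; i++) {
		toy_flush(&objs[i]);
		err = toy_await(rq, &objs[i]);
		if (err)
			return err;	/* the request is still submitted */

		objs[i].active_on = rq->seqno;	/* previously a second loop */
	}
	return 0;
}

int main(void)
{
	struct toy_request rq = { .seqno = 1 };
	struct toy_object objs[4] = { { 0 } };

	printf("%d\n", toy_submit(&rq, objs, 4));
	return 0;
}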

The next big advancement is to avoid copying back to the user any
execobjects and relocations that are not changed.

v2: Add a Theory of Operation spiel.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h                 |    2 +-
 drivers/gpu/drm/i915/i915_gem_evict.c           |   95 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c      | 1774 +++++++++++++----------
 drivers/gpu/drm/i915/i915_vma.c                 |    2 +-
 drivers/gpu/drm/i915/i915_vma.h                 |    1 +
 drivers/gpu/drm/i915/selftests/i915_gem_evict.c |    4 +-
 6 files changed, 1070 insertions(+), 808 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 00f8bcec5d71..36c1d19a9fef 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3513,7 +3513,7 @@ int __must_check i915_gem_evict_something(struct i915_address_space *vm,
 int __must_check i915_gem_evict_for_node(struct i915_address_space *vm,
 					 struct drm_mm_node *node,
 					 unsigned int flags);
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle);
+int i915_gem_evict_vm(struct i915_address_space *vm);
 
 /* belongs in i915_gem_gtt.h */
 static inline void i915_gem_chipset_flush(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 2a6eb2ceff79..9d4aa51c3d0a 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -50,6 +50,30 @@ static bool ggtt_is_idle(struct drm_i915_private *dev_priv)
 	return true;
 }
 
+static int ggtt_flush(struct drm_i915_private *i915)
+{
+	int err;
+
+	/* Not everything in the GGTT is tracked via vma (otherwise we
+	 * could evict as required with minimal stalling) so we are forced
+	 * to idle the GPU and explicitly retire outstanding requests in
+	 * the hopes that we can then remove contexts and the like only
+	 * bound by their active reference.
+	 */
+	err = i915_gem_switch_to_kernel_context(i915);
+	if (err)
+		return err;
+
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_INTERRUPTIBLE |
+				     I915_WAIT_LOCKED);
+	if (err)
+		return err;
+
+	i915_gem_retire_requests(i915);
+	return 0;
+}
+
 static bool
 mark_free(struct drm_mm_scan *scan,
 	  struct i915_vma *vma,
@@ -175,23 +199,10 @@ i915_gem_evict_something(struct i915_address_space *vm,
 		return intel_has_pending_fb_unpin(dev_priv) ? -EAGAIN : -ENOSPC;
 	}
 
-	/* Not everything in the GGTT is tracked via vma (otherwise we
-	 * could evict as required with minimal stalling) so we are forced
-	 * to idle the GPU and explicitly retire outstanding requests in
-	 * the hopes that we can then remove contexts and the like only
-	 * bound by their active reference.
-	 */
-	ret = i915_gem_switch_to_kernel_context(dev_priv);
-	if (ret)
-		return ret;
-
-	ret = i915_gem_wait_for_idle(dev_priv,
-				     I915_WAIT_INTERRUPTIBLE |
-				     I915_WAIT_LOCKED);
+	ret = ggtt_flush(dev_priv);
 	if (ret)
 		return ret;
 
-	i915_gem_retire_requests(dev_priv);
 	goto search_again;
 
 found:
@@ -338,10 +349,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 /**
  * i915_gem_evict_vm - Evict all idle vmas from a vm
  * @vm: Address space to cleanse
- * @do_idle: Boolean directing whether to idle first.
  *
- * This function evicts all idles vmas from a vm. If all unpinned vmas should be
- * evicted the @do_idle needs to be set to true.
+ * This function evicts all vmas from a vm.
  *
  * This is used by the execbuf code as a last-ditch effort to defragment the
  * address space.
@@ -349,38 +358,50 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
  * To clarify: This is for freeing up virtual address space, not for freeing
  * memory in e.g. the shrinker.
  */
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle)
+int i915_gem_evict_vm(struct i915_address_space *vm)
 {
+	struct list_head *phases[] = {
+		&vm->inactive_list,
+		&vm->active_list,
+		NULL
+	}, **phase;
+	struct list_head eviction_list;
 	struct i915_vma *vma, *next;
 	int ret;
 
 	lockdep_assert_held(&vm->i915->drm.struct_mutex);
 	trace_i915_gem_evict_vm(vm);
 
-	if (do_idle) {
-		struct drm_i915_private *dev_priv = vm->i915;
-
-		if (i915_is_ggtt(vm)) {
-			ret = i915_gem_switch_to_kernel_context(dev_priv);
-			if (ret)
-				return ret;
-		}
-
-		ret = i915_gem_wait_for_idle(dev_priv,
-					     I915_WAIT_INTERRUPTIBLE |
-					     I915_WAIT_LOCKED);
+	/* Switch back to the default context in order to unpin
+	 * the existing context objects. However, such objects only
+	 * pin themselves inside the global GTT and performing the
+	 * switch otherwise is ineffective.
+	 */
+	if (i915_is_ggtt(vm)) {
+		ret = ggtt_flush(vm->i915);
 		if (ret)
 			return ret;
-
-		i915_gem_retire_requests(dev_priv);
-		WARN_ON(!list_empty(&vm->active_list));
 	}
 
-	list_for_each_entry_safe(vma, next, &vm->inactive_list, vm_link)
-		if (!i915_vma_is_pinned(vma))
-			WARN_ON(i915_vma_unbind(vma));
+	INIT_LIST_HEAD(&eviction_list);
+	phase = phases;
+	do {
+		list_for_each_entry(vma, *phase, vm_link) {
+			if (i915_vma_is_pinned(vma))
+				continue;
+
+			__i915_vma_pin(vma);
+			list_add(&vma->evict_link, &eviction_list);
+		}
+	} while (*++phase);
 
-	return 0;
+	ret = 0;
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
+		__i915_vma_unpin(vma);
+		if (ret == 0)
+			ret = i915_vma_unbind(vma);
+	}
+	return ret;
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 77ca777a7da1..6ff282c225d0 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -43,14 +43,131 @@
 
 #define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
 
-#define  __EXEC_OBJECT_HAS_PIN		(1<<31)
-#define  __EXEC_OBJECT_HAS_FENCE	(1<<30)
-#define  __EXEC_OBJECT_NEEDS_MAP	(1<<29)
-#define  __EXEC_OBJECT_NEEDS_BIAS	(1<<28)
+#define  __EXEC_OBJECT_HAS_PIN		BIT(31)
+#define  __EXEC_OBJECT_HAS_FENCE	BIT(30)
+#define  __EXEC_OBJECT_NEEDS_MAP	BIT(29)
+#define  __EXEC_OBJECT_NEEDS_BIAS	BIT(28)
 #define  __EXEC_OBJECT_INTERNAL_FLAGS (0xf<<28) /* all of the above */
+#define __EB_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
+
+#define __EXEC_HAS_RELOC	BIT(31)
+#define __EXEC_VALIDATED	BIT(30)
+#define UPDATE			PIN_OFFSET_FIXED
 
 #define BATCH_OFFSET_BIAS (256*1024)
 
+/**
+ * DOC: User command execution
+ *
+ * Userspace submits commands to be executed on the GPU as an instruction
+ * stream within a GEM object we call a batchbuffer. These instructions may
+ * refer to other GEM objects containing auxiliary state such as kernels,
+ * samplers, render targets and even secondary batchbuffers. Userspace does
+ * not know where in the GPU memory these objects reside and so before the
+ * batchbuffer is passed to the GPU for execution, those addresses in the
+ * batchbuffer and auxiliary objects are updated. This is known as relocation,
+ * or patching. To try and avoid having to relocate each object on the next
+ * execution, userspace is told the location of those objects in this pass,
+ * but this remains just a hint as the kernel may choose a new location for
+ * any object in the future.
+ *
+ * Processing an execbuf ioctl is conceptually split up into a few phases.
+ *
+ * 1. Validation - Ensure all the pointers, handles and flags are valid.
+ * 2. Reservation - Assign GPU address space for every object
+ * 3. Relocation - Update any addresses to point to the final locations
+ * 4. Serialisation - Order the request with respect to its dependencies
+ * 5. Construction - Construct a request to execute the batchbuffer
+ * 6. Submission (at some point in the future execution)
+ *
+ * Reserving resources for the execbuf is the most complicated phase. We
+ * neither want to have to migrate the object in the address space, nor do
+ * we want to have to update any relocations pointing to this object. Ideally,
+ * we want to leave the object where it is and for all the existing relocations
+ * to match. If the object is given a new address, or if userspace thinks the
+ * object is elsewhere, we have to parse all the relocation entries and update
+ * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
+ * all the target addresses in all of its objects match the value in the
+ * relocation entries and that they all match the presumed offsets given by the
+ * list of execbuffer objects. Using this knowledge, we know that if we haven't
+ * moved any buffers, all the relocation entries are valid and we can skip
+ * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
+ * hang.)
+ *
+ * The reservation is done in multiple phases. First we try to keep any
+ * object already bound in its current location - so long as it meets the
+ * constraints imposed by the new execbuffer. Any object left unbound after the
+ * first pass is then fitted into any available idle space. If an object does
+ * not fit, all objects are removed from the reservation and the process rerun
+ * after sorting the objects into a priority order (more difficult to fit
+ * objects are tried first). Failing that, the entire VM is cleared and we try
+ * to fit the execbuf one last time before concluding that it simply will not
+ * fit.
+ *
+ * A small complication to all of this is that we allow userspace not only to
+ * specify an alignment and a size for the object in the address space, but
+ * we also allow userspace to specify the exact offset. These objects are
+ * simpler to place (the location is known a priori): all we have to do is make
+ * sure the space is available.
+ *
+ * Once all the objects are in place, patching up the buried pointers to point
+ * to the final locations is a fairly simple job of walking over the relocation
+ * entry arrays, looking up the right address and rewriting the value into
+ * the object. Simple! ... The relocation entries are stored in user memory
+ * and so to access them we have to copy them into a local buffer. That copy
+ * has to avoid taking any pagefaults as they may lead back to a GEM object
+ * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
+ * the relocation into multiple passes. First we try to do everything within an
+ * atomic context (avoid the pagefaults) which requires that we never wait. If
+ * we detect that we may wait, or if we need to fault, then we have to fallback
+ * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm
+ * bells yet?) Dropping the mutex means that we lose all the state we have
+ * built up so far for the execbuf and we must reset any global data. However,
+ * we do leave the objects pinned in their final locations - which is a
+ * potential issue for concurrent execbufs. Once we have left the mutex, we can
+ * allocate and copy all the relocation entries into a large array at our
+ * leisure, reacquire the mutex, reclaim all the objects and other state and
+ * then proceed to update any incorrect addresses with the objects.
+ *
+ * As we process the relocation entries, we maintain a record of whether the
+ * object is being written to. Using NORELOC, we expect userspace to provide
+ * this information instead. We also check whether we can skip the relocation
+ * by comparing the expected value inside the relocation entry with the target's
+ * final address. If they differ, we have to map the current object and rewrite
+ * the 4 or 8 byte pointer within.
+ *
+ * Serialising an execbuf is quite simple according to the rules of the GEM
+ * ABI. Execution within each context is ordered by the order of submission.
+ * Writes to any GEM object are in order of submission and are exclusive. Reads
+ * from a GEM object are unordered with respect to other reads, but ordered by
+ * writes. A write submitted after a read cannot occur before the read, and
+ * similarly any read submitted after a write cannot occur before the write.
+ * Writes are ordered between engines such that only one write occurs at any
+ * time (completing any reads beforehand) - using semaphores where available
+ * and CPU serialisation otherwise. Other GEM accesses obey the same rules: any
+ * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
+ * reads before starting, and any read (either using set-domain or pread) must
+ * flush all GPU writes before starting. (Note we only employ a barrier before,
+ * we currently rely on userspace not concurrently starting a new execution
+ * whilst reading or writing to an object. This may be an advantage or not
+ * depending on how much you trust userspace not to shoot themselves in the
+ * foot.) Serialisation may just result in the request being inserted into
+ * a DAG awaiting its turn, but most simple is to wait on the CPU until
+ * all dependencies are resolved.
+ *
+ * After all of that, it is just a matter of closing the request and handing it to
+ * the hardware (well, leaving it in a queue to be executed). However, we also
+ * offer the ability for batchbuffers to be run with elevated privileges so
+ * that they access otherwise hidden registers. (Used to adjust L3 cache etc.)
+ * Before any batch is given extra privileges we first must check that it
+ * contains no nefarious instructions, we check that each instruction is from
+ * our whitelist and all registers are also from an allowed list. We first
+ * copy the user's batchbuffer to a shadow (so that the user doesn't have
+ * access to it, either by the CPU or GPU as we scan it) and then parse each
+ * instruction. If everything is ok, we set a flag telling the hardware to run
+ * the batchbuffer in trusted mode, otherwise the ioctl is rejected.
+ */
+
 struct i915_execbuffer {
 	struct drm_i915_private *i915;
 	struct drm_file *file;
@@ -61,28 +178,61 @@ struct i915_execbuffer {
 	struct i915_address_space *vm;
 	struct i915_vma *batch;
 	struct drm_i915_gem_request *request;
-	u32 batch_start_offset;
-	u32 batch_len;
-	unsigned int dispatch_flags;
-	struct drm_i915_gem_exec_object2 shadow_exec_entry;
-	bool need_relocs;
-	struct list_head vmas;
+	struct list_head unbound;
+	struct list_head relocs;
 	struct reloc_cache {
 		struct drm_mm_node node;
 		unsigned long vaddr;
 		unsigned int page;
 		bool use_64bit_reloc;
+		bool has_llc;
+		bool has_fence;
 	} reloc_cache;
-	int lut_mask;
+	u64 invalid_flags;
+	u32 context_flags;
+	u32 dispatch_flags;
+	u32 batch_start_offset;
+	u32 batch_len;
+	int lut_size;
 	struct hlist_head *buckets;
 };
 
+#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+
+/* Used to convert any address to canonical form.
+ * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
+ * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
+ * addresses to be in a canonical form:
+ * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
+ * canonical form [63:48] == [47]."
+ */
+#define GEN8_HIGH_ADDRESS_BIT 47
+static inline u64 gen8_canonical_addr(u64 address)
+{
+	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+}
+
+static inline u64 gen8_noncanonical_addr(u64 address)
+{
+	return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
+}
+
 static int
 eb_create(struct i915_execbuffer *eb)
 {
 	if ((eb->args->flags & I915_EXEC_HANDLE_LUT) == 0) {
 		unsigned int size = 1 + ilog2(eb->args->buffer_count);
 
+		/* Without a 1:1 association between relocation handles and
+		 * the execobject[] index, we instead create a hashtable.
+		 * We size it dynamically based on available memory, starting
+		 * first with a 1:1 associative hash and scaling back until
+		 * the allocation succeeds.
+		 *
+		 * Later on we use a positive lut_size to indicate we are
+		 * using this hashtable, and a negative value to indicate a
+		 * direct lookup.
+		 */
 		do {
 			eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
 					     GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
@@ -97,89 +247,348 @@ eb_create(struct i915_execbuffer *eb)
 				return -ENOMEM;
 		}
 
-		eb->lut_mask = size;
+		eb->lut_size = size;
 	} else
-		eb->lut_mask = -eb->args->buffer_count;
+		eb->lut_size = -eb->args->buffer_count;
 
 	return 0;
 }
 
+static bool
+eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
+		 const struct i915_vma *vma)
+{
+	if ((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0)
+		return true;
+
+	if (vma->node.size < entry->pad_to_size)
+		return true;
+
+	if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
+		return true;
+
+	if (entry->flags & EXEC_OBJECT_PINNED &&
+	    vma->node.start != entry->offset)
+		return true;
+
+	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+	    vma->node.start < BATCH_OFFSET_BIAS)
+		return true;
+
+	if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
+	    (vma->node.start + vma->node.size - 1) >> 32)
+		return true;
+
+	return false;
+}
+
+static void
+eb_pin_vma(struct i915_execbuffer *eb,
+	   struct drm_i915_gem_exec_object2 *entry,
+	   struct i915_vma *vma)
+{
+	u64 flags;
+
+	flags = vma->node.start;
+	flags |= PIN_USER | PIN_NONBLOCK | PIN_OFFSET_FIXED;
+	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
+		flags |= PIN_GLOBAL;
+	if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
+		return;
+
+	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
+		if (unlikely(i915_vma_get_fence(vma))) {
+			i915_vma_unpin(vma);
+			return;
+		}
+
+		if (i915_vma_pin_fence(vma))
+			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+	}
+
+	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+}
+
 static inline void
 __eb_unreserve_vma(struct i915_vma *vma,
 		   const struct drm_i915_gem_exec_object2 *entry)
 {
+	GEM_BUG_ON((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0);
+
 	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
 		i915_vma_unpin_fence(vma);
 
-	if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-		__i915_vma_unpin(vma);
+	__i915_vma_unpin(vma);
 }
 
-static void
-eb_unreserve_vma(struct i915_vma *vma)
+static inline void
+eb_unreserve_vma(struct i915_vma *vma,
+		 struct drm_i915_gem_exec_object2 *entry)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-
-	__eb_unreserve_vma(vma, entry);
-	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
+	if (entry->flags & __EXEC_OBJECT_HAS_PIN) {
+		__eb_unreserve_vma(vma, entry);
+		entry->flags &= ~__EB_RESERVED;
+	}
 }
 
-static void
-eb_reset(struct i915_execbuffer *eb)
+static int
+eb_add_vma(struct i915_execbuffer *eb,
+	   struct drm_i915_gem_exec_object2 *entry,
+	   struct i915_vma *vma)
 {
-	struct i915_vma *vma;
+	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		eb_unreserve_vma(vma);
-		i915_vma_put(vma);
-		vma->exec_entry = NULL;
+	GEM_BUG_ON(i915_vma_is_closed(vma));
+
+	if ((eb->args->flags & __EXEC_VALIDATED) == 0) {
+		if (unlikely(entry->flags & eb->invalid_flags))
+			return -EINVAL;
+
+		if (unlikely(entry->alignment && !is_power_of_2(entry->alignment)))
+			return -EINVAL;
+
+		/* Offset can be used as input (EXEC_OBJECT_PINNED), reject
+		 * any non-page-aligned or non-canonical addresses.
+		 */
+		if (entry->flags & EXEC_OBJECT_PINNED) {
+			if (unlikely(entry->offset !=
+				     gen8_canonical_addr(entry->offset & PAGE_MASK)))
+				return -EINVAL;
+		}
+
+		/* From drm_mm perspective address space is continuous,
+		 * so from this point we're always using non-canonical
+		 * form internally.
+		 */
+		entry->offset = gen8_noncanonical_addr(entry->offset);
+
+		/* pad_to_size was once a reserved field, so sanitize it */
+		if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) {
+			if (unlikely(offset_in_page(entry->pad_to_size)))
+				return -EINVAL;
+		} else {
+			entry->pad_to_size = 0;
+		}
+
+		if (unlikely(vma->exec_entry)) {
+			DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+				  entry->handle, (int)(entry - eb->exec));
+			return -EINVAL;
+		}
 	}
 
-	if (eb->lut_mask >= 0)
-		memset(eb->buckets, 0,
-		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
-}
+	vma->exec_entry = entry;
+	entry->rsvd2 = (uintptr_t)vma;
+	i915_vma_get(vma);
 
-#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+	if (eb->lut_size >= 0) {
+		vma->exec_handle = entry->handle;
+		hlist_add_head(&vma->exec_node,
+			       &eb->buckets[hash_32(entry->handle,
+						    eb->lut_size)]);
+	}
 
-static bool
-eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
+	if (entry->relocation_count)
+		list_add_tail(&vma->reloc_link, &eb->relocs);
+
+	if (!eb->reloc_cache.has_fence) {
+		entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
+	} else {
+		if (entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
+		    i915_gem_object_is_tiled(vma->obj))
+			entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
+	}
+
+	if ((entry->flags & EXEC_OBJECT_PINNED) == 0)
+		entry->flags |= eb->context_flags;
+
+	ret = 0;
+	if (vma->node.size)
+		eb_pin_vma(eb, entry, vma);
+	if (eb_vma_misplaced(entry, vma)) {
+		eb_unreserve_vma(vma, entry);
+
+		list_add_tail(&vma->exec_link, &eb->unbound);
+		if (drm_mm_node_allocated(&vma->node))
+			ret = i915_vma_unbind(vma);
+	} else {
+		if (entry->offset != vma->node.start) {
+			entry->offset = vma->node.start | UPDATE;
+			eb->args->flags |= __EXEC_HAS_RELOC;
+		}
+	}
+	return ret;
+}
+
+static inline int use_cpu_reloc(const struct reloc_cache *cache,
+				const struct drm_i915_gem_object *obj)
 {
-	if (unlikely(vma->exec_entry)) {
-		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
-			  eb->exec[i].handle, i);
+	if (!i915_gem_object_has_struct_page(obj))
 		return false;
+
+	if (DBG_USE_CPU_RELOC)
+		return DBG_USE_CPU_RELOC > 0;
+
+	return (cache->has_llc ||
+		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_level != I915_CACHE_NONE);
+}
+
+static int
+eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
+{
+	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	u64 flags;
+	int ret;
+
+	flags = PIN_USER | PIN_NONBLOCK;
+	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
+		flags |= PIN_GLOBAL;
+
+	if (!drm_mm_node_allocated(&vma->node)) {
+		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
+		 * limit address to the first 4GBs for unflagged objects.
+		 */
+		if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0)
+			flags |= PIN_ZONE_4G;
+
+		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+			flags |= PIN_MAPPABLE;
+
+		if (entry->flags & EXEC_OBJECT_PINNED) {
+			flags |= entry->offset | PIN_OFFSET_FIXED;
+			/* force overlapping PINNED checks */
+			flags &= ~PIN_NONBLOCK;
+		} else if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
+			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
 	}
-	list_add_tail(&vma->exec_link, &eb->vmas);
 
-	vma->exec_entry = &eb->exec[i];
-	if (eb->lut_mask >= 0) {
-		vma->exec_handle = eb->exec[i].handle;
-		hlist_add_head(&vma->exec_node,
-			       &eb->buckets[hash_32(vma->exec_handle,
-						    eb->lut_mask)]);
+	ret = i915_vma_pin(vma, entry->pad_to_size, entry->alignment, flags);
+	if (ret)
+		return ret;
+
+	if (entry->offset != vma->node.start) {
+		entry->offset = vma->node.start | UPDATE;
+		eb->args->flags |= __EXEC_HAS_RELOC;
 	}
+	entry->flags |= __EXEC_OBJECT_HAS_PIN;
 
-	i915_vma_get(vma);
-	eb->exec[i].rsvd2 = (uintptr_t)vma;
-	return true;
+	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
+		ret = i915_vma_get_fence(vma);
+		if (ret)
+			return ret;
+
+		if (i915_vma_pin_fence(vma))
+			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+	}
+
+	GEM_BUG_ON(eb_vma_misplaced(entry, vma));
+	return 0;
+}
+
+static int eb_reserve(struct i915_execbuffer *eb)
+{
+	const unsigned int count = eb->args->buffer_count;
+	struct list_head last;
+	struct i915_vma *vma;
+	unsigned int i, pass;
+	int ret;
+
+	/* Attempt to pin all of the buffers into the GTT.
+	 * This is done in 3 phases:
+	 *
+	 * 1a. Unbind all objects that do not match the GTT constraints for
+	 *     the execbuffer (fenceable, mappable, alignment etc).
+	 * 1b. Increment pin count for already bound objects.
+	 * 2.  Bind new objects.
+	 * 3.  Decrement pin count.
+	 *
+	 * This avoids unnecessary unbinding of later objects in order to make
+	 * room for the earlier objects *unless* we need to defragment.
+	 */
+
+	pass = 0;
+	ret = 0;
+	do {
+		list_for_each_entry(vma, &eb->unbound, exec_link) {
+			ret = eb_reserve_vma(eb, vma);
+			if (ret)
+				break;
+		}
+		if (ret != -ENOSPC || pass++)
+			return ret;
+
+		/* Resort *all* the objects into priority order */
+		INIT_LIST_HEAD(&eb->unbound);
+		INIT_LIST_HEAD(&last);
+		for (i = 0; i < count; i++) {
+			struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+
+			vma = to_ptr(struct i915_vma, entry->rsvd2);
+			eb_unreserve_vma(vma, entry);
+
+			if (entry->flags & EXEC_OBJECT_PINNED)
+				list_add(&vma->exec_link, &eb->unbound);
+			else if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+				list_add_tail(&vma->exec_link, &eb->unbound);
+			else
+				list_add_tail(&vma->exec_link, &last);
+		}
+		list_splice_tail(&last, &eb->unbound);
+
+		/* Too fragmented, unbind everything and retry */
+		ret = i915_gem_evict_vm(eb->vm);
+		if (ret)
+			return ret;
+	} while (1);
 }
 
-static inline struct hlist_head *ht_head(struct i915_gem_context *ctx,
-					 u32 handle)
+static inline struct hlist_head *
+ht_head(const struct i915_gem_context *ctx, u32 handle)
 {
 	return &ctx->vma.ht[hash_32(handle, ctx->vma.ht_bits)];
 }
 
+static int eb_batch_index(const struct i915_execbuffer *eb)
+{
+	return eb->args->buffer_count - 1;
+}
+
+static int eb_select_context(struct i915_execbuffer *eb)
+{
+	struct i915_gem_context *ctx;
+
+	ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1);
+	if (unlikely(IS_ERR(ctx)))
+		return PTR_ERR(ctx);
+
+	if (unlikely(i915_gem_context_is_banned(ctx))) {
+		DRM_DEBUG("Context %u tried to submit while banned\n",
+			  ctx->user_handle);
+		return -EIO;
+	}
+
+	eb->ctx = i915_gem_context_get(ctx);
+	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
+
+	eb->context_flags = 0;
+	if (ctx->flags & CONTEXT_NO_ZEROMAP)
+		eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
+
+	return 0;
+}
+
 static int
 eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 	const int count = eb->args->buffer_count;
 	struct i915_vma *vma;
+	struct idr *idr;
 	int slow_pass = -1;
-	int i;
+	int i, ret;
 
-	INIT_LIST_HEAD(&eb->vmas);
+	INIT_LIST_HEAD(&eb->relocs);
+	INIT_LIST_HEAD(&eb->unbound);
 
 	if (unlikely(eb->ctx->vma.ht_size & 1))
 		flush_work(&eb->ctx->vma.resize);
@@ -192,8 +601,9 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 			if (vma->ctx_handle != eb->exec[i].handle)
 				continue;
 
-			if (!eb_add_vma(eb, vma, i))
-				return -EINVAL;
+			ret = eb_add_vma(eb, &eb->exec[i], vma);
+			if (unlikely(ret))
+				return ret;
 
 			goto next_vma;
 		}
@@ -204,24 +614,25 @@ next_vma: ;
 	}
 
 	if (slow_pass < 0)
-		return 0;
+		goto out;
 
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
+	idr = &eb->file->object_idr;
 	for (i = slow_pass; i < count; i++) {
 		struct drm_i915_gem_object *obj;
 
 		if (eb->exec[i].rsvd2)
 			continue;
 
-		obj = to_intel_bo(idr_find(&eb->file->object_idr,
-					   eb->exec[i].handle));
+		obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
 		if (unlikely(!obj)) {
 			spin_unlock(&eb->file->table_lock);
 			DRM_DEBUG("Invalid object handle %d at index %d\n",
 				  eb->exec[i].handle, i);
-			return -ENOENT;
+			ret = -ENOENT;
+			goto err;
 		}
 
 		eb->exec[i].rsvd2 = 1 | (uintptr_t)obj;
@@ -242,11 +653,12 @@ next_vma: ;
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
-		obj = to_ptr(struct drm_i915_gem_object, eb->exec[i].rsvd2 & ~1);
+		obj = to_ptr(typeof(*obj), eb->exec[i].rsvd2 & ~1);
 		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
-			return PTR_ERR(vma);
+			ret = PTR_ERR(vma);
+			goto err;
 		}
 
 		/* First come, first served */
@@ -262,8 +674,9 @@ next_vma: ;
 			}
 		}
 
-		if (!eb_add_vma(eb, vma, i))
-			return -EINVAL;
+		ret = eb_add_vma(eb, &eb->exec[i], vma);
+		if (unlikely(ret))
+			goto err;
 	}
 	if (4*eb->ctx->vma.ht_count > 3*eb->ctx->vma.ht_size ||
 	    4*eb->ctx->vma.ht_count < eb->ctx->vma.ht_size) {
@@ -271,15 +684,10 @@ next_vma: ;
 		queue_work(system_highpri_wq, &eb->ctx->vma.resize);
 	}
 
-	return 0;
-}
-
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-
-	vma = to_ptr(struct i915_vma, eb->exec[eb->args->buffer_count-1].rsvd2);
+out:
+	/* take note of the batch buffer before we might reorder the lists */
+	i = eb_batch_index(eb);
+	eb->batch = to_ptr(struct i915_vma, eb->exec[i].rsvd2);
 
 	/*
 	 * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -290,24 +698,34 @@ eb_get_batch(struct i915_execbuffer *eb)
 	 * Note that actual hangs have only been observed on gen7, but for
 	 * paranoia do it everywhere.
 	 */
-	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
-		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if ((eb->exec[i].flags & EXEC_OBJECT_PINNED) == 0)
+		eb->exec[i].flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if (eb->reloc_cache.has_fence)
+		eb->exec[i].flags |= EXEC_OBJECT_NEEDS_FENCE;
 
-	return vma;
+	eb->args->flags |= __EXEC_VALIDATED;
+	return eb_reserve(eb);
+
+err:
+	for (i = slow_pass; i < count; i++) {
+		if (eb->exec[i].rsvd2 & 1)
+			eb->exec[i].rsvd2 = 0;
+	}
+	return ret;
 }
 
 static struct i915_vma *
-eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
+eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle)
 {
-	if (eb->lut_mask < 0) {
-		if (handle >= -eb->lut_mask)
+	if (eb->lut_size < 0) {
+		if (handle >= -eb->lut_size)
 			return NULL;
 		return to_ptr(struct i915_vma, eb->exec[handle].rsvd2);
 	} else {
 		struct hlist_head *head;
 		struct i915_vma *vma;
 
-		head = &eb->buckets[hash_32(handle, eb->lut_mask)];
+		head = &eb->buckets[hash_32(handle, eb->lut_size)];
 		hlist_for_each_entry(vma, head, exec_node) {
 			if (vma->exec_handle == handle)
 				return vma;
@@ -316,61 +734,60 @@ eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 	}
 }
 
-static void eb_destroy(struct i915_execbuffer *eb)
+static void
+eb_reset(const struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
 
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		if (!vma->exec_entry)
-			continue;
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 
-		__eb_unreserve_vma(vma, vma->exec_entry);
+		eb_unreserve_vma(vma, entry);
 		vma->exec_entry = NULL;
 		i915_vma_put(vma);
 	}
 
-	i915_gem_context_put(eb->ctx);
-
-	if (eb->lut_mask >= 0)
-		kfree(eb->buckets);
+	if (eb->lut_size >= 0)
+		memset(eb->buckets, 0,
+		       sizeof(struct hlist_head) << eb->lut_size);
 }
 
-static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
+static void eb_release_vma(const struct i915_execbuffer *eb)
 {
-	if (!i915_gem_object_has_struct_page(obj))
-		return false;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (!eb->exec)
+		return;
 
-	return (HAS_LLC(to_i915(obj->base.dev)) ||
-		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
-		obj->cache_level != I915_CACHE_NONE);
-}
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 
-/* Used to convert any address to canonical form.
- * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
- * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
- * addresses to be in a canonical form:
- * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
- * canonical form [63:48] == [47]."
- */
-#define GEN8_HIGH_ADDRESS_BIT 47
-static inline uint64_t gen8_canonical_addr(uint64_t address)
-{
-	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+		if (!vma || !vma->exec_entry)
+			continue;
+
+		GEM_BUG_ON(vma->exec_entry != entry);
+		if (entry->flags & __EXEC_OBJECT_HAS_PIN)
+			__eb_unreserve_vma(vma, entry);
+		vma->exec_entry = NULL;
+		i915_vma_put(vma);
+	}
 }
 
-static inline uint64_t gen8_noncanonical_addr(uint64_t address)
+static void eb_destroy(const struct i915_execbuffer *eb)
 {
-	return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
+	if (eb->lut_size >= 0)
+		kfree(eb->buckets);
 }
 
-static inline uint64_t
+static inline u64
 relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
-		  uint64_t target_offset)
+		  const struct i915_vma *target)
 {
-	return gen8_canonical_addr((int)reloc->delta + target_offset);
+	return gen8_canonical_addr((int)reloc->delta + target->node.start);
 }
 
 static void reloc_cache_init(struct reloc_cache *cache,
@@ -379,6 +796,8 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->has_llc = HAS_LLC(i915);
+	cache->has_fence = INTEL_GEN(i915) < 4;
 	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
 	cache->node.allocated = false;
 }
@@ -481,7 +900,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
 		struct i915_vma *vma;
 		int ret;
 
-		if (use_cpu_reloc(obj))
+		if (use_cpu_reloc(cache, obj))
 			return NULL;
 
 		ret = i915_gem_object_set_to_gtt_domain(obj, true);
@@ -569,25 +988,26 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 		*addr = value;
 }
 
-static int
-relocate_entry(struct drm_i915_gem_object *obj,
+static u64
+relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
-	       struct reloc_cache *cache,
-	       u64 target_offset)
+	       struct i915_execbuffer *eb,
+	       const struct i915_vma *target)
 {
+	struct drm_i915_gem_object *obj = vma->obj;
 	u64 offset = reloc->offset;
-	bool wide = cache->use_64bit_reloc;
+	u64 target_offset = relocation_target(reloc, target);
+	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
-	target_offset = relocation_target(reloc, target_offset);
 repeat:
-	vaddr = reloc_vaddr(obj, cache, offset >> PAGE_SHIFT);
+	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
 		return PTR_ERR(vaddr);
 
 	clflush_write32(vaddr + offset_in_page(offset),
 			lower_32_bits(target_offset),
-			cache->vaddr);
+			eb->reloc_cache.vaddr);
 
 	if (wide) {
 		offset += sizeof(u32);
@@ -596,16 +1016,15 @@ relocate_entry(struct drm_i915_gem_object *obj,
 		goto repeat;
 	}
 
-	return 0;
+	return gen8_canonical_addr(target->node.start) | 1;
 }
 
-static int
-eb_relocate_entry(struct i915_vma *vma,
-		  struct i915_execbuffer *eb,
-		  struct drm_i915_gem_relocation_entry *reloc)
+static u64
+eb_relocate_entry(struct i915_execbuffer *eb,
+		  struct i915_vma *vma,
+		  const struct drm_i915_gem_relocation_entry *reloc)
 {
 	struct i915_vma *target;
-	u64 target_offset;
 	int ret;
 
 	/* we've already hold a reference to all valid objects */
@@ -636,26 +1055,28 @@ eb_relocate_entry(struct i915_vma *vma,
 		return -EINVAL;
 	}
 
-	if (reloc->write_domain)
+	if (reloc->write_domain) {
 		target->exec_entry->flags |= EXEC_OBJECT_WRITE;
 
-	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
-	 * pipe_control writes because the gpu doesn't properly redirect them
-	 * through the ppgtt for non_secure batchbuffers.
-	 */
-	if (unlikely(IS_GEN6(eb->i915) &&
-		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
-		ret = i915_vma_bind(target, target->obj->cache_level,
-				    PIN_GLOBAL);
-		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
-			return ret;
+		/* Sandybridge PPGTT errata: We need a global gtt mapping
+		 * for MI and pipe_control writes because the gpu doesn't
+		 * properly redirect them through the ppgtt for non_secure
+		 * batchbuffers.
+		 */
+		if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
+		    IS_GEN6(eb->i915)) {
+			ret = i915_vma_bind(target, target->obj->cache_level,
+					    PIN_GLOBAL);
+			if (WARN_ONCE(ret,
+				      "Unexpected failure to bind target VMA!"))
+				return ret;
+		}
 	}
 
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	target_offset = gen8_canonical_addr(target->node.start);
-	if (target_offset == reloc->presumed_offset)
+	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
@@ -676,34 +1097,33 @@ eb_relocate_entry(struct i915_vma *vma,
 		return -EINVAL;
 	}
 
-	ret = relocate_entry(vma->obj, reloc, &eb->reloc_cache, target_offset);
-	if (ret)
-		return ret;
-
 	/* and update the user's relocation entry */
-	reloc->presumed_offset = target_offset;
-	return 0;
+	return relocate_entry(vma, reloc, eb, target);
 }
 
-static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
+static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
-	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
-	struct drm_i915_gem_relocation_entry __user *user_relocs;
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	int remain, ret = 0;
-
-	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
+	struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
+	struct drm_i915_gem_relocation_entry __user *urelocs;
+	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	unsigned int remain;
 
+	urelocs = u64_to_user_ptr(entry->relocs_ptr);
 	remain = entry->relocation_count;
-	while (remain) {
-		struct drm_i915_gem_relocation_entry *r = stack_reloc;
-		unsigned long unwritten;
-		unsigned int count;
+	if (unlikely(remain > ULONG_MAX / sizeof(*urelocs)))
+		return -EINVAL;
 
-		count = min_t(unsigned int, remain, ARRAY_SIZE(stack_reloc));
-		remain -= count;
+	/*
+	 * We must check that the entire relocation array is safe
+	 * to read. However, if the array is not writable the user loses
+	 * the updated relocation values.
+	 */
 
+	do {
+		struct drm_i915_gem_relocation_entry *r = stack;
+		unsigned int count =
+			min_t(unsigned int, remain, ARRAY_SIZE(stack));
 		/* This is the fast path and we cannot handle a pagefault
 		 * whilst holding the struct mutex lest the user pass in the
 		 * relocations contained within a mmaped bo. For in such a case
@@ -712,66 +1132,66 @@ static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
 		 * this is bad and so lockdep complains vehemently.
 		 */
 		pagefault_disable();
-		unwritten = __copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]));
-		pagefault_enable();
-		if (unlikely(unwritten)) {
-			ret = -EFAULT;
+		if (__copy_from_user_inatomic(r, urelocs, count*sizeof(r[0]))) {
+			pagefault_enable();
+			remain = -EFAULT;
 			goto out;
 		}
+		pagefault_enable();
 
+		remain -= count;
 		do {
-			u64 offset = r->presumed_offset;
+			u64 offset = eb_relocate_entry(eb, vma, r);
 
-			ret = eb_relocate_entry(vma, eb, r);
-			if (ret)
+			if (likely(offset == 0)) {
+			} else if ((s64)offset < 0) {
+				remain = (s64)offset;
 				goto out;
-
-			if (r->presumed_offset != offset) {
+			} else {
+				/* Note that reporting an error now
+				 * leaves everything in an inconsistent
+				 * state as we have *already* changed
+				 * the relocation value inside the
+				 * object. As we have not changed the
+				 * reloc.presumed_offset or will not
+				 * change the execobject.offset, on the
+				 * call we may not rewrite the value
+				 * inside the object, leaving it
+				 * dangling and causing a GPU hang.
+				 */
 				pagefault_disable();
-				unwritten = __put_user(r->presumed_offset,
-						       &user_relocs->presumed_offset);
+				__put_user(offset & ~1,
+					   &urelocs[r-stack].presumed_offset);
 				pagefault_enable();
-				if (unlikely(unwritten)) {
-					/* Note that reporting an error now
-					 * leaves everything in an inconsistent
-					 * state as we have *already* changed
-					 * the relocation value inside the
-					 * object. As we have not changed the
-					 * reloc.presumed_offset or will not
-					 * change the execobject.offset, on the
-					 * call we may not rewrite the value
-					 * inside the object, leaving it
-					 * dangling and causing a GPU hang.
-					 */
-					ret = -EFAULT;
-					goto out;
-				}
 			}
-
-			user_relocs++;
-			r++;
-		} while (--count);
-	}
-
+		} while (r++, --count);
+		urelocs += ARRAY_SIZE(stack);
+	} while (remain);
 out:
 	reloc_cache_reset(&eb->reloc_cache);
-	return ret;
+	return remain;
 #undef N_RELOC
 }
 
 static int
-eb_relocate_vma_slow(struct i915_vma *vma,
-		     struct i915_execbuffer *eb,
-		     struct drm_i915_gem_relocation_entry *relocs)
+eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma)
 {
 	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	int i, ret = 0;
+	struct drm_i915_gem_relocation_entry *relocs =
+		to_ptr(typeof(*relocs), entry->relocs_ptr);
+	unsigned int i;
+	int ret;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = eb_relocate_entry(vma, eb, &relocs[i]);
-		if (ret)
-			break;
+		u64 offset = eb_relocate_entry(eb, vma, &relocs[i]);
+
+		if ((s64)offset < 0) {
+			ret = (s64)offset;
+			goto err;
+		}
 	}
+	ret = 0;
+err:
 	reloc_cache_reset(&eb->reloc_cache);
 	return ret;
 }
@@ -779,299 +1199,184 @@ eb_relocate_vma_slow(struct i915_vma *vma,
 static int eb_relocate(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
-	int ret = 0;
 
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		ret = eb_relocate_vma(vma, eb);
+	list_for_each_entry(vma, &eb->relocs, reloc_link) {
+		int ret = eb_relocate_vma(eb, vma);
 		if (ret)
-			break;
+			return ret;
 	}
 
-	return ret;
-}
-
-static bool only_mappable_for_reloc(unsigned int flags)
-{
-	return (flags & (EXEC_OBJECT_NEEDS_FENCE | __EXEC_OBJECT_NEEDS_MAP)) ==
-		__EXEC_OBJECT_NEEDS_MAP;
+	return 0;
 }
 
-static int
-eb_reserve_vma(struct i915_vma *vma,
-	       struct intel_engine_cs *engine,
-	       bool *need_reloc)
+static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	uint64_t flags;
-	int ret;
-
-	flags = PIN_USER;
-	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
-		flags |= PIN_GLOBAL;
+	const unsigned long relocs_max =
+		ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry);
+	const char __user *addr, *end;
+	unsigned long size;
+	char __maybe_unused c;
+
+	size = entry->relocation_count;
+	if (size == 0)
+		return 0;
 
-	if (!drm_mm_node_allocated(&vma->node)) {
-		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
-		 * limit address to the first 4GBs for unflagged objects.
-		 */
-		if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0)
-			flags |= PIN_ZONE_4G;
-		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
-			flags |= PIN_GLOBAL | PIN_MAPPABLE;
-		if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
-			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
-		if (entry->flags & EXEC_OBJECT_PINNED)
-			flags |= entry->offset | PIN_OFFSET_FIXED;
-		if ((flags & PIN_MAPPABLE) == 0)
-			flags |= PIN_HIGH;
-	}
-
-	ret = i915_vma_pin(vma,
-			   entry->pad_to_size,
-			   entry->alignment,
-			   flags);
-	if ((ret == -ENOSPC || ret == -E2BIG) &&
-	    only_mappable_for_reloc(entry->flags))
-		ret = i915_vma_pin(vma,
-				   entry->pad_to_size,
-				   entry->alignment,
-				   flags & ~PIN_MAPPABLE);
-	if (ret)
-		return ret;
+	if (size > relocs_max)
+		return -EINVAL;
 
-	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+	addr = u64_to_user_ptr(entry->relocs_ptr);
+	size *= sizeof(struct drm_i915_gem_relocation_entry);
+	if (!access_ok(VERIFY_WRITE, addr, size))
+		return -EFAULT;
 
-	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
-		ret = i915_vma_get_fence(vma);
+	end = addr + size;
+	for (; addr < end; addr += PAGE_SIZE) {
+		int ret = __get_user(c, addr);
 		if (ret)
 			return ret;
-
-		if (i915_vma_pin_fence(vma))
-			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
-	}
-
-	if (entry->offset != vma->node.start) {
-		entry->offset = vma->node.start;
-		*need_reloc = true;
 	}
-
-	return 0;
+	return __get_user(c, end - 1);
 }
 
-static bool
-need_reloc_mappable(struct i915_vma *vma)
+static int
+eb_copy_relocations(const struct i915_execbuffer *eb)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-
-	if (entry->relocation_count == 0)
-		return false;
-
-	if (!i915_vma_is_ggtt(vma))
-		return false;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
+	int ret;
 
-	/* See also use_cpu_reloc() */
-	if (HAS_LLC(to_i915(vma->obj->base.dev)))
-		return false;
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_relocation_entry __user *urelocs;
+		struct drm_i915_gem_relocation_entry *relocs;
+		unsigned int nreloc = eb->exec[i].relocation_count, j;
+		unsigned long size;
 
-	if (vma->obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-		return false;
+		if (nreloc == 0)
+			continue;
 
-	return true;
-}
+		ret = check_relocations(&eb->exec[i]);
+		if (ret)
+			goto err;
 
-static bool
-eb_vma_misplaced(struct i915_vma *vma)
-{
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+		urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
+		size = nreloc * sizeof(*relocs);
 
-	WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
-		!i915_vma_is_ggtt(vma));
+		relocs = drm_malloc_gfp(size, 1, GFP_TEMPORARY);
+		if (!relocs) {
+			ret = -ENOMEM;
+			goto err;
+		}
 
-	if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
-		return true;
+		/* copy_from_user is limited to 4GiB */
+		j = 0;
+		do {
+			u32 len = min_t(u64, 1ull<<31, size);
 
-	if (vma->node.size < entry->pad_to_size)
-		return true;
+			if (__copy_from_user(relocs + j, urelocs + j, len)) {
+				ret = -EFAULT;
+				goto err;
+			}
 
-	if (entry->flags & EXEC_OBJECT_PINNED &&
-	    vma->node.start != entry->offset)
-		return true;
+			size -= len;
+			BUILD_BUG_ON_NOT_POWER_OF_2(sizeof(*relocs));
+			j += len / sizeof(*relocs);
+		} while (size);
 
-	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
-	    vma->node.start < BATCH_OFFSET_BIAS)
-		return true;
+		/* As we do not update the known relocation offsets after
+		 * relocating (due to the complexities in lock handling),
+		 * we need to mark them as invalid now so that we force the
+		 * relocation processing next time. Just in case the target
+		 * object is evicted and then rebound into its old
+		 * presumed_offset before the next execbuffer - if that
+		 * happened we would make the mistake of assuming that the
+		 * relocations were valid.
+		 */
+		user_access_begin();
+		for (j = 0; j < nreloc; j++)
+			unsafe_put_user(-1,
+					&urelocs[j].presumed_offset,
+					end_user);
+end_user:
+		user_access_end();
 
-	/* avoid costly ping-pong once a batch bo ended up non-mappable */
-	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
-	    !i915_vma_is_map_and_fenceable(vma))
-		return !only_mappable_for_reloc(entry->flags);
+		eb->exec[i].relocs_ptr = (uintptr_t)relocs;
+	}
 
-	if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
-	    (vma->node.start + vma->node.size - 1) >> 32)
-		return true;
+	return 0;
 
-	return false;
+err:
+	while (i--) {
+		struct drm_i915_gem_relocation_entry *relocs =
+			to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr);
+		if (eb->exec[i].relocation_count)
+			drm_free_large(relocs);
+	}
+	return ret;
 }
 
-static int eb_reserve(struct i915_execbuffer *eb)
+static int eb_prefault_relocations(const struct i915_execbuffer *eb)
 {
-	const bool has_fenced_gpu_access = INTEL_GEN(eb->i915) < 4;
-	struct i915_vma *vma;
-	struct list_head ordered_vmas;
-	struct list_head pinned_vmas;
-	int retry;
-
-	INIT_LIST_HEAD(&ordered_vmas);
-	INIT_LIST_HEAD(&pinned_vmas);
-	while (!list_empty(&eb->vmas)) {
-		struct drm_i915_gem_exec_object2 *entry;
-		bool need_fence, need_mappable;
-
-		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_link);
-		entry = vma->exec_entry;
-
-		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
-			entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
-
-		if (!has_fenced_gpu_access)
-			entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
-		need_fence =
-			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
-			i915_gem_object_is_tiled(vma->obj);
-		need_mappable = need_fence || need_reloc_mappable(vma);
-
-		if (entry->flags & EXEC_OBJECT_PINNED)
-			list_move_tail(&vma->exec_link, &pinned_vmas);
-		else if (need_mappable) {
-			entry->flags |= __EXEC_OBJECT_NEEDS_MAP;
-			list_move(&vma->exec_link, &ordered_vmas);
-		} else
-			list_move_tail(&vma->exec_link, &ordered_vmas);
-	}
-	list_splice(&ordered_vmas, &eb->vmas);
-	list_splice(&pinned_vmas, &eb->vmas);
-
-	/* Attempt to pin all of the buffers into the GTT.
-	 * This is done in 3 phases:
-	 *
-	 * 1a. Unbind all objects that do not match the GTT constraints for
-	 *     the execbuffer (fenceable, mappable, alignment etc).
-	 * 1b. Increment pin count for already bound objects.
-	 * 2.  Bind new objects.
-	 * 3.  Decrement pin count.
-	 *
-	 * This avoid unnecessary unbinding of later objects in order to make
-	 * room for the earlier objects *unless* we need to defragment.
-	 */
-	retry = 0;
-	do {
-		int ret = 0;
-
-		/* Unbind any ill-fitting objects or pin. */
-		list_for_each_entry(vma, &eb->vmas, exec_link) {
-			if (!drm_mm_node_allocated(&vma->node))
-				continue;
-
-			if (eb_vma_misplaced(vma))
-				ret = i915_vma_unbind(vma);
-			else
-				ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
-			if (ret)
-				goto err;
-		}
-
-		/* Bind fresh objects */
-		list_for_each_entry(vma, &eb->vmas, exec_link) {
-			if (drm_mm_node_allocated(&vma->node))
-				continue;
-
-			ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
-			if (ret)
-				goto err;
-		}
-
-err:
-		if (ret != -ENOSPC || retry++)
-			return ret;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
 
-		/* Decrement pin count for bound objects */
-		list_for_each_entry(vma, &eb->vmas, exec_link)
-			eb_unreserve_vma(vma);
+	for (i = 0; i < count; i++) {
+		int ret;
 
-		ret = i915_gem_evict_vm(eb->vm, true);
+		ret = check_relocations(&eb->exec[i]);
 		if (ret)
 			return ret;
-	} while (1);
+	}
+
+	return 0;
 }
 
-static int
-eb_relocate_slow(struct i915_execbuffer *eb)
+static int eb_relocate_slow(struct i915_execbuffer *eb)
 {
-	const unsigned int count = eb->args->buffer_count;
 	struct drm_device *dev = &eb->i915->drm;
-	struct drm_i915_gem_relocation_entry *reloc;
+	bool have_copy = false;
 	struct i915_vma *vma;
-	int *reloc_offset;
-	int i, total, ret;
+	int ret = 0;
+
+repeat:
+	if (signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		goto out;
+	}
 
 	/* We may process another execbuffer during the unlock... */
 	eb_reset(eb);
 	mutex_unlock(&dev->struct_mutex);
 
-	total = 0;
-	for (i = 0; i < count; i++)
-		total += eb->exec[i].relocation_count;
-
-	reloc_offset = drm_malloc_ab(count, sizeof(*reloc_offset));
-	reloc = drm_malloc_ab(total, sizeof(*reloc));
-	if (reloc == NULL || reloc_offset == NULL) {
-		drm_free_large(reloc);
-		drm_free_large(reloc_offset);
-		mutex_lock(&dev->struct_mutex);
-		return -ENOMEM;
+	/* We take 3 passes through the slowpath.
+	 *
+	 * 1 - we try to just prefault all the user relocation entries and
+	 * then attempt to reuse the atomic pagefault disabled fast path again.
+	 *
+	 * 2 - we copy the user entries to a local buffer here outside of the
+	 * lock and allow ourselves to wait upon any rendering before
+	 * relocations
+	 *
+	 * 3 - we already have a local copy of the relocation entries, but
+	 * were interrupted (EAGAIN) whilst waiting for the objects, try again.
+	 */
+	if (ret == 0 && likely(!i915.prefault_disable)) {
+		ret = eb_prefault_relocations(eb);
+	} else if (!have_copy) {
+		ret = eb_copy_relocations(eb);
+		have_copy = ret == 0;
+	} else {
+		cond_resched();
+		ret = 0;
 	}
-
-	total = 0;
-	for (i = 0; i < count; i++) {
-		struct drm_i915_gem_relocation_entry __user *user_relocs;
-		u64 invalid_offset = (u64)-1;
-		int j;
-
-		user_relocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
-
-		if (copy_from_user(reloc+total, user_relocs,
-				   eb->exec[i].relocation_count * sizeof(*reloc))) {
-			ret = -EFAULT;
-			mutex_lock(&dev->struct_mutex);
-			goto err;
-		}
-
-		/* As we do not update the known relocation offsets after
-		 * relocating (due to the complexities in lock handling),
-		 * we need to mark them as invalid now so that we force the
-		 * relocation processing next time. Just in case the target
-		 * object is evicted and then rebound into its old
-		 * presumed_offset before the next execbuffer - if that
-		 * happened we would make the mistake of assuming that the
-		 * relocations were valid.
-		 */
-		for (j = 0; j < eb->exec[i].relocation_count; j++) {
-			if (__copy_to_user(&user_relocs[j].presumed_offset,
-					   &invalid_offset,
-					   sizeof(invalid_offset))) {
-				ret = -EFAULT;
-				mutex_lock(&dev->struct_mutex);
-				goto err;
-			}
-		}
-
-		reloc_offset[i] = total;
-		total += eb->exec[i].relocation_count;
+	if (ret) {
+		mutex_lock(&dev->struct_mutex);
+		goto out;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret) {
 		mutex_lock(&dev->struct_mutex);
-		goto err;
+		goto out;
 	}
 
 	/* reacquire the objects */
@@ -1079,16 +1384,18 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	if (ret)
 		goto err;
 
-	ret = eb_reserve(eb);
-	if (ret)
-		goto err;
-
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		int idx = vma->exec_entry - eb->exec;
-
-		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[idx]);
-		if (ret)
-			goto err;
+	list_for_each_entry(vma, &eb->relocs, reloc_link) {
+		if (!have_copy) {
+			pagefault_disable();
+			ret = eb_relocate_vma(eb, vma);
+			pagefault_enable();
+			if (ret)
+				goto repeat;
+		} else {
+			ret = eb_relocate_vma_slow(eb, vma);
+			if (ret)
+				goto err;
+		}
 	}
 
 	/* Leave the user relocations as are, this is the painfully slow path,
@@ -1098,21 +1405,67 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	 */
 
 err:
-	drm_free_large(reloc);
-	drm_free_large(reloc_offset);
-	return ret;
+	if (ret == -EAGAIN)
+		goto repeat;
+
+out:
+	if (have_copy) {
+		const unsigned int count = eb->args->buffer_count;
+		unsigned int i;
+
+		for (i = 0; i < count; i++) {
+			const struct drm_i915_gem_exec_object2 *entry =
+				&eb->exec[i];
+			struct drm_i915_gem_relocation_entry *relocs;
+
+			if (entry->relocation_count == 0)
+				continue;
+
+			relocs = to_ptr(typeof(*relocs), entry->relocs_ptr);
+			drm_free_large(relocs);
+		}
+	}
+
+	return ret ?: have_copy;
+}
+
+static void eb_export_fence(struct drm_i915_gem_object *obj,
+			    struct drm_i915_gem_request *req,
+			    unsigned int flags)
+{
+	struct reservation_object *resv = obj->resv;
+
+	/* Ignore errors from failing to allocate the new fence, we can't
+	 * handle an error right now. Worst case should be missed
+	 * synchronisation leading to rendering corruption.
+	 */
+	reservation_object_lock(resv, NULL);
+	if (flags & EXEC_OBJECT_WRITE)
+		reservation_object_add_excl_fence(resv, &req->fence);
+	else if (reservation_object_reserve_shared(resv) == 0)
+		reservation_object_add_shared_fence(resv, &req->fence);
+	reservation_object_unlock(resv);
+}
+
+static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
+{
+	return !(obj->cache_level == I915_CACHE_NONE ||
+		 obj->cache_level == I915_CACHE_WT);
 }
 
 static int
 eb_move_to_gpu(struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
 	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
+	for (i = 0; i < count; i++) {
+		const struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 		struct drm_i915_gem_object *obj = vma->obj;
 
-		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
+		if (entry->flags & EXEC_OBJECT_CAPTURE) {
 			struct i915_gem_capture_list *capture;
 
 			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
@@ -1124,8 +1477,8 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 			eb->request->capture_list = capture;
 		}
 
-		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
-			continue;
+		if (entry->flags & EXEC_OBJECT_ASYNC)
+			goto skip_flushes;
 
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
 			i915_gem_clflush_object(obj, 0);
@@ -1133,10 +1486,33 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 		}
 
 		ret = i915_gem_request_await_object
-			(eb->request, obj, vma->exec_entry->flags & EXEC_OBJECT_WRITE);
+			(eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
 		if (ret)
 			return ret;
+
+skip_flushes:
+		obj->base.write_domain = 0;
+		if (entry->flags & EXEC_OBJECT_WRITE) {
+			obj->base.read_domains = 0;
+			if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
+				obj->cache_dirty = true;
+			intel_fb_obj_invalidate(obj, ORIGIN_CS);
+		}
+		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
+
+		i915_vma_move_to_active(vma, eb->request, entry->flags);
+		__eb_unreserve_vma(vma, entry);
+		vma->exec_entry = NULL;
+	}
+
+	for (i = 0; i < count; i++) {
+		const struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
+
+		eb_export_fence(vma->obj, eb->request, entry->flags);
+		i915_vma_put(vma);
 	}
+	eb->exec = NULL;
 
 	/* Unconditionally flush any chipset caches (for streaming writes). */
 	i915_gem_chipset_flush(eb->i915);
@@ -1168,114 +1544,10 @@ i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 	return true;
 }
 
-static int
-validate_exec_list(struct drm_device *dev,
-		   struct drm_i915_gem_exec_object2 *exec,
-		   int count)
-{
-	unsigned relocs_total = 0;
-	unsigned relocs_max = UINT_MAX / sizeof(struct drm_i915_gem_relocation_entry);
-	unsigned invalid_flags;
-	int i;
-
-	/* INTERNAL flags must not overlap with external ones */
-	BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & ~__EXEC_OBJECT_UNKNOWN_FLAGS);
-
-	invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
-	if (USES_FULL_PPGTT(dev))
-		invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
-
-	for (i = 0; i < count; i++) {
-		char __user *ptr = u64_to_user_ptr(exec[i].relocs_ptr);
-		int length; /* limited by fault_in_pages_readable() */
-
-		if (exec[i].flags & invalid_flags)
-			return -EINVAL;
-
-		/* Offset can be used as input (EXEC_OBJECT_PINNED), reject
-		 * any non-page-aligned or non-canonical addresses.
-		 */
-		if (exec[i].flags & EXEC_OBJECT_PINNED) {
-			if (exec[i].offset !=
-			    gen8_canonical_addr(exec[i].offset & PAGE_MASK))
-				return -EINVAL;
-		}
-
-		/* From drm_mm perspective address space is continuous,
-		 * so from this point we're always using non-canonical
-		 * form internally.
-		 */
-		exec[i].offset = gen8_noncanonical_addr(exec[i].offset);
-
-		if (exec[i].alignment && !is_power_of_2(exec[i].alignment))
-			return -EINVAL;
-
-		/* pad_to_size was once a reserved field, so sanitize it */
-		if (exec[i].flags & EXEC_OBJECT_PAD_TO_SIZE) {
-			if (offset_in_page(exec[i].pad_to_size))
-				return -EINVAL;
-		} else {
-			exec[i].pad_to_size = 0;
-		}
-
-		/* First check for malicious input causing overflow in
-		 * the worst case where we need to allocate the entire
-		 * relocation tree as a single array.
-		 */
-		if (exec[i].relocation_count > relocs_max - relocs_total)
-			return -EINVAL;
-		relocs_total += exec[i].relocation_count;
-
-		length = exec[i].relocation_count *
-			sizeof(struct drm_i915_gem_relocation_entry);
-		/*
-		 * We must check that the entire relocation array is safe
-		 * to read, but since we may need to update the presumed
-		 * offsets during execution, check for full write access.
-		 */
-		if (!access_ok(VERIFY_WRITE, ptr, length))
-			return -EFAULT;
-
-		if (likely(!i915.prefault_disable)) {
-			if (fault_in_pages_readable(ptr, length))
-				return -EFAULT;
-		}
-	}
-
-	return 0;
-}
-
-static int eb_select_context(struct i915_execbuffer *eb)
-{
-	unsigned int ctx_id = i915_execbuffer2_get_context_id(*eb->args);
-	struct i915_gem_context *ctx;
-
-	ctx = i915_gem_context_lookup(eb->file->driver_priv, ctx_id);
-	if (unlikely(IS_ERR(ctx)))
-		return PTR_ERR(ctx);
-
-	if (unlikely(i915_gem_context_is_banned(ctx))) {
-		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
-		return -EIO;
-	}
-
-	eb->ctx = i915_gem_context_get(ctx);
-	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
-
-	return 0;
-}
-
-static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
-{
-	return !(obj->cache_level == I915_CACHE_NONE ||
-		 obj->cache_level == I915_CACHE_WT);
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
 			     struct drm_i915_gem_request *req,
 			     unsigned int flags)
 {
-	struct drm_i915_gem_object *obj = vma->obj;
 	const unsigned int idx = req->engine->id;
 
 	lockdep_assert_held(&req->i915->drm.struct_mutex);
@@ -1289,17 +1561,17 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 	 * *last*.
 	 */
 	if (!i915_vma_is_active(vma))
-		obj->active_count++;
+		vma->obj->active_count++;
 	i915_vma_set_active(vma, idx);
 	i915_gem_active_set(&vma->last_read[idx], req);
 	list_move_tail(&vma->vm_link, &vma->vm->active_list);
 
 	if (flags & EXEC_OBJECT_WRITE) {
+		struct drm_i915_gem_object *obj = vma->obj;
+
 		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
 			i915_gem_active_set(&obj->frontbuffer_write, req);
 
-		/* update for the implicit flush after a batch */
-		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
 		if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
 			obj->cache_dirty = true;
 	}
@@ -1308,42 +1580,6 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 		i915_gem_active_set(&vma->last_fence, req);
 }
 
-static void eb_export_fence(struct drm_i915_gem_object *obj,
-			    struct drm_i915_gem_request *req,
-			    unsigned int flags)
-{
-	struct reservation_object *resv = obj->resv;
-
-	/* Ignore errors from failing to allocate the new fence, we can't
-	 * handle an error right now. Worst case should be missed
-	 * synchronisation leading to rendering corruption.
-	 */
-	reservation_object_lock(resv, NULL);
-	if (flags & EXEC_OBJECT_WRITE)
-		reservation_object_add_excl_fence(resv, &req->fence);
-	else if (reservation_object_reserve_shared(resv) == 0)
-		reservation_object_add_shared_fence(resv, &req->fence);
-	reservation_object_unlock(resv);
-}
-
-static void
-eb_move_to_active(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		struct drm_i915_gem_object *obj = vma->obj;
-
-		obj->base.write_domain = 0;
-		if (vma->exec_entry->flags & EXEC_OBJECT_WRITE)
-			obj->base.read_domains = 0;
-		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
-
-		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
-		eb_export_fence(obj, eb->request, vma->exec_entry->flags);
-	}
-}
-
 static int
 i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 {
@@ -1355,16 +1591,16 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 		return -EINVAL;
 	}
 
-	cs = intel_ring_begin(req, 4 * 3);
+	cs = intel_ring_begin(req, 4 * 2 + 2);
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
+	*cs++ = MI_LOAD_REGISTER_IMM(4);
 	for (i = 0; i < 4; i++) {
-		*cs++ = MI_LOAD_REGISTER_IMM(1);
 		*cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i));
 		*cs++ = 0;
 	}
-
+	*cs++ = MI_NOOP;
 	intel_ring_advance(req, cs);
 
 	return 0;
@@ -1400,10 +1636,11 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 		goto out;
 
 	vma->exec_entry =
-		memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
+		memset(&eb->exec[eb->args->buffer_count++],
+		       0, sizeof(*vma->exec_entry));
 	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
-	i915_gem_object_get(shadow_batch_obj);
-	list_add_tail(&vma->exec_link, &eb->vmas);
+	vma->exec_entry->rsvd2 = (uintptr_t)vma;
+	i915_vma_get(vma);
 
 out:
 	i915_gem_object_unpin_pages(shadow_batch_obj);
@@ -1419,68 +1656,79 @@ add_to_client(struct drm_i915_gem_request *req,
 }
 
 static int
-execbuf_submit(struct i915_execbuffer *eb)
+eb_set_constants_offset(struct i915_execbuffer *eb)
 {
-	int instp_mode;
-	u32 instp_mask, *cs;
-	int ret;
-
-	ret = eb_move_to_gpu(eb);
-	if (ret)
-		return ret;
-
-	ret = i915_switch_context(eb->request);
-	if (ret)
-		return ret;
+	struct drm_i915_private *dev_priv = eb->i915;
+	u32 mode, mask;
+	u32 *cs;
 
-	instp_mode = eb->args->flags & I915_EXEC_CONSTANTS_MASK;
-	instp_mask = I915_EXEC_CONSTANTS_MASK;
-	switch (instp_mode) {
+	mode = eb->args->flags & I915_EXEC_CONSTANTS_MASK;
+	switch (mode) {
 	case I915_EXEC_CONSTANTS_REL_GENERAL:
 	case I915_EXEC_CONSTANTS_ABSOLUTE:
 	case I915_EXEC_CONSTANTS_REL_SURFACE:
-		if (instp_mode != 0 && eb->engine->id != RCS) {
-			DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
-			return -EINVAL;
-		}
-
-		if (instp_mode != eb->i915->relative_constants_mode) {
-			if (INTEL_INFO(eb->i915)->gen < 4) {
-				DRM_DEBUG("no rel constants on pre-gen4\n");
-				return -EINVAL;
-			}
-
-			if (INTEL_INFO(eb->i915)->gen > 5 &&
-			    instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
-				DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
-				return -EINVAL;
-			}
-
-			/* The HW changed the meaning on this bit on gen6 */
-			if (INTEL_INFO(eb->i915)->gen >= 6)
-				instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
-		}
 		break;
 	default:
-		DRM_DEBUG("execbuf with unknown constants: %d\n", instp_mode);
+		DRM_DEBUG("execbuf with unknown constants: %d\n", mode);
 		return -EINVAL;
 	}
 
-	if (eb->engine->id == RCS &&
-	    instp_mode != eb->i915->relative_constants_mode) {
-		cs = intel_ring_begin(eb->request, 4);
-		if (IS_ERR(cs))
-			return PTR_ERR(cs);
+	if (mode == dev_priv->relative_constants_mode)
+		return 0;
+
+	if (eb->engine->id != RCS) {
+		DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
+		return -EINVAL;
+	}
 
-		*cs++ = MI_NOOP;
-		*cs++ = MI_LOAD_REGISTER_IMM(1);
-		*cs++ = i915_mmio_reg_offset(INSTPM);
-		*cs++ = instp_mask << 16 | instp_mode;
-		intel_ring_advance(eb->request, cs);
+	if (INTEL_GEN(dev_priv) < 4) {
+		DRM_DEBUG("no rel constants on pre-gen4\n");
+		return -EINVAL;
+	}
 
-		eb->i915->relative_constants_mode = instp_mode;
+	if (INTEL_GEN(dev_priv) > 5 &&
+	    mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
+		DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
+		return -EINVAL;
 	}
 
+	/* The HW changed the meaning on this bit on gen6 */
+	mask = I915_EXEC_CONSTANTS_MASK;
+	if (INTEL_GEN(dev_priv) >= 6)
+		mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
+
+	cs = intel_ring_begin(eb->request, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = MI_NOOP;
+	*cs++ = MI_LOAD_REGISTER_IMM(1);
+	*cs++ = i915_mmio_reg_offset(INSTPM);
+	*cs++ = mask << 16 | mode;
+	intel_ring_advance(eb->request, cs);
+
+	dev_priv->relative_constants_mode = mode;
+
+	return 0;
+}
+
+static int
+eb_submit(struct i915_execbuffer *eb)
+{
+	int ret;
+
+	ret = eb_move_to_gpu(eb);
+	if (ret)
+		return ret;
+
+	ret = i915_switch_context(eb->request);
+	if (ret)
+		return ret;
+
+	ret = eb_set_constants_offset(eb);
+	if (ret)
+		return ret;
+
 	if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
 		ret = i915_reset_gen7_sol_offsets(eb->request);
 		if (ret)
@@ -1495,7 +1743,6 @@ execbuf_submit(struct i915_execbuffer *eb)
 	if (ret)
 		return ret;
 
-	eb_move_to_active(eb);
 	add_to_client(eb->request, eb->file);
 
 	return 0;
@@ -1632,18 +1879,18 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	int out_fence_fd = -1;
 	int ret;
 
-	if (!i915_gem_check_execbuffer(args))
-		return -EINVAL;
-
-	ret = validate_exec_list(dev, exec, args->buffer_count);
-	if (ret)
-		return ret;
+	BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & ~__EXEC_OBJECT_UNKNOWN_FLAGS);
 
 	eb.i915 = to_i915(dev);
 	eb.file = file;
 	eb.args = args;
+	if ((args->flags & I915_EXEC_NO_RELOC) == 0)
+		args->flags |= __EXEC_HAS_RELOC;
 	eb.exec = exec;
-	eb.need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+	eb.ctx = NULL;
+	eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
+	if (USES_FULL_PPGTT(eb.i915))
+		eb.invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
 	reloc_cache_init(&eb.reloc_cache, eb.i915);
 
 	eb.batch_start_offset = args->batch_start_offset;
@@ -1703,6 +1950,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		}
 	}
 
+	if (eb_create(&eb))
+		return -ENOMEM;
+
 	/* Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
 	 * process. Upon first dispatch, we acquire another prolonged
@@ -1710,60 +1960,46 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 * 100ms.
 	 */
 	intel_runtime_pm_get(eb.i915);
-
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
-		goto pre_mutex_err;
+		goto err_rpm;
 
 	ret = eb_select_context(&eb);
-	if (ret) {
-		mutex_unlock(&dev->struct_mutex);
-		goto pre_mutex_err;
-	}
-
-	if (eb_create(&eb)) {
-		i915_gem_context_put(eb.ctx);
-		mutex_unlock(&dev->struct_mutex);
-		ret = -ENOMEM;
-		goto pre_mutex_err;
-	}
+	if (unlikely(ret))
+		goto err_unlock;
 
-	/* Look up object handles */
 	ret = eb_lookup_vmas(&eb);
-	if (ret)
-		goto err;
-
-	/* take note of the batch buffer before we might reorder the lists */
-	eb.batch = eb_get_batch(&eb);
-
-	/* Move the objects en-masse into the GTT, evicting if necessary. */
-	ret = eb_reserve(&eb);
-	if (ret)
-		goto err;
+	if (unlikely(ret))
+		goto err_vma;
 
 	/* The objects are in their final locations, apply the relocations. */
-	if (eb.need_relocs)
+	if (args->flags & __EXEC_HAS_RELOC && !list_empty(&eb.relocs)) {
 		ret = eb_relocate(&eb);
-	if (ret) {
-		if (ret == -EFAULT) {
+		if (ret == -EAGAIN || ret == -EFAULT)
 			ret = eb_relocate_slow(&eb);
-			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
-		}
-		if (ret)
-			goto err;
+		if (ret && args->flags & I915_EXEC_NO_RELOC)
+			/* If the user expects the execobject.offset and
+			 * reloc.presumed_offset to be an exact match,
+			 * as for using NO_RELOC, then we cannot update
+			 * the execobject.offset until we have completed
+			 * relocation.
+			 */
+			args->flags &= ~__EXEC_HAS_RELOC;
+		if (ret < 0)
+			goto err_vma;
 	}
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE) {
+	if (unlikely(eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE)) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
-		goto err;
+		goto err_vma;
 	}
 	if (eb.batch_start_offset > eb.batch->size ||
 	    eb.batch_len > eb.batch->size - eb.batch_start_offset) {
 		DRM_DEBUG("Attempting to use out-of-bounds batch\n");
 		ret = -EINVAL;
-		goto err;
+		goto err_vma;
 	}
 
 	if (eb.engine->needs_cmd_parser && eb.batch_len) {
@@ -1772,7 +2008,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		vma = eb_parse(&eb, drm_is_current_master(file));
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
-			goto err;
+			goto err_vma;
 		}
 
 		if (vma) {
@@ -1786,7 +2022,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 			 * command parser has accepted.
 			 */
 			eb.dispatch_flags |= I915_DISPATCH_SECURE;
-			eb.batch_start_offset = 0;
+			eb.args->batch_start_offset = 0;
 			eb.batch = vma;
 		}
 	}
@@ -1798,7 +2034,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 * batch" bit. Hence we need to pin secure batches into the global gtt.
 	 * hsw should have this fixed, but bdw mucks it up again. */
 	if (eb.dispatch_flags & I915_DISPATCH_SECURE) {
-		struct drm_i915_gem_object *obj = eb.batch->obj;
 		struct i915_vma *vma;
 
 		/*
@@ -1811,10 +2046,10 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		 *   fitting due to fragmentation.
 		 * So this is actually safe.
 		 */
-		vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
+		vma = i915_gem_object_ggtt_pin(eb.batch->obj, NULL, 0, 0, 0);
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
-			goto err;
+			goto err_vma;
 		}
 
 		eb.batch = vma;
@@ -1850,7 +2085,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.request->batch = eb.batch;
 
 	trace_i915_gem_request_queue(eb.request, eb.dispatch_flags);
-	ret = execbuf_submit(&eb);
+	ret = eb_submit(&eb);
 err_request:
 	__i915_add_request(eb.request, ret == 0);
 
@@ -1866,23 +2101,16 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	}
 
 err_batch_unpin:
-	/*
-	 * FIXME: We crucially rely upon the active tracking for the (ppgtt)
-	 * batch vma for correctness. For less ugly and less fragility this
-	 * needs to be adjusted to also track the ggtt batch vma properly as
-	 * active.
-	 */
 	if (eb.dispatch_flags & I915_DISPATCH_SECURE)
 		i915_vma_unpin(eb.batch);
-err:
-	/* the request owns the ref now */
-	eb_destroy(&eb);
+err_vma:
+	eb_release_vma(&eb);
+	i915_gem_context_put(eb.ctx);
+err_unlock:
 	mutex_unlock(&dev->struct_mutex);
-
-pre_mutex_err:
-	/* intel_gpu_busy should also get a ref, so it will free when the device
-	 * is really idle. */
+err_rpm:
 	intel_runtime_pm_put(eb.i915);
+	eb_destroy(&eb);
 	if (out_fence_fd != -1)
 		put_unused_fd(out_fence_fd);
 err_in_fence:
@@ -1909,9 +2137,27 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
+	exec2.buffers_ptr = args->buffers_ptr;
+	exec2.buffer_count = args->buffer_count;
+	exec2.batch_start_offset = args->batch_start_offset;
+	exec2.batch_len = args->batch_len;
+	exec2.DR1 = args->DR1;
+	exec2.DR4 = args->DR4;
+	exec2.num_cliprects = args->num_cliprects;
+	exec2.cliprects_ptr = args->cliprects_ptr;
+	exec2.flags = I915_EXEC_RENDER;
+	i915_execbuffer2_set_context_id(exec2, 0);
+
+	if (!i915_gem_check_execbuffer(&exec2))
+		return -EINVAL;
+
 	/* Copy in the exec list from userland */
-	exec_list = drm_malloc_ab(sizeof(*exec_list), args->buffer_count);
-	exec2_list = drm_malloc_ab(sizeof(*exec2_list), args->buffer_count);
+	exec_list = drm_malloc_gfp(args->buffer_count,
+				   sizeof(*exec_list),
+				   __GFP_NOWARN | GFP_TEMPORARY);
+	exec2_list = drm_malloc_gfp(args->buffer_count + 1,
+				    sizeof(*exec2_list),
+				    __GFP_NOWARN | GFP_TEMPORARY);
 	if (exec_list == NULL || exec2_list == NULL) {
 		DRM_DEBUG("Failed to allocate exec list for %d buffers\n",
 			  args->buffer_count);
@@ -1942,36 +2188,23 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 			exec2_list[i].flags = 0;
 	}
 
-	exec2.buffers_ptr = args->buffers_ptr;
-	exec2.buffer_count = args->buffer_count;
-	exec2.batch_start_offset = args->batch_start_offset;
-	exec2.batch_len = args->batch_len;
-	exec2.DR1 = args->DR1;
-	exec2.DR4 = args->DR4;
-	exec2.num_cliprects = args->num_cliprects;
-	exec2.cliprects_ptr = args->cliprects_ptr;
-	exec2.flags = I915_EXEC_RENDER;
-	i915_execbuffer2_set_context_id(exec2, 0);
-
 	ret = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list);
-	if (!ret) {
+	if (exec2.flags & __EXEC_HAS_RELOC) {
 		struct drm_i915_gem_exec_object __user *user_exec_list =
 			u64_to_user_ptr(args->buffers_ptr);
 
 		/* Copy the new buffer offsets back to the user's exec list. */
 		for (i = 0; i < args->buffer_count; i++) {
+			if ((exec2_list[i].offset & UPDATE) == 0)
+				continue;
+
 			exec2_list[i].offset =
-				gen8_canonical_addr(exec2_list[i].offset);
-			ret = __copy_to_user(&user_exec_list[i].offset,
-					     &exec2_list[i].offset,
-					     sizeof(user_exec_list[i].offset));
-			if (ret) {
-				ret = -EFAULT;
-				DRM_DEBUG("failed to copy %d exec entries "
-					  "back to user (%d)\n",
-					  args->buffer_count, ret);
+				gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
+			exec2_list[i].offset &= PIN_OFFSET_MASK;
+			if (__copy_to_user(&user_exec_list[i].offset,
+					   &exec2_list[i].offset,
+					   sizeof(user_exec_list[i].offset)))
 				break;
-			}
 		}
 	}
 
@@ -1985,56 +2218,63 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		     struct drm_file *file)
 {
 	struct drm_i915_gem_execbuffer2 *args = data;
-	struct drm_i915_gem_exec_object2 *exec2_list = NULL;
+	struct drm_i915_gem_exec_object2 *exec2_list;
 	int ret;
 
 	if (args->buffer_count < 1 ||
-	    args->buffer_count > UINT_MAX / sizeof(*exec2_list)) {
+	    args->buffer_count >= UINT_MAX / sizeof(*exec2_list)) {
 		DRM_DEBUG("execbuf2 with %d buffers\n", args->buffer_count);
 		return -EINVAL;
 	}
 
-	exec2_list = drm_malloc_gfp(args->buffer_count,
+	if (!i915_gem_check_execbuffer(args))
+		return -EINVAL;
+
+	exec2_list = drm_malloc_gfp(args->buffer_count + 1,
 				    sizeof(*exec2_list),
-				    GFP_TEMPORARY);
+				    __GFP_NOWARN | GFP_TEMPORARY);
 	if (exec2_list == NULL) {
 		DRM_DEBUG("Failed to allocate exec list for %d buffers\n",
 			  args->buffer_count);
 		return -ENOMEM;
 	}
-	ret = copy_from_user(exec2_list,
-			     u64_to_user_ptr(args->buffers_ptr),
-			     sizeof(*exec2_list) * args->buffer_count);
-	if (ret != 0) {
-		DRM_DEBUG("copy %d exec entries failed %d\n",
-			  args->buffer_count, ret);
+	if (copy_from_user(exec2_list,
+			   u64_to_user_ptr(args->buffers_ptr),
+			   sizeof(*exec2_list) * args->buffer_count)) {
+		DRM_DEBUG("copy %d exec entries failed\n", args->buffer_count);
 		drm_free_large(exec2_list);
 		return -EFAULT;
 	}
 
 	ret = i915_gem_do_execbuffer(dev, file, args, exec2_list);
-	if (!ret) {
-		/* Copy the new buffer offsets back to the user's exec list. */
+
+	/* Now that we have begun execution of the batchbuffer, we ignore
+	 * any new error after this point. Also given that we have already
+	 * updated the associated relocations, we try to write out the current
+	 * object locations irrespective of any error.
+	 */
+	if (args->flags & __EXEC_HAS_RELOC) {
 		struct drm_i915_gem_exec_object2 __user *user_exec_list =
-				   u64_to_user_ptr(args->buffers_ptr);
+			u64_to_user_ptr(args->buffers_ptr);
 		int i;
 
+		/* Copy the new buffer offsets back to the user's exec list. */
+		user_access_begin();
 		for (i = 0; i < args->buffer_count; i++) {
+			if ((exec2_list[i].offset & UPDATE) == 0)
+				continue;
+
 			exec2_list[i].offset =
-				gen8_canonical_addr(exec2_list[i].offset);
-			ret = __copy_to_user(&user_exec_list[i].offset,
-					     &exec2_list[i].offset,
-					     sizeof(user_exec_list[i].offset));
-			if (ret) {
-				ret = -EFAULT;
-				DRM_DEBUG("failed to copy %d exec entries "
-					  "back to user\n",
-					  args->buffer_count);
-				break;
-			}
+				gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
+			unsafe_put_user(exec2_list[i].offset,
+					&user_exec_list[i].offset,
+					end_user);
 		}
+end_user:
+		user_access_end();
 	}
 
+	args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS;
 	drm_free_large(exec2_list);
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 77003eec0725..784fba279e7b 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -460,7 +460,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 			  size, obj->base.size,
 			  flags & PIN_MAPPABLE ? "mappable" : "total",
 			  end);
-		return -E2BIG;
+		return -ENOSPC;
 	}
 
 	ret = i915_gem_object_pin_pages(obj);
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 88543fafcffc..062addfee6ef 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -103,6 +103,7 @@ struct i915_vma {
 
 	/** This vma's place in the execbuf reservation list */
 	struct list_head exec_link;
+	struct list_head reloc_link;
 
 	/** This vma's place in the eviction list */
 	struct list_head evict_link;
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 97af353db218..f8155563ce19 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -215,7 +215,7 @@ static int igt_evict_vm(void *arg)
 		goto cleanup;
 
 	/* Everything is pinned, nothing should happen */
-	err = i915_gem_evict_vm(&ggtt->base, false);
+	err = i915_gem_evict_vm(&ggtt->base);
 	if (err) {
 		pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n",
 		       err);
@@ -224,7 +224,7 @@ static int igt_evict_vm(void *arg)
 
 	unpin_ggtt(i915);
 
-	err = i915_gem_evict_vm(&ggtt->base, false);
+	err = i915_gem_evict_vm(&ggtt->base);
 	if (err) {
 		pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n",
 		       err);
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 11/15] drm/i915: First try the previous execbuffer location
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (9 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 10/15] drm/i915: Eliminate lots of iterations over the execobjects array Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 12/15] drm/i915: Wait upon userptr get-user-pages within execbuffer Chris Wilson
                   ` (3 subsequent siblings)
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

When choosing a slot for an execbuffer, we ideally want to use the same
address as last time (so that we don't have to rebind it) and the same
address as expected by the user (so that we don't have to fixup any
relocations pointing to it). If we first try to bind the object at the
incoming execbuffer->offset from the user, or at its currently bound
offset, we should achieve both goals: avoiding the rebind cost and the
relocation penalty. However, if the object is not currently bound at that
address, we don't want to arbitrarily evict whatever already occupies our
chosen position, and so we rebind/relocate the incoming object instead. After we
report the new position back to the user, on the next pass the
relocations should have settled down.
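
Purely for illustration (this fragment is not part of the patch), a
userspace driver benefits from the first-try path by feeding the offsets
reported by the previous execbuf straight back in; the handles, saved
offsets and fd below are placeholders, and the libdrm/i915 uapi headers
are assumed:

	/* Sketch: reuse the offsets the kernel wrote back last time so the
	 * first PIN_NOEVICT attempt at entry->offset succeeds, avoiding
	 * both the rebind and any relocation fixups.
	 */
	struct drm_i915_gem_exec_object2 exec[2] = {
		{ .handle = aux_handle,   .offset = saved_aux_offset },
		{ .handle = batch_handle, .offset = saved_batch_offset },
	};
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = (uintptr_t)exec,
		.buffer_count = 2,
		.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC,
	};

	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf) == 0) {
		saved_aux_offset = exec[0].offset;   /* kernel reports any moves */
		saved_batch_offset = exec[1].offset;
	}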

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 ++++++++----
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  6 ++++++
 drivers/gpu/drm/i915/i915_gem_gtt.h        |  1 +
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6ff282c225d0..cdc57c1bd1db 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -289,10 +289,15 @@ eb_pin_vma(struct i915_execbuffer *eb,
 {
 	u64 flags;
 
-	flags = vma->node.start;
-	flags |= PIN_USER | PIN_NONBLOCK | PIN_OFFSET_FIXED;
+	if (vma->node.size)
+		flags = vma->node.start;
+	else
+		flags = entry->offset & PIN_OFFSET_MASK;
+
+	flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED;
 	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
 		flags |= PIN_GLOBAL;
+
 	if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
 		return;
 
@@ -403,8 +408,7 @@ eb_add_vma(struct i915_execbuffer *eb,
 		entry->flags |= eb->context_flags;
 
 	ret = 0;
-	if (vma->node.size)
-		eb_pin_vma(eb, entry, vma);
+	eb_pin_vma(eb, entry, vma);
 	if (eb_vma_misplaced(entry, vma)) {
 		eb_unreserve_vma(vma, entry);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 999f15455f48..d5cb1299f4f3 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3203,6 +3203,9 @@ int i915_gem_gtt_reserve(struct i915_address_space *vm,
 	if (err != -ENOSPC)
 		return err;
 
+	if (flags & PIN_NOEVICT)
+		return -ENOSPC;
+
 	err = i915_gem_evict_for_node(vm, node, flags);
 	if (err == 0)
 		err = drm_mm_reserve_node(&vm->mm, node);
@@ -3317,6 +3320,9 @@ int i915_gem_gtt_insert(struct i915_address_space *vm,
 	if (err != -ENOSPC)
 		return err;
 
+	if (flags & PIN_NOEVICT)
+		return -ENOSPC;
+
 	/* No free space, pick a slot at random.
 	 *
 	 * There is a pathological case here using a GTT shared between
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 069fc4e1be2a..6e68e7719ba4 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -582,6 +582,7 @@ int i915_gem_gtt_insert(struct i915_address_space *vm,
 #define PIN_MAPPABLE		BIT(1)
 #define PIN_ZONE_4G		BIT(2)
 #define PIN_NONFAULT		BIT(3)
+#define PIN_NOEVICT		BIT(4)
 
 #define PIN_MBZ			BIT(5) /* I915_VMA_PIN_OVERFLOW */
 #define PIN_GLOBAL		BIT(6) /* I915_VMA_GLOBAL_BIND */
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 12/15] drm/i915: Wait upon userptr get-user-pages within execbuffer
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (10 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 11/15] drm/i915: First try the previous execbuffer location Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-24 13:53   ` Michał Winiarski
  2017-02-23 16:18 ` [PATCH 13/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper Chris Wilson
                   ` (2 subsequent siblings)
  14 siblings, 1 reply; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

In effect this merely hides the EAGAIN caused by userptr when userspace
causes resource contention. However, it is quite beneficial for highly
contended userptr users, as we avoid repeating the setup costs and the
kernel-user context switches.
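
For context, the retry that userspace otherwise has to perform around the
ioctl when it sees this EAGAIN looks roughly like the fragment below
(illustrative only; fd and execbuf are placeholders, and the usual
errno/ioctl headers are assumed):

	/* Without the in-kernel flush, a still-busy userptr object bounces
	 * the execbuf back to userspace, which repeats the full submission
	 * setup before retrying.
	 */
	int ret;
	do {
		ret = ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
	} while (ret == -1 && (errno == EAGAIN || errno == EINTR));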

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_drv.h            | 10 +++++++++-
 drivers/gpu/drm/i915/i915_gem.c            |  4 +++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
 drivers/gpu/drm/i915/i915_gem_userptr.c    | 18 +++++++++++++++---
 5 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 842c62b96a83..c04bd1058562 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -555,6 +555,7 @@ static void i915_gem_fini(struct drm_i915_private *dev_priv)
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	i915_gem_cleanup_engines(dev_priv);
 	i915_gem_context_fini(dev_priv);
+	i915_gem_cleanup_userptr(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	i915_gem_drain_freed_objects(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 36c1d19a9fef..539a201359d2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1508,6 +1508,13 @@ struct i915_gem_mm {
 	struct list_head fence_list;
 
 	/**
+	 * Workqueue to fault in userptr pages, flushed by the execbuf
+	 * when required but otherwise left to userspace to try again
+	 * on EAGAIN.
+	 */
+	struct workqueue_struct *userptr_wq;
+
+	/**
 	 * Are we in a non-interruptible section of code like
 	 * modesetting?
 	 */
@@ -3170,7 +3177,8 @@ int i915_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
 			      struct drm_file *file_priv);
 int i915_gem_get_tiling_ioctl(struct drm_device *dev, void *data,
 			      struct drm_file *file_priv);
-void i915_gem_init_userptr(struct drm_i915_private *dev_priv);
+int i915_gem_init_userptr(struct drm_i915_private *dev_priv);
+void i915_gem_cleanup_userptr(struct drm_i915_private *dev_priv);
 int i915_gem_userptr_ioctl(struct drm_device *dev, void *data,
 			   struct drm_file *file);
 int i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4974b150bc3a..f46bad7680ec 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4510,7 +4510,9 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 	 */
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
-	i915_gem_init_userptr(dev_priv);
+	ret = i915_gem_init_userptr(dev_priv);
+	if (ret)
+		goto out_unlock;
 
 	ret = i915_gem_init_ggtt(dev_priv);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index cdc57c1bd1db..8582a96b0e9b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1377,6 +1377,9 @@ static int eb_relocate_slow(struct i915_execbuffer *eb)
 		goto out;
 	}
 
+	/* A frequent cause for EAGAIN are currently unavailable client pages */
+	flush_workqueue(eb->i915->mm.userptr_wq);
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret) {
 		mutex_lock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 6a8fa085b74e..120186122c82 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -369,7 +369,7 @@ __i915_mm_struct_free(struct kref *kref)
 	mutex_unlock(&mm->i915->mm_lock);
 
 	INIT_WORK(&mm->work, __i915_mm_struct_free__worker);
-	schedule_work(&mm->work);
+	queue_work(mm->i915->mm.userptr_wq, &mm->work);
 }
 
 static void
@@ -588,7 +588,7 @@ __i915_gem_userptr_get_pages_schedule(struct drm_i915_gem_object *obj,
 	get_task_struct(work->task);
 
 	INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
-	schedule_work(&work->work);
+	queue_work(to_i915(obj->base.dev)->mm.userptr_wq, &work->work);
 
 	*active = true;
 	return ERR_PTR(-EAGAIN);
@@ -816,8 +816,20 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
 	return 0;
 }
 
-void i915_gem_init_userptr(struct drm_i915_private *dev_priv)
+int i915_gem_init_userptr(struct drm_i915_private *dev_priv)
 {
 	mutex_init(&dev_priv->mm_lock);
 	hash_init(dev_priv->mm_structs);
+
+	dev_priv->mm.userptr_wq =
+		alloc_workqueue("i915-userptr-acquire", WQ_HIGHPRI, 0);
+	if (!dev_priv->mm.userptr_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void i915_gem_cleanup_userptr(struct drm_i915_private *dev_priv)
+{
+	destroy_workqueue(dev_priv->mm.userptr_wq);
 }
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 13/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (11 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 12/15] drm/i915: Wait upon userptr get-user-pages within execbuffer Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 14/15] drm/i915: Allow execbuffer to use the first object as the batch Chris Wilson
  2017-02-23 16:18 ` [PATCH 15/15] drm/i915: Async GPU relocation processing Chris Wilson
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

The only time we need to emit a flush inside request emission is after
an execbuffer, for which we can use the full __i915_add_request(). All
other callers want the simpler i915_add_request() without flushing, so
make that the default and remove the now-superfluous
i915_add_request_no_flush() helper.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gvt/scheduler.c    | 2 +-
 drivers/gpu/drm/i915/i915_gem_context.c | 2 +-
 drivers/gpu/drm/i915/i915_gem_request.h | 2 --
 drivers/gpu/drm/i915/intel_display.c    | 4 ++--
 drivers/gpu/drm/i915/intel_overlay.c    | 8 ++++----
 drivers/gpu/drm/i915/intel_pm.c         | 2 +-
 6 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c
index d6b6d0efdd1a..950b62f4fff8 100644
--- a/drivers/gpu/drm/i915/gvt/scheduler.c
+++ b/drivers/gpu/drm/i915/gvt/scheduler.c
@@ -214,7 +214,7 @@ static int dispatch_workload(struct intel_vgpu_workload *workload)
 		workload->status = ret;
 
 	if (!IS_ERR_OR_NULL(rq))
-		i915_add_request_no_flush(rq);
+		i915_add_request(rq);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 1764c70e6460..36abd914ffef 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1009,7 +1009,7 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *dev_priv)
 		}
 
 		ret = i915_switch_context(req);
-		i915_add_request_no_flush(req);
+		i915_add_request(req);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 1edc0fa7794c..316c86c98b6a 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -278,8 +278,6 @@ int i915_gem_request_await_dma_fence(struct drm_i915_gem_request *req,
 
 void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 #define i915_add_request(req) \
-	__i915_add_request(req, true)
-#define i915_add_request_no_flush(req) \
 	__i915_add_request(req, false)
 
 void __i915_gem_request_submit(struct drm_i915_gem_request *request);
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index d1831809ad3d..1212ce5fd99b 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -10708,7 +10708,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 		intel_mark_page_flip_active(intel_crtc, work);
 
 		work->flip_queued_req = i915_gem_request_get(request);
-		i915_add_request_no_flush(request);
+		i915_add_request(request);
 	}
 
 	i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
@@ -10724,7 +10724,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	return 0;
 
 cleanup_request:
-	i915_add_request_no_flush(request);
+	i915_add_request(request);
 cleanup_unpin:
 	to_intel_plane_state(primary->state)->vma = work->old_vma;
 	intel_unpin_fb_vma(vma);
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index 5ef9f5bfb92c..2e0c56ed22bb 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -278,7 +278,7 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 
 	cs = intel_ring_begin(req, 4);
 	if (IS_ERR(cs)) {
-		i915_add_request_no_flush(req);
+		i915_add_request(req);
 		return PTR_ERR(cs);
 	}
 
@@ -343,7 +343,7 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 
 	cs = intel_ring_begin(req, 2);
 	if (IS_ERR(cs)) {
-		i915_add_request_no_flush(req);
+		i915_add_request(req);
 		return PTR_ERR(cs);
 	}
 
@@ -419,7 +419,7 @@ static int intel_overlay_off(struct intel_overlay *overlay)
 
 	cs = intel_ring_begin(req, 6);
 	if (IS_ERR(cs)) {
-		i915_add_request_no_flush(req);
+		i915_add_request(req);
 		return PTR_ERR(cs);
 	}
 
@@ -477,7 +477,7 @@ static int intel_overlay_release_old_vid(struct intel_overlay *overlay)
 
 		cs = intel_ring_begin(req, 2);
 		if (IS_ERR(cs)) {
-			i915_add_request_no_flush(req);
+			i915_add_request(req);
 			return PTR_ERR(cs);
 		}
 
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 5a6db1b17cb4..2a7624880ee8 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -6823,7 +6823,7 @@ static void __intel_autoenable_gt_powersave(struct work_struct *work)
 		rcs->init_context(req);
 
 	/* Mark the device busy, calling intel_enable_gt_powersave() */
-	i915_add_request_no_flush(req);
+	i915_add_request(req);
 
 unlock:
 	mutex_unlock(&dev_priv->drm.struct_mutex);
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 14/15] drm/i915: Allow execbuffer to use the first object as the batch
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (12 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 13/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  2017-02-23 16:18 ` [PATCH 15/15] drm/i915: Async GPU relocation processing Chris Wilson
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

Currently, the last object in the execlist is always the batch.
However, when building the batch buffer we often know the batch object
first and if we can use the first slot in the execlist we can emit
relocation instructions relative to it immediately and avoid a separate
pass to adjust the relocations to point to the last execlist slot.
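
A hedged sketch of the expected userspace usage (not taken from this
patch; fd, execbuf and exec[] are placeholders): query the new getparam,
and if present put the batch in slot 0 and set the flag.

	int has_batch_first = 0;
	struct drm_i915_getparam gp = {
		.param = I915_PARAM_HAS_EXEC_BATCH_FIRST,
		.value = &has_batch_first,
	};

	drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
	if (has_batch_first) {
		/* exec[0] is the batch; relocations against it can be
		 * emitted as soon as it is created, with no final pass
		 * to retarget them at the last slot.
		 */
		execbuf.flags |= I915_EXEC_BATCH_FIRST;
	}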

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c            | 1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 5 ++++-
 include/uapi/drm/i915_drm.h                | 8 +++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c04bd1058562..4868586722e7 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -354,6 +354,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_FENCE:
 	case I915_PARAM_HAS_EXEC_FENCE_DMABUF:
 	case I915_PARAM_HAS_EXEC_CAPTURE:
+	case I915_PARAM_HAS_EXEC_BATCH_FIRST:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 8582a96b0e9b..dbb53281ddb1 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -555,7 +555,10 @@ ht_head(const struct i915_gem_context *ctx, u32 handle)
 
 static int eb_batch_index(const struct i915_execbuffer *eb)
 {
-	return eb->args->buffer_count - 1;
+	if (eb->args->flags & I915_EXEC_BATCH_FIRST)
+		return 0;
+	else
+		return eb->args->buffer_count - 1;
 }
 
 static int eb_select_context(struct i915_execbuffer *eb)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 9eda849df680..8923af8e81c0 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -424,6 +424,11 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_CAPTURE	 46
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports supplying the batch buffer
+ * as the first execobject as opposed to the last. See I915_EXEC_BATCH_FIRST.
+ */
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST	 47
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -922,7 +927,8 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_FENCE_ANY		(1<<19)
 
-#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ANY<<1))
+#define I915_EXEC_BATCH_FIRST		(1<<20)
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_BATCH_FIRST<<1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 15/15] drm/i915: Async GPU relocation processing
  2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
                   ` (13 preceding siblings ...)
  2017-02-23 16:18 ` [PATCH 14/15] drm/i915: Allow execbuffer to use the first object as the batch Chris Wilson
@ 2017-02-23 16:18 ` Chris Wilson
  14 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-23 16:18 UTC (permalink / raw)
  To: intel-gfx

If the user requires patching of their batch or auxiliary buffers, we
currently make the alterations on the CPU. If they are active on the GPU
at the time, we wait under the struct_mutex for them to finish executing
before we rewrite the contents. This happens if shared relocation trees
are used between different contexts with separate address spaces (and the
buffers then have different addresses in each); the 3D state then needs
to be adjusted between execution on each context. However, we don't need
to use the CPU to do the relocation patching, as we can queue commands
to the GPU to perform it and use fences to serialise the operation with
the current and future activity - so the operation on the GPU appears
just as atomic as performing it immediately. Performing the relocation
rewrites on the GPU is not free: in terms of pure throughput, the number
of relocations/s is about halved - but, more importantly, so is the time
spent under the struct_mutex.
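
Distilled to its core (a condensation of the patch below, showing only the
gen8 dword form, with error handling and cache management omitted), each
such relocation becomes a single store queued into a batch-pool buffer
that is fenced against the target object:

	u64 addr = gen8_canonical_addr(vma->node.start + offset);
	u32 *cs = reloc_gpu(eb, vma, 4);	/* space in the pool batch */

	if (!IS_ERR(cs)) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4;
		*cs++ = lower_32_bits(addr);
		*cs++ = upper_32_bits(addr);
		*cs++ = lower_32_bits(target_offset);
		/* The pool batch carries an exclusive fence on vma->obj, so
		 * the store executes after the current readers and before
		 * the new user batch.
		 */
	}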

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c            |   1 -
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 217 +++++++++++++++++++++++++++--
 2 files changed, 208 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f46bad7680ec..7f33ef1cfca8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4106,7 +4106,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
 		list_for_each_entry_safe(vma, vn,
 					 &obj->vma_list, obj_link) {
-			GEM_BUG_ON(!i915_vma_is_ggtt(vma));
 			GEM_BUG_ON(i915_vma_is_active(vma));
 			vma->flags &= ~I915_VMA_PIN_MASK;
 			i915_vma_close(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index dbb53281ddb1..058e0bf6697a 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -41,7 +41,12 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+	FORCE_CPU_RELOC = 1,
+	FORCE_GTT_RELOC,
+	FORCE_GPU_RELOC,
+};
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
 
 #define  __EXEC_OBJECT_HAS_PIN		BIT(31)
 #define  __EXEC_OBJECT_HAS_FENCE	BIT(30)
@@ -184,9 +189,14 @@ struct i915_execbuffer {
 		struct drm_mm_node node;
 		unsigned long vaddr;
 		unsigned int page;
-		bool use_64bit_reloc;
-		bool has_llc;
-		bool has_fence;
+		unsigned int gen;
+		bool use_64bit_reloc : 1;
+		bool has_llc : 1;
+		bool has_fence : 1;
+
+		struct drm_i915_gem_request *rq;
+		u32 *rq_cmd;
+		unsigned int rq_size;
 	} reloc_cache;
 	u64 invalid_flags;
 	u32 context_flags;
@@ -430,8 +440,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 	if (!i915_gem_object_has_struct_page(obj))
 		return false;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+		return true;
+
+	if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+		return false;
 
 	return (cache->has_llc ||
 		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
@@ -803,10 +816,13 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->gen = INTEL_GEN(i915);
 	cache->has_llc = HAS_LLC(i915);
-	cache->has_fence = INTEL_GEN(i915) < 4;
-	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+	cache->has_fence = cache->gen < 4;
+	cache->use_64bit_reloc = cache->gen >= 8;
 	cache->node.allocated = false;
+	cache->rq = NULL;
+	cache->rq_size = 0;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -828,10 +844,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
 	return &i915->ggtt;
 }
 
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+	GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+	cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+	i915_gem_object_unpin_map(cache->rq->batch->obj);
+	i915_gem_chipset_flush(cache->rq->i915);
+
+	__i915_add_request(cache->rq, true);
+	cache->rq = NULL;
+}
+
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
+	if (cache->rq)
+		reloc_gpu_flush(cache);
+
 	if (!cache->vaddr)
 		return;
 
@@ -995,6 +1025,112 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 		*addr = value;
 }
 
+static u32 *reloc_gpu(struct i915_execbuffer *eb,
+		      struct i915_vma *vma,
+		      unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	u32 *cmd;
+
+	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+		reloc_gpu_flush(cache);
+
+	if (!cache->rq) {
+		struct drm_i915_gem_object *obj;
+		struct drm_i915_gem_request *rq;
+		struct i915_vma *batch;
+		int err;
+
+		GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);
+
+		obj = i915_gem_batch_pool_get(&eb->engine->batch_pool,
+					      PAGE_SIZE);
+		if (IS_ERR(obj))
+			return ERR_CAST(obj);
+
+		cmd = i915_gem_object_pin_map(obj,
+					      cache->has_llc ? I915_MAP_WB : I915_MAP_WC);
+		i915_gem_object_unpin_pages(obj);
+		if (IS_ERR(cmd))
+			return ERR_CAST(cmd);
+
+		err = i915_gem_object_set_to_gtt_domain(obj, false);
+		if (err) {
+err_unmap:
+			i915_gem_object_unpin_map(obj);
+			return ERR_PTR(err);
+		}
+
+		batch = i915_vma_instance(obj, vma->vm, NULL);
+		if (IS_ERR(batch)) {
+			err = PTR_ERR(batch);
+			goto err_unmap;
+		}
+
+		err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+		if (err)
+			goto err_unmap;
+
+		rq = i915_gem_request_alloc(eb->engine, eb->ctx);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+err_unpin:
+			i915_vma_unpin(batch);
+			goto err_unmap;
+		}
+
+		err = i915_gem_request_await_object(rq,
+						    vma->obj,
+						    EXEC_OBJECT_WRITE);
+		if (err) {
+err_request:
+			i915_add_request(rq);
+			goto err_unpin;
+		}
+
+		err = eb->engine->emit_flush(rq, EMIT_INVALIDATE);
+		if (err)
+			goto err_request;
+
+		err = i915_switch_context(rq);
+		if (err)
+			goto err_request;
+
+		err = eb->engine->emit_bb_start(rq,
+						batch->node.start, PAGE_SIZE,
+						cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+		if (err)
+			goto err_request;
+
+		GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv,
+								 true));
+		i915_vma_move_to_active(batch, rq, 0);
+		reservation_object_lock(obj->resv, NULL);
+		reservation_object_add_excl_fence(obj->resv, &rq->fence);
+		reservation_object_unlock(obj->resv);
+		i915_vma_unpin(batch);
+
+		i915_vma_move_to_active(vma, rq, true);
+		reservation_object_lock(vma->obj->resv, NULL);
+		reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
+		reservation_object_unlock(vma->obj->resv);
+
+		vma->obj->base.write_domain = 0;
+		vma->obj->base.read_domains = I915_GEM_GPU_DOMAINS;
+
+		rq->batch = batch;
+
+		cache->rq = rq;
+		cache->rq_cmd = cmd;
+		cache->rq_size = 0;
+	}
+
+	cmd = cache->rq_cmd + cache->rq_size;
+	cache->rq_size += len;
+
+	return cmd;
+}
+
 static u64
 relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
@@ -1007,6 +1143,67 @@ relocate_entry(struct i915_vma *vma,
 	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
+	if (!eb->reloc_cache.vaddr &&
+	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
+	     !reservation_object_test_signaled_rcu(obj->resv, true))) {
+		const unsigned int gen = eb->reloc_cache.gen;
+		unsigned int len;
+		u32 *batch;
+		u64 addr;
+
+		if (wide)
+			len = offset & 7 ? 8 : 5;
+		else if (gen >= 4)
+			len = 4;
+		else if (gen >= 3)
+			len = 3;
+		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
+			goto repeat;
+
+		batch = reloc_gpu(eb, vma, len);
+		if (IS_ERR(batch))
+			goto repeat;
+
+		addr = gen8_canonical_addr(vma->node.start + offset);
+		if (wide) {
+			if (offset & 7) {
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+
+				addr = gen8_canonical_addr(addr + 4);
+
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = upper_32_bits(target_offset);
+			} else {
+				*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+				*batch++ = upper_32_bits(target_offset);
+			}
+		} else if (gen >= 6) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else if (gen >= 4) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else {
+			*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		}
+
+		goto out;
+	}
+
 repeat:
 	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
@@ -1023,6 +1220,7 @@ relocate_entry(struct i915_vma *vma,
 		goto repeat;
 	}
 
+out:
 	return gen8_canonical_addr(target->node.start) | 1;
 }
 
@@ -1083,7 +1281,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
+	if (!DBG_FORCE_RELOC &&
+	    gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 03/15] drm/i915: Drop spinlocks around adding to the client request list
  2017-02-23 16:18 ` [PATCH 03/15] drm/i915: Drop spinlocks around adding to the client request list Chris Wilson
@ 2017-02-24 12:05   ` Mika Kuoppala
  0 siblings, 0 replies; 23+ messages in thread
From: Mika Kuoppala @ 2017-02-24 12:05 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Adding to the tail of the client request list is safe, as the only other
> user is the throttle ioctl that iterates forwards over the list. It only
> needs protection against deletion of a request as it reads it; it simply
> won't see a new request added to the end of the list, or it would be too
> early and rejected. We can further reduce the number of spinlock
> acquisitions required when throttling by removing stale requests from the
> client_list as we throttle.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>


> ---
>  drivers/gpu/drm/i915/i915_debugfs.c        |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c            | 14 ++++++------
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 ++++++++----
>  drivers/gpu/drm/i915/i915_gem_request.c    | 34 ++++++------------------------
>  drivers/gpu/drm/i915/i915_gem_request.h    |  4 +---
>  5 files changed, 23 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 1a28b5279bec..ddae8e442176 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -506,7 +506,7 @@ static int i915_gem_object_info(struct seq_file *m, void *data)
>  		mutex_lock(&dev->struct_mutex);
>  		request = list_first_entry_or_null(&file_priv->mm.request_list,
>  						   struct drm_i915_gem_request,
> -						   client_list);
> +						   client_link);
>  		rcu_read_lock();
>  		task = pid_task(request && request->ctx->pid ?
>  				request->ctx->pid : file->pid,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index de1fc98e041d..92ab989bb05f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3667,16 +3667,14 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
>  		return -EIO;
>  
>  	spin_lock(&file_priv->mm.lock);
> -	list_for_each_entry(request, &file_priv->mm.request_list, client_list) {
> +	list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
>  		if (time_after_eq(request->emitted_jiffies, recent_enough))
>  			break;
>  
> -		/*
> -		 * Note that the request might not have been submitted yet.
> -		 * In which case emitted_jiffies will be zero.
> -		 */
> -		if (!request->emitted_jiffies)
> -			continue;
> +		if (target) {
> +			list_del(&target->client_link);
> +			target->file_priv = NULL;
> +		}
>  
>  		target = request;
>  	}
> @@ -4735,7 +4733,7 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
>  	 * file_priv.
>  	 */
>  	spin_lock(&file_priv->mm.lock);
> -	list_for_each_entry(request, &file_priv->mm.request_list, client_list)
> +	list_for_each_entry(request, &file_priv->mm.request_list, client_link)
>  		request->file_priv = NULL;
>  	spin_unlock(&file_priv->mm.lock);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index e8ffe0c9a20e..2b570d0b2392 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1420,6 +1420,14 @@ i915_gem_execbuffer_parse(struct intel_engine_cs *engine,
>  	return vma;
>  }
>  
> +static void
> +add_to_client(struct drm_i915_gem_request *req,
> +	      struct drm_file *file)
> +{
> +	req->file_priv = file->driver_priv;
> +	list_add_tail(&req->client_link, &req->file_priv->mm.request_list);
> +}
> +
>  static int
>  execbuf_submit(struct i915_execbuffer_params *params,
>  	       struct drm_i915_gem_execbuffer2 *args,
> @@ -1507,6 +1515,7 @@ execbuf_submit(struct i915_execbuffer_params *params,
>  		return ret;
>  
>  	i915_gem_execbuffer_move_to_active(vmas, params->request);
> +	add_to_client(params->request, params->file);
>  
>  	return 0;
>  }
> @@ -1886,10 +1895,6 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	 */
>  	params->request->batch = params->batch;
>  
> -	ret = i915_gem_request_add_to_client(params->request, file);
> -	if (ret)
> -		goto err_request;
> -
>  	/*
>  	 * Save assorted stuff away to pass through to *_submission().
>  	 * NB: This data should be 'persistent' and not local as it will
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 3a159cac2172..5bca3e25bf61 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -82,42 +82,20 @@ const struct dma_fence_ops i915_fence_ops = {
>  	.release = i915_fence_release,
>  };
>  
> -int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
> -				   struct drm_file *file)
> -{
> -	struct drm_i915_private *dev_private;
> -	struct drm_i915_file_private *file_priv;
> -
> -	WARN_ON(!req || !file || req->file_priv);
> -
> -	if (!req || !file)
> -		return -EINVAL;
> -
> -	if (req->file_priv)
> -		return -EINVAL;
> -
> -	dev_private = req->i915;
> -	file_priv = file->driver_priv;
> -
> -	spin_lock(&file_priv->mm.lock);
> -	req->file_priv = file_priv;
> -	list_add_tail(&req->client_list, &file_priv->mm.request_list);
> -	spin_unlock(&file_priv->mm.lock);
> -
> -	return 0;
> -}
> -
>  static inline void
>  i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>  {
> -	struct drm_i915_file_private *file_priv = request->file_priv;
> +	struct drm_i915_file_private *file_priv;
>  
> +	file_priv = request->file_priv;
>  	if (!file_priv)
>  		return;
>  
>  	spin_lock(&file_priv->mm.lock);
> -	list_del(&request->client_list);
> -	request->file_priv = NULL;
> +	if (request->file_priv) {
> +		list_del(&request->client_link);
> +		request->file_priv = NULL;
> +	}
>  	spin_unlock(&file_priv->mm.lock);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index cc24a6c72748..1edc0fa7794c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -191,7 +191,7 @@ struct drm_i915_gem_request {
>  
>  	struct drm_i915_file_private *file_priv;
>  	/** file_priv list entry for this request */
> -	struct list_head client_list;
> +	struct list_head client_link;
>  };
>  
>  extern const struct dma_fence_ops i915_fence_ops;
> @@ -204,8 +204,6 @@ static inline bool dma_fence_is_i915(const struct dma_fence *fence)
>  struct drm_i915_gem_request * __must_check
>  i915_gem_request_alloc(struct intel_engine_cs *engine,
>  		       struct i915_gem_context *ctx);
> -int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
> -				   struct drm_file *file);
>  void i915_gem_request_retire_upto(struct drm_i915_gem_request *req);
>  
>  static inline struct drm_i915_gem_request *
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 06/15] drm/i915: Split vma exec_link/evict_link
  2017-02-23 16:18 ` [PATCH 06/15] drm/i915: Split vma exec_link/evict_link Chris Wilson
@ 2017-02-24 12:20   ` Mika Kuoppala
  0 siblings, 0 replies; 23+ messages in thread
From: Mika Kuoppala @ 2017-02-24 12:20 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Currently the vma has one link member that is used for both holding its
> place in the execbuf reservation list, and in any eviction list. This
> dual property is quite tricky and error prone.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> ---
>  drivers/gpu/drm/i915/i915_gem_evict.c      | 14 ++++++-------
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 32 +++++++++++++++---------------
>  drivers/gpu/drm/i915/i915_vma.h            |  7 +++++--
>  3 files changed, 28 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
> index 4753c3f46f7e..2a6eb2ceff79 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -62,7 +62,7 @@ mark_free(struct drm_mm_scan *scan,
>  	if (flags & PIN_NONFAULT && !list_empty(&vma->obj->userfault_link))
>  		return false;
>  
> -	list_add(&vma->exec_list, unwind);
> +	list_add(&vma->evict_link, unwind);
>  	return drm_mm_scan_add_block(scan, &vma->node);
>  }
>  
> @@ -154,7 +154,7 @@ i915_gem_evict_something(struct i915_address_space *vm,
>  	} while (*++phase);
>  
>  	/* Nothing found, clean up and bail out! */
> -	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
> +	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
>  		ret = drm_mm_scan_remove_block(&scan, &vma->node);
>  		BUG_ON(ret);
>  	}
> @@ -201,16 +201,16 @@ i915_gem_evict_something(struct i915_address_space *vm,
>  	 * calling unbind (which may remove the active reference
>  	 * of any of our objects, thus corrupting the list).
>  	 */
> -	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
> +	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
>  		if (drm_mm_scan_remove_block(&scan, &vma->node))
>  			__i915_vma_pin(vma);
>  		else
> -			list_del(&vma->exec_list);
> +			list_del(&vma->evict_link);
>  	}
>  
>  	/* Unbinding will emit any required flushes */
>  	ret = 0;
> -	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
> +	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
>  		__i915_vma_unpin(vma);
>  		if (ret == 0)
>  			ret = i915_vma_unbind(vma);
> @@ -323,10 +323,10 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
>  		 * reference) another in our eviction list.
>  		 */
>  		__i915_vma_pin(vma);
> -		list_add(&vma->exec_list, &eviction_list);
> +		list_add(&vma->evict_link, &eviction_list);
>  	}
>  
> -	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
> +	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
>  		__i915_vma_unpin(vma);
>  		if (ret == 0)
>  			ret = i915_vma_unbind(vma);
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 9c1dacabe7ef..c229d69b8757 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -134,7 +134,7 @@ eb_reset(struct i915_execbuffer *eb)
>  {
>  	struct i915_vma *vma;
>  
> -	list_for_each_entry(vma, &eb->vmas, exec_list) {
> +	list_for_each_entry(vma, &eb->vmas, exec_link) {
>  		eb_unreserve_vma(vma);
>  		i915_vma_put(vma);
>  		vma->exec_entry = NULL;
> @@ -147,7 +147,7 @@ eb_reset(struct i915_execbuffer *eb)
>  static struct i915_vma *
>  eb_get_batch(struct i915_execbuffer *eb)
>  {
> -	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_list);
> +	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_link);
>  
>  	/*
>  	 * SNA is doing fancy tricks with compressing batch buffers, which leads
> @@ -224,7 +224,7 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
>  		}
>  
>  		/* Transfer ownership from the objects list to the vmas list. */
> -		list_add_tail(&vma->exec_list, &eb->vmas);
> +		list_add_tail(&vma->exec_link, &eb->vmas);
>  		list_del_init(&obj->obj_exec_link);
>  
>  		vma->exec_entry = &eb->exec[i];
> @@ -283,7 +283,7 @@ static void eb_destroy(struct i915_execbuffer *eb)
>  {
>  	struct i915_vma *vma;
>  
> -	list_for_each_entry(vma, &eb->vmas, exec_list) {
> +	list_for_each_entry(vma, &eb->vmas, exec_link) {
>  		if (!vma->exec_entry)
>  			continue;
>  
> @@ -748,7 +748,7 @@ static int eb_relocate(struct i915_execbuffer *eb)
>  	struct i915_vma *vma;
>  	int ret = 0;
>  
> -	list_for_each_entry(vma, &eb->vmas, exec_list) {
> +	list_for_each_entry(vma, &eb->vmas, exec_link) {
>  		ret = eb_relocate_vma(vma, eb);
>  		if (ret)
>  			break;
> @@ -900,7 +900,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
>  		struct drm_i915_gem_exec_object2 *entry;
>  		bool need_fence, need_mappable;
>  
> -		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
> +		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_link);
>  		obj = vma->obj;
>  		entry = vma->exec_entry;
>  
> @@ -915,12 +915,12 @@ static int eb_reserve(struct i915_execbuffer *eb)
>  		need_mappable = need_fence || need_reloc_mappable(vma);
>  
>  		if (entry->flags & EXEC_OBJECT_PINNED)
> -			list_move_tail(&vma->exec_list, &pinned_vmas);
> +			list_move_tail(&vma->exec_link, &pinned_vmas);
>  		else if (need_mappable) {
>  			entry->flags |= __EXEC_OBJECT_NEEDS_MAP;
> -			list_move(&vma->exec_list, &ordered_vmas);
> +			list_move(&vma->exec_link, &ordered_vmas);
>  		} else
> -			list_move_tail(&vma->exec_list, &ordered_vmas);
> +			list_move_tail(&vma->exec_link, &ordered_vmas);
>  
>  		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
>  		obj->base.pending_write_domain = 0;
> @@ -945,7 +945,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
>  		int ret = 0;
>  
>  		/* Unbind any ill-fitting objects or pin. */
> -		list_for_each_entry(vma, &eb->vmas, exec_list) {
> +		list_for_each_entry(vma, &eb->vmas, exec_link) {
>  			if (!drm_mm_node_allocated(&vma->node))
>  				continue;
>  
> @@ -958,7 +958,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
>  		}
>  
>  		/* Bind fresh objects */
> -		list_for_each_entry(vma, &eb->vmas, exec_list) {
> +		list_for_each_entry(vma, &eb->vmas, exec_link) {
>  			if (drm_mm_node_allocated(&vma->node))
>  				continue;
>  
> @@ -972,7 +972,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
>  			return ret;
>  
>  		/* Decrement pin count for bound objects */
> -		list_for_each_entry(vma, &eb->vmas, exec_list)
> +		list_for_each_entry(vma, &eb->vmas, exec_link)
>  			eb_unreserve_vma(vma);
>  
>  		ret = i915_gem_evict_vm(eb->vm, true);
> @@ -1061,7 +1061,7 @@ eb_relocate_slow(struct i915_execbuffer *eb)
>  	if (ret)
>  		goto err;
>  
> -	list_for_each_entry(vma, &eb->vmas, exec_list) {
> +	list_for_each_entry(vma, &eb->vmas, exec_link) {
>  		int idx = vma->exec_entry - eb->exec;
>  
>  		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[idx]);
> @@ -1087,7 +1087,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
>  	struct i915_vma *vma;
>  	int ret;
>  
> -	list_for_each_entry(vma, &eb->vmas, exec_list) {
> +	list_for_each_entry(vma, &eb->vmas, exec_link) {
>  		struct drm_i915_gem_object *obj = vma->obj;
>  
>  		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
> @@ -1309,7 +1309,7 @@ eb_move_to_active(struct i915_execbuffer *eb)
>  {
>  	struct i915_vma *vma;
>  
> -	list_for_each_entry(vma, &eb->vmas, exec_list) {
> +	list_for_each_entry(vma, &eb->vmas, exec_link) {
>  		struct drm_i915_gem_object *obj = vma->obj;
>  
>  		obj->base.write_domain = obj->base.pending_write_domain;
> @@ -1383,7 +1383,7 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
>  		memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
>  	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
>  	i915_gem_object_get(shadow_batch_obj);
> -	list_add_tail(&vma->exec_list, &eb->vmas);
> +	list_add_tail(&vma->exec_link, &eb->vmas);
>  
>  out:
>  	i915_gem_object_unpin_pages(shadow_batch_obj);
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 2e03f81dddbe..4d827300d1a8 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -100,8 +100,11 @@ struct i915_vma {
>  	struct list_head obj_link; /* Link in the object's VMA list */
>  	struct rb_node obj_node;
>  
> -	/** This vma's place in the batchbuffer or on the eviction list */
> -	struct list_head exec_list;
> +	/** This vma's place in the execbuf reservation list */
> +	struct list_head exec_link;
> +
> +	/** This vma's place in the eviction list */
> +	struct list_head evict_link;
>  
>  	/**
>  	 * Used for performing relocations during execbuffer insertion.
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 07/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf
  2017-02-23 16:18 ` [PATCH 07/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf Chris Wilson
@ 2017-02-24 12:32   ` Mika Kuoppala
  0 siblings, 0 replies; 23+ messages in thread
From: Mika Kuoppala @ 2017-02-24 12:32 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> i915_gem_stolen_list_info() sneakily takes advantage of
> obj->obj_exec_link to save itself from having to allocate. Enough of the
> subterfuge: just allocate an array of pointers and sort that instead of
> the list.
>

Justifiable by itself but I suspect you have plans for mutex.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c | 52 ++++++++++++++++++++-----------------
>  1 file changed, 28 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index ddae8e442176..75efa1ae234e 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -27,7 +27,7 @@
>   */
>  
>  #include <linux/debugfs.h>
> -#include <linux/list_sort.h>
> +#include <linux/sort.h>
>  #include "intel_drv.h"
>  
>  static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
> @@ -230,13 +230,12 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
>  		seq_printf(m, " (frontbuffer: 0x%03x)", frontbuffer_bits);
>  }
>  
> -static int obj_rank_by_stolen(void *priv,
> -			      struct list_head *A, struct list_head *B)
> +static int obj_rank_by_stolen(const void *A, const void *B)
>  {
> -	struct drm_i915_gem_object *a =
> -		container_of(A, struct drm_i915_gem_object, obj_exec_link);
> -	struct drm_i915_gem_object *b =
> -		container_of(B, struct drm_i915_gem_object, obj_exec_link);
> +	const struct drm_i915_gem_object *a =
> +		*(const struct drm_i915_gem_object **)A;
> +	const struct drm_i915_gem_object *b =
> +		*(const struct drm_i915_gem_object **)B;
>  
>  	if (a->stolen->start < b->stolen->start)
>  		return -1;
> @@ -249,49 +248,54 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data)
>  {
>  	struct drm_i915_private *dev_priv = node_to_i915(m->private);
>  	struct drm_device *dev = &dev_priv->drm;
> +	struct drm_i915_gem_object **objects;
>  	struct drm_i915_gem_object *obj;
>  	u64 total_obj_size, total_gtt_size;
> -	LIST_HEAD(stolen);
> -	int count, ret;
> +	unsigned long count, n;
> +	int ret;
>  
>  	ret = mutex_lock_interruptible(&dev->struct_mutex);
>  	if (ret)
>  		return ret;
>  
> +	objects = drm_malloc_ab(dev_priv->mm.object_count, sizeof(*objects));
> +	if (!objects) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
>  	total_obj_size = total_gtt_size = count = 0;
>  	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
>  		if (obj->stolen == NULL)
>  			continue;
>  
> -		list_add(&obj->obj_exec_link, &stolen);
> -
> +		objects[count++] = obj;
>  		total_obj_size += obj->base.size;
>  		total_gtt_size += i915_gem_obj_total_ggtt_size(obj);
> -		count++;
>  	}
>  	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) {
>  		if (obj->stolen == NULL)
>  			continue;
>  
> -		list_add(&obj->obj_exec_link, &stolen);
> -
> +		objects[count++] = obj;
>  		total_obj_size += obj->base.size;
> -		count++;
>  	}
> -	list_sort(NULL, &stolen, obj_rank_by_stolen);
> +
> +	sort(objects, count, sizeof(*objects), obj_rank_by_stolen, NULL);
> +
>  	seq_puts(m, "Stolen:\n");
> -	while (!list_empty(&stolen)) {
> -		obj = list_first_entry(&stolen, typeof(*obj), obj_exec_link);
> +	for (n = 0; n < count; n++) {
>  		seq_puts(m, "   ");
> -		describe_obj(m, obj);
> +		describe_obj(m, objects[n]);
>  		seq_putc(m, '\n');
> -		list_del_init(&obj->obj_exec_link);
>  	}
> -	mutex_unlock(&dev->struct_mutex);
> -
> -	seq_printf(m, "Total %d objects, %llu bytes, %llu GTT size\n",
> +	seq_printf(m, "Total %lu objects, %llu bytes, %llu GTT size\n",
>  		   count, total_obj_size, total_gtt_size);
> -	return 0;
> +
> +	drm_free_large(objects);
> +out_unlock:
> +	mutex_unlock(&dev->struct_mutex);
> +	return ret;
>  }
>  
>  struct file_stats {
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 12/15] drm/i915: Wait upon userptr get-user-pages within execbuffer
  2017-02-23 16:18 ` [PATCH 12/15] drm/i915: Wait upon userptr get-user-pages within execbuffer Chris Wilson
@ 2017-02-24 13:53   ` Michał Winiarski
  2017-02-24 14:23     ` Chris Wilson
  0 siblings, 1 reply; 23+ messages in thread
From: Michał Winiarski @ 2017-02-24 13:53 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Thu, Feb 23, 2017 at 04:18:27PM +0000, Chris Wilson wrote:
> This simply hides the EAGAIN caused by userptr when userspace causes
> resource contention. However, it is quite beneficial with highly
> contended userptr users as we avoid repeating the setup costs and
> kernel-user context switches.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>

-Michał

> ---
>  drivers/gpu/drm/i915/i915_drv.c            |  1 +
>  drivers/gpu/drm/i915/i915_drv.h            | 10 +++++++++-
>  drivers/gpu/drm/i915/i915_gem.c            |  4 +++-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
>  drivers/gpu/drm/i915/i915_gem_userptr.c    | 18 +++++++++++++++---
>  5 files changed, 31 insertions(+), 5 deletions(-)
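
For context, the alternative this replaces is userspace spinning on the
execbuf ioctl itself — a purely illustrative sketch, with the wrapper name
and the surrounding buffer/execbuf setup assumed rather than taken from any
real client:

/*
 * Illustrative only: without the kernel waiting on the userptr
 * get-user-pages work, a client that hits contention has to retry the
 * whole execbuf itself, repeating the setup cost and paying a
 * kernel/user context switch on every -EAGAIN.
 */
#include <errno.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int submit_execbuf(int fd, struct drm_i915_gem_execbuffer2 *execbuf)
{
	int ret;

	do {
		ret = ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf);
	} while (ret == -1 && (errno == EAGAIN || errno == EINTR));

	return ret;
}
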
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 12/15] drm/i915: Wait upon userptr get-user-pages within execbuffer
  2017-02-24 13:53   ` Michał Winiarski
@ 2017-02-24 14:23     ` Chris Wilson
  0 siblings, 0 replies; 23+ messages in thread
From: Chris Wilson @ 2017-02-24 14:23 UTC (permalink / raw)
  To: Michał Winiarski; +Cc: intel-gfx

On Fri, Feb 24, 2017 at 02:53:07PM +0100, Michał Winiarski wrote:
> On Thu, Feb 23, 2017 at 04:18:27PM +0000, Chris Wilson wrote:
> > This simply hides the EAGAIN caused by userptr when userspace causes
> > resource contention. However, it is quite beneficial with highly
> > contended userptr users as we avoid repeating the setup costs and
> > kernel-user context switches.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>

Michał pointed out on IRC that we need to hit the slowpath after failing
to pin the userptr pages in the earlier reservation phase, even if we
have no relocations (flushing the wq inside the kernel is nicer than
spinning on EAGAIN). (The caveat is that flush_workqueue is not
interruptible; I think this would be worth refining later into an
interruptible flush on just the right set of userptr objects.) Anyway,
just to say that this required changes to the preceding patch so that we
do not miss the slowpath on reservation failure.
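
Roughly, I mean something along these lines in the slowpath (a hand-waving
sketch only; the helper and the workqueue member name are made up for
illustration, not the actual code in the patch):

/*
 * Hand-waving sketch: if the reservation phase fails because the userptr
 * workers are still pinning pages, drop into the slowpath and flush the
 * outstanding get-user-pages work there, instead of bouncing -EAGAIN back
 * to userspace. "userptr_wq" is an illustrative member name.
 */
static void eb_flush_userptr_work(struct drm_i915_private *i915)
{
	/*
	 * flush_workqueue() is not interruptible; a later refinement would
	 * be an interruptible wait restricted to the userptr objects
	 * actually referenced by this execbuf.
	 */
	flush_workqueue(i915->mm.userptr_wq);
}
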
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 01/15] drm/i915: Copy user requested buffers into the error state
  2017-02-23 16:18 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
@ 2017-02-28  6:11   ` Ben Widawsky
  2017-02-28 14:17   ` Joonas Lahtinen
  1 sibling, 0 replies; 23+ messages in thread
From: Ben Widawsky @ 2017-02-28  6:11 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On 17-02-23 16:18:16, Chris Wilson wrote:
>Introduce a new execobject.flag (EXEC_OBJECT_CAPTURE) that userspace may
>use to indicate that it wants the contents of this buffer preserved in
>the error state (/sys/class/drm/cardN/error) following a GPU hang
>involving this batch.
>
>Use this at your discretion: the contents of the error state, although
>compressed, are allocated with GFP_ATOMIC (i.e. limited) and kept for all
>eternity (until the error state is destroyed).
>
>Based on an earlier patch by Ben Widawsky <ben@bwidawsk.net>
>Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>Cc: Ben Widawsky <ben@bwidawsk.net>
>Cc: Matt Turner <mattst88@gmail.com>

Haven't tested it or used it, but I wanted it.
Acked-by: Ben Widawsky <ben@bwidawsk.net>
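
For anyone wanting to wire this up from userspace, a minimal sketch — the
fd and the rest of the execbuffer setup are assumed; only the flag and the
getparam below come from the uapi additions in this patch:

#include <stdbool.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Does the running kernel understand EXEC_OBJECT_CAPTURE? */
static bool has_exec_capture(int fd)
{
	int value = 0;
	struct drm_i915_getparam gp = {
		.param = I915_PARAM_HAS_EXEC_CAPTURE,
		.value = &value,
	};

	return ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0 && value;
}

/* Ask for one execobject to be preserved in the error state on a hang. */
static void mark_for_capture(struct drm_i915_gem_exec_object2 *obj)
{
	obj->flags |= EXEC_OBJECT_CAPTURE;
}

On a hang involving that batch, the buffer contents then show up as "user"
objects in /sys/class/drm/cardN/error.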

>---
> drivers/gpu/drm/i915/i915_drv.c            |  1 +
> drivers/gpu/drm/i915/i915_drv.h            |  3 +++
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 +++++++++
> drivers/gpu/drm/i915/i915_gem_request.c    | 16 ++++++++++++
> drivers/gpu/drm/i915/i915_gem_request.h    | 11 ++++++++
> drivers/gpu/drm/i915/i915_gpu_error.c      | 40 +++++++++++++++++++++++++++++-
> include/uapi/drm/i915_drm.h                | 15 ++++++++++-
> 7 files changed, 96 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>index 409fc32ce2bd..842c62b96a83 100644
>--- a/drivers/gpu/drm/i915/i915_drv.c
>+++ b/drivers/gpu/drm/i915/i915_drv.c
>@@ -353,6 +353,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
> 	case I915_PARAM_HAS_EXEC_ASYNC:
> 	case I915_PARAM_HAS_EXEC_FENCE:
> 	case I915_PARAM_HAS_EXEC_FENCE_DMABUF:
>+	case I915_PARAM_HAS_EXEC_CAPTURE:
> 		/* For the time being all of these are always true;
> 		 * if some supported hardware does not have one of these
> 		 * features this value needs to be provided from
>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>index 440a4725b87f..2cc0253d6ef7 100644
>--- a/drivers/gpu/drm/i915/i915_drv.h
>+++ b/drivers/gpu/drm/i915/i915_drv.h
>@@ -1018,6 +1018,9 @@ struct i915_gpu_state {
> 			u32 *pages[0];
> 		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
>
>+		struct drm_i915_error_object **user_bo;
>+		long user_bo_count;
>+
> 		struct drm_i915_error_object *wa_ctx;
>
> 		struct drm_i915_error_request {
>diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>index 3f2796131410..e8ffe0c9a20e 100644
>--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>@@ -1113,6 +1113,18 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
> 	list_for_each_entry(vma, vmas, exec_list) {
> 		struct drm_i915_gem_object *obj = vma->obj;
>
>+		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
>+			struct i915_gem_capture_list *capture;
>+
>+			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
>+			if (unlikely(!capture))
>+				return -ENOMEM;
>+
>+			capture->next = req->capture_list;
>+			capture->vma = vma;
>+			req->capture_list = capture;
>+		}
>+
> 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
> 			continue;
>
>diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
>index ad9d4ce07fb6..3a159cac2172 100644
>--- a/drivers/gpu/drm/i915/i915_gem_request.c
>+++ b/drivers/gpu/drm/i915/i915_gem_request.c
>@@ -286,6 +286,19 @@ void i915_gem_retire_noop(struct i915_gem_active *active,
> 	/* Space left intentionally blank */
> }
>
>+static void request_free_capture_list(struct drm_i915_gem_request *request)
>+{
>+	struct i915_gem_capture_list *capture;
>+
>+	capture = request->capture_list;
>+	while (capture) {
>+		struct i915_gem_capture_list *next = capture->next;
>+
>+		kfree(capture);
>+		capture = next;
>+	}
>+}
>+
> static void i915_gem_request_retire(struct drm_i915_gem_request *request)
> {
> 	struct intel_engine_cs *engine = request->engine;
>@@ -320,6 +333,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
> 	}
> 	unreserve_seqno(request->engine);
>
>+	request_free_capture_list(request);
>+
> 	/* Walk through the active list, calling retire on each. This allows
> 	 * objects to track their GPU activity and mark themselves as idle
> 	 * when their *last* active request is completed (updating state
>@@ -615,6 +630,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> 	req->global_seqno = 0;
> 	req->file_priv = NULL;
> 	req->batch = NULL;
>+	req->capture_list = NULL;
>
> 	/*
> 	 * Reserve space in the ring buffer for all the commands required to
>diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
>index 0efee879df23..cc24a6c72748 100644
>--- a/drivers/gpu/drm/i915/i915_gem_request.h
>+++ b/drivers/gpu/drm/i915/i915_gem_request.h
>@@ -73,6 +73,11 @@ struct i915_priotree {
> #define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)
> };
>
>+struct i915_gem_capture_list {
>+	struct i915_gem_capture_list *next;
>+	struct i915_vma *vma;
>+};
>+
> /**
>  * Request queue structure.
>  *
>@@ -167,6 +172,12 @@ struct drm_i915_gem_request {
> 	 * error state dump only).
> 	 */
> 	struct i915_vma *batch;
>+	/** Additional buffers requested by userspace to be captured upon
>+	 * a GPU hang. The vma/obj on this list are protected by their
>+	 * active reference - all objects on this list must also be
>+	 * on the active_list (of their final request).
>+	 */
>+	struct i915_gem_capture_list *capture_list;
> 	struct list_head active_list;
>
> 	/** Time at which this request was emitted, in jiffies. */
>diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>index 2b1d15668192..76855e1d8795 100644
>--- a/drivers/gpu/drm/i915/i915_gpu_error.c
>+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>@@ -709,6 +709,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
> 			print_error_obj(m, dev_priv->engine[i], NULL, obj);
> 		}
>
>+		for (j = 0; j < ee->user_bo_count; j++)
>+			print_error_obj(m, dev_priv->engine[i],
>+					"user", ee->user_bo[j]);
>+
> 		if (ee->num_requests) {
> 			err_printf(m, "%s --- %d requests\n",
> 				   dev_priv->engine[i]->name,
>@@ -822,11 +826,15 @@ void __i915_gpu_state_free(struct kref *error_ref)
> {
> 	struct i915_gpu_state *error =
> 		container_of(error_ref, typeof(*error), ref);
>-	int i;
>+	long i, j;
>
> 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> 		struct drm_i915_error_engine *ee = &error->engine[i];
>
>+		for (j = 0; j < ee->user_bo_count; j++)
>+			i915_error_object_free(ee->user_bo[j]);
>+		kfree(ee->user_bo);
>+
> 		i915_error_object_free(ee->batchbuffer);
> 		i915_error_object_free(ee->wa_batchbuffer);
> 		i915_error_object_free(ee->ringbuffer);
>@@ -1343,6 +1351,35 @@ static void record_context(struct drm_i915_error_context *e,
> 	e->active = ctx->active_count;
> }
>
>+static void request_record_user_bo(struct drm_i915_gem_request *request,
>+				   struct drm_i915_error_engine *ee)
>+{
>+	struct i915_gem_capture_list *c;
>+	struct drm_i915_error_object **bo;
>+	long count;
>+
>+	count = 0;
>+	for (c = request->capture_list; c; c = c->next)
>+		count++;
>+
>+	bo = NULL;
>+	if (count)
>+		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
>+	if (!bo)
>+		return;
>+
>+	count = 0;
>+	for (c = request->capture_list; c; c = c->next) {
>+		bo[count] = i915_error_object_create(request->i915, c->vma);
>+		if (!bo[count])
>+			break;
>+		count++;
>+	}
>+
>+	ee->user_bo = bo;
>+	ee->user_bo_count = count;
>+}
>+
> static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> 				  struct i915_gpu_state *error)
> {
>@@ -1389,6 +1426,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> 				ee->wa_batchbuffer =
> 					i915_error_object_create(dev_priv,
> 								 engine->scratch);
>+			request_record_user_bo(request, ee);
>
> 			ee->ctx =
> 				i915_error_object_create(dev_priv,
>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>index ebc7641b5252..9eda849df680 100644
>--- a/include/uapi/drm/i915_drm.h
>+++ b/include/uapi/drm/i915_drm.h
>@@ -418,6 +418,12 @@ typedef struct drm_i915_irq_wait {
>  */
> #define I915_PARAM_HAS_EXEC_FENCE_DMABUF 45
>
>+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to capture
>+ * user specified buffers for post-mortem debugging of GPU hangs. See
>+ * EXEC_OBJECT_CAPTURE.
>+ */
>+#define I915_PARAM_HAS_EXEC_CAPTURE	 46
>+
> typedef struct drm_i915_getparam {
> 	__s32 param;
> 	/*
>@@ -779,8 +785,15 @@ struct drm_i915_gem_exec_object2 {
>  * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
>  */
> #define EXEC_OBJECT_ASYNC		(1<<6)
>+/* Request that the contents of this execobject be copied into the error
>+ * state upon a GPU hang involving this batch for post-mortem debugging.
>+ * These buffers are recorded in no particular order as "user" in
>+ * /sys/class/drm/cardN/error. Query I915_PARAM_HAS_EXEC_CAPTURE to see
>+ * if the kernel supports this flag.
>+ */
>+#define EXEC_OBJECT_CAPTURE		(1<<7)
> /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
>-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
>+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_CAPTURE<<1)
> 	__u64 flags;
>
> 	union {
>-- 
>2.11.0
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 01/15] drm/i915: Copy user requested buffers into the error state
  2017-02-23 16:18 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
  2017-02-28  6:11   ` Ben Widawsky
@ 2017-02-28 14:17   ` Joonas Lahtinen
  1 sibling, 0 replies; 23+ messages in thread
From: Joonas Lahtinen @ 2017-02-28 14:17 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Ben Widawsky

On to, 2017-02-23 at 16:18 +0000, Chris Wilson wrote:
> Introduce a new execobject.flag (EXEC_OBJECT_CAPTURE) that userspace may
> use to indicate that it wants the contents of this buffer preserved in
> the error state (/sys/class/drm/cardN/error) following a GPU hang
> involving this batch.
> 
> Use this at your discretion: the contents of the error state, although
> compressed, are allocated with GFP_ATOMIC (i.e. limited) and kept for all
> eternity (until the error state is destroyed).
> 
> Based on an earlier patch by Ben Widawsky <ben@bwidawsk.net>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> Cc: Matt Turner <mattst88@gmail.com>

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2017-02-28 14:17 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-23 16:18 Make execbuf fast[er] Chris Wilson
2017-02-23 16:18 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
2017-02-28  6:11   ` Ben Widawsky
2017-02-28 14:17   ` Joonas Lahtinen
2017-02-23 16:18 ` [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new Chris Wilson
2017-02-23 16:18 ` [PATCH 03/15] drm/i915: Drop spinlocks around adding to the client request list Chris Wilson
2017-02-24 12:05   ` Mika Kuoppala
2017-02-23 16:18 ` [PATCH 04/15] drm/i915: Amalgamate execbuffer parameter structures Chris Wilson
2017-02-23 16:18 ` [PATCH 05/15] drm/i915: Use vma->exec_entry as our double-entry placeholder Chris Wilson
2017-02-23 16:18 ` [PATCH 06/15] drm/i915: Split vma exec_link/evict_link Chris Wilson
2017-02-24 12:20   ` Mika Kuoppala
2017-02-23 16:18 ` [PATCH 07/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf Chris Wilson
2017-02-24 12:32   ` Mika Kuoppala
2017-02-23 16:18 ` [PATCH 08/15] drm/i915: Store a direct lookup from object handle to vma Chris Wilson
2017-02-23 16:18 ` [PATCH 09/15] drm/i915: Pass vma to relocate entry Chris Wilson
2017-02-23 16:18 ` [PATCH 10/15] drm/i915: Eliminate lots of iterations over the execobjects array Chris Wilson
2017-02-23 16:18 ` [PATCH 11/15] drm/i915: First try the previous execbuffer location Chris Wilson
2017-02-23 16:18 ` [PATCH 12/15] drm/i915: Wait upon userptr get-user-pages within execbuffer Chris Wilson
2017-02-24 13:53   ` Michał Winiarski
2017-02-24 14:23     ` Chris Wilson
2017-02-23 16:18 ` [PATCH 13/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper Chris Wilson
2017-02-23 16:18 ` [PATCH 14/15] drm/i915: Allow execbuffer to use the first object as the batch Chris Wilson
2017-02-23 16:18 ` [PATCH 15/15] drm/i915: Async GPU relocation processing Chris Wilson
