* Make execbuf fast and green
@ 2017-03-16 13:19 Chris Wilson
  2017-03-16 13:19 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
                   ` (18 more replies)
  0 siblings, 19 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx

Same series as before (and so on...), resent to keep Joonas's inbox full.
-Chris


* [PATCH 01/15] drm/i915: Copy user requested buffers into the error state
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
@ 2017-03-16 13:19 ` Chris Wilson
  2017-03-16 13:19 ` [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new Chris Wilson
                   ` (17 subsequent siblings)
  18 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

Introduce a new execobject.flag (EXEC_OBJECT_CAPTURE) that userspace may
use to indicate that it wants the contents of this buffer preserved in
the error state (/sys/class/drm/cardN/error) following a GPU hang
involving this batch.

Use this at your discretion: the contents of the error state, although
compressed, are allocated with GFP_ATOMIC (i.e. limited) and kept for all
eternity (until the error state is destroyed).
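
For illustration only (not part of the patch), a userspace caller might use
the new interface roughly like this; drmIoctl() comes from libdrm, and
bo_handle stands in for a GEM handle obtained elsewhere:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>		/* drmIoctl(), from libdrm */
#include <drm/i915_drm.h>

/* Ask whether the running kernel supports EXEC_OBJECT_CAPTURE. */
static bool has_exec_capture(int fd)
{
	drm_i915_getparam_t gp;
	int value = 0;

	memset(&gp, 0, sizeof(gp));
	gp.param = I915_PARAM_HAS_EXEC_CAPTURE;
	gp.value = &value;

	return drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0 && value > 0;
}

/* Mark one execobject (bo_handle is an assumed, pre-existing GEM handle)
 * for inclusion in the error state should a GPU hang involve this batch. */
static void mark_for_capture(struct drm_i915_gem_exec_object2 *obj,
			     uint32_t bo_handle)
{
	memset(obj, 0, sizeof(*obj));
	obj->handle = bo_handle;
	obj->flags = EXEC_OBJECT_CAPTURE;
}

After a hang involving such a batch, the marked buffers show up labelled
"user" in /sys/class/drm/cardN/error alongside the other captured objects.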

Based on an earlier patch by Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_drv.h            |  3 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 +++++++++
 drivers/gpu/drm/i915/i915_gem_request.c    | 16 ++++++++++++
 drivers/gpu/drm/i915/i915_gem_request.h    | 11 ++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c      | 40 +++++++++++++++++++++++++++++-
 include/uapi/drm/i915_drm.h                | 15 ++++++++++-
 7 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 9164167cd147..9d8c8b928aab 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -350,6 +350,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
 	case I915_PARAM_HAS_EXEC_ASYNC:
 	case I915_PARAM_HAS_EXEC_FENCE:
+	case I915_PARAM_HAS_EXEC_CAPTURE:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6e14c7d089b8..3c9551147e28 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1035,6 +1035,9 @@ struct i915_gpu_state {
 			u32 *pages[0];
 		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
+		struct drm_i915_error_object **user_bo;
+		long user_bo_count;
+
 		struct drm_i915_error_object *wa_ctx;
 
 		struct drm_i915_error_request {
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index dd7181ed5eca..cc6082a80d2d 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1112,6 +1112,18 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
+		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
+			struct i915_gem_capture_list *capture;
+
+			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
+			if (unlikely(!capture))
+				return -ENOMEM;
+
+			capture->next = req->capture_list;
+			capture->vma = vma;
+			req->capture_list = capture;
+		}
+
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 			continue;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 1e1d9f2072cd..73e34cdc67c4 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -270,6 +270,19 @@ void i915_gem_retire_noop(struct i915_gem_active *active,
 	/* Space left intentionally blank */
 }
 
+static void request_free_capture_list(struct drm_i915_gem_request *request)
+{
+	struct i915_gem_capture_list *capture;
+
+	capture = request->capture_list;
+	while (capture) {
+		struct i915_gem_capture_list *next = capture->next;
+
+		kfree(capture);
+		capture = next;
+	}
+}
+
 static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -304,6 +317,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	}
 	unreserve_seqno(request->engine);
 
+	request_free_capture_list(request);
+
 	/* Walk through the active list, calling retire on each. This allows
 	 * objects to track their GPU activity and mark themselves as idle
 	 * when their *last* active request is completed (updating state
@@ -602,6 +617,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	req->global_seqno = 0;
 	req->file_priv = NULL;
 	req->batch = NULL;
+	req->capture_list = NULL;
 
 	/*
 	 * Reserve space in the ring buffer for all the commands required to
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 0cef887b0de4..4eb642960393 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -73,6 +73,11 @@ struct i915_priotree {
 #define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)
 };
 
+struct i915_gem_capture_list {
+	struct i915_gem_capture_list *next;
+	struct i915_vma *vma;
+};
+
 /**
  * Request queue structure.
  *
@@ -167,6 +172,12 @@ struct drm_i915_gem_request {
 	 * error state dump only).
 	 */
 	struct i915_vma *batch;
+	/** Additional buffers requested by userspace to be captured upon
+	 * a GPU hang. The vma/obj on this list are protected by their
+	 * active reference - all objects on this list must also be
+	 * on the active_list (of their final request).
+	 */
+	struct i915_gem_capture_list *capture_list;
 	struct list_head active_list;
 
 	/** Time at which this request was emitted, in jiffies. */
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 8effc59f5cb5..4b247b050dcd 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -712,6 +712,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			print_error_obj(m, dev_priv->engine[i], NULL, obj);
 		}
 
+		for (j = 0; j < ee->user_bo_count; j++)
+			print_error_obj(m, dev_priv->engine[i],
+					"user", ee->user_bo[j]);
+
 		if (ee->num_requests) {
 			err_printf(m, "%s --- %d requests\n",
 				   dev_priv->engine[i]->name,
@@ -825,11 +829,15 @@ void __i915_gpu_state_free(struct kref *error_ref)
 {
 	struct i915_gpu_state *error =
 		container_of(error_ref, typeof(*error), ref);
-	int i;
+	long i, j;
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		struct drm_i915_error_engine *ee = &error->engine[i];
 
+		for (j = 0; j < ee->user_bo_count; j++)
+			i915_error_object_free(ee->user_bo[j]);
+		kfree(ee->user_bo);
+
 		i915_error_object_free(ee->batchbuffer);
 		i915_error_object_free(ee->wa_batchbuffer);
 		i915_error_object_free(ee->ringbuffer);
@@ -1346,6 +1354,35 @@ static void record_context(struct drm_i915_error_context *e,
 	e->active = ctx->active_count;
 }
 
+static void request_record_user_bo(struct drm_i915_gem_request *request,
+				   struct drm_i915_error_engine *ee)
+{
+	struct i915_gem_capture_list *c;
+	struct drm_i915_error_object **bo;
+	long count;
+
+	count = 0;
+	for (c = request->capture_list; c; c = c->next)
+		count++;
+
+	bo = NULL;
+	if (count)
+		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
+	if (!bo)
+		return;
+
+	count = 0;
+	for (c = request->capture_list; c; c = c->next) {
+		bo[count] = i915_error_object_create(request->i915, c->vma);
+		if (!bo[count])
+			break;
+		count++;
+	}
+
+	ee->user_bo = bo;
+	ee->user_bo_count = count;
+}
+
 static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				  struct i915_gpu_state *error)
 {
@@ -1392,6 +1429,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				ee->wa_batchbuffer =
 					i915_error_object_create(dev_priv,
 								 engine->scratch);
+			request_record_user_bo(request, ee);
 
 			ee->ctx =
 				i915_error_object_create(dev_priv,
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 3554495bef13..176c5a70300b 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -412,6 +412,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_FENCE	 44
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to capture
+ * user specified buffers for post-mortem debugging of GPU hangs. See
+ * EXEC_OBJECT_CAPTURE.
+ */
+#define I915_PARAM_HAS_EXEC_CAPTURE	 45
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -773,8 +779,15 @@ struct drm_i915_gem_exec_object2 {
  * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
  */
 #define EXEC_OBJECT_ASYNC		(1<<6)
+/* Request that the contents of this execobject be copied into the error
+ * state upon a GPU hang involving this batch for post-mortem debugging.
+ * These buffers are recorded in no particular order as "user" in
+ * /sys/class/drm/cardN/error. Query I915_PARAM_HAS_EXEC_CAPTURE to see
+ * if the kernel supports this flag.
+ */
+#define EXEC_OBJECT_CAPTURE		(1<<7)
 /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_CAPTURE<<1)
 	__u64 flags;
 
 	union {
-- 
2.11.0


* [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
  2017-03-16 13:19 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
@ 2017-03-16 13:19 ` Chris Wilson
  2017-03-17  8:52   ` Joonas Lahtinen
  2017-03-16 13:19 ` [PATCH 03/15] drm/i915: Amalgamate execbuffer parameter structures Chris Wilson
                   ` (16 subsequent siblings)
  18 siblings, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx

Since obj->active_count is only updated upon retirement, if we see an
active object in the batch pool, double check that it is still active
before deciding to allocate a new object.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_batch_pool.c | 37 ++++++++++++++----------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_batch_pool.c b/drivers/gpu/drm/i915/i915_gem_batch_pool.c
index 99ceae7855f8..41aa598c4f3b 100644
--- a/drivers/gpu/drm/i915/i915_gem_batch_pool.c
+++ b/drivers/gpu/drm/i915/i915_gem_batch_pool.c
@@ -96,8 +96,7 @@ struct drm_i915_gem_object *
 i915_gem_batch_pool_get(struct i915_gem_batch_pool *pool,
 			size_t size)
 {
-	struct drm_i915_gem_object *obj = NULL;
-	struct drm_i915_gem_object *tmp;
+	struct drm_i915_gem_object *obj;
 	struct list_head *list;
 	int n, ret;
 
@@ -112,31 +111,29 @@ i915_gem_batch_pool_get(struct i915_gem_batch_pool *pool,
 		n = ARRAY_SIZE(pool->cache_list) - 1;
 	list = &pool->cache_list[n];
 
-	list_for_each_entry(tmp, list, batch_pool_link) {
+	list_for_each_entry(obj, list, batch_pool_link) {
 		/* The batches are strictly LRU ordered */
-		if (i915_gem_object_is_active(tmp))
-			break;
+		if (i915_gem_object_is_active(obj)) {
+			if (!reservation_object_test_signaled_rcu(obj->resv,
+								  true))
+				break;
 
-		GEM_BUG_ON(!reservation_object_test_signaled_rcu(tmp->resv,
-								 true));
+			i915_gem_retire_requests(pool->engine->i915);
+			GEM_BUG_ON(i915_gem_object_is_active(obj));
+		}
 
-		if (tmp->base.size >= size) {
-			/* Clear the set of shared fences early */
-			reservation_object_lock(tmp->resv, NULL);
-			reservation_object_add_excl_fence(tmp->resv, NULL);
-			reservation_object_unlock(tmp->resv);
+		GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv,
+								 true));
 
-			obj = tmp;
-			break;
-		}
+		if (obj->base.size >= size)
+			goto found;
 	}
 
-	if (obj == NULL) {
-		obj = i915_gem_object_create_internal(pool->engine->i915, size);
-		if (IS_ERR(obj))
-			return obj;
-	}
+	obj = i915_gem_object_create_internal(pool->engine->i915, size);
+	if (IS_ERR(obj))
+		return obj;
 
+found:
 	ret = i915_gem_object_pin_pages(obj);
 	if (ret)
 		return ERR_PTR(ret);
-- 
2.11.0


* [PATCH 03/15] drm/i915: Amalgamate execbuffer parameter structures
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
  2017-03-16 13:19 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
  2017-03-16 13:19 ` [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new Chris Wilson
@ 2017-03-16 13:19 ` Chris Wilson
  2017-03-17 10:04   ` Joonas Lahtinen
  2017-03-16 13:19 ` [PATCH 04/15] drm/i915: Use vma->exec_entry as our double-entry placeholder Chris Wilson
                   ` (15 subsequent siblings)
  18 siblings, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx

Combine the two slightly overlapping parameter structures we pass around
the execbuffer routines into one.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 548 ++++++++++++-----------------
 1 file changed, 232 insertions(+), 316 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index cc6082a80d2d..05e579c5cf46 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -50,70 +50,78 @@
 
 #define BATCH_OFFSET_BIAS (256*1024)
 
-struct i915_execbuffer_params {
-	struct drm_device               *dev;
-	struct drm_file                 *file;
-	struct i915_vma			*batch;
-	u32				dispatch_flags;
-	u32				args_batch_start_offset;
-	struct intel_engine_cs          *engine;
-	struct i915_gem_context         *ctx;
-	struct drm_i915_gem_request     *request;
-};
+#define __I915_EXEC_ILLEGAL_FLAGS \
+	(__I915_EXEC_UNKNOWN_FLAGS | I915_EXEC_CONSTANTS_MASK)
 
-struct eb_vmas {
+struct i915_execbuffer {
 	struct drm_i915_private *i915;
+	struct drm_file *file;
+	struct drm_i915_gem_execbuffer2 *args;
+	struct drm_i915_gem_exec_object2 *exec;
+	struct intel_engine_cs *engine;
+	struct i915_gem_context *ctx;
+	struct i915_address_space *vm;
+	struct i915_vma *batch;
+	struct drm_i915_gem_request *request;
+	u32 batch_start_offset;
+	u32 batch_len;
+	unsigned int dispatch_flags;
+	struct drm_i915_gem_exec_object2 shadow_exec_entry;
+	bool need_relocs;
 	struct list_head vmas;
+	struct reloc_cache {
+		struct drm_mm_node node;
+		unsigned long vaddr;
+		unsigned int page;
+		bool use_64bit_reloc;
+	} reloc_cache;
 	int and;
 	union {
-		struct i915_vma *lut[0];
-		struct hlist_head buckets[0];
+		struct i915_vma **lut;
+		struct hlist_head *buckets;
 	};
 };
 
-static struct eb_vmas *
-eb_create(struct drm_i915_private *i915,
-	  struct drm_i915_gem_execbuffer2 *args)
+static int
+eb_create(struct i915_execbuffer *eb)
 {
-	struct eb_vmas *eb = NULL;
-
-	if (args->flags & I915_EXEC_HANDLE_LUT) {
-		unsigned size = args->buffer_count;
+	eb->lut = NULL;
+	if (eb->args->flags & I915_EXEC_HANDLE_LUT) {
+		unsigned int size = eb->args->buffer_count;
 		size *= sizeof(struct i915_vma *);
-		size += sizeof(struct eb_vmas);
-		eb = kmalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+		eb->lut = kmalloc(size,
+				  GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
 	}
 
-	if (eb == NULL) {
-		unsigned size = args->buffer_count;
-		unsigned count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
+	if (!eb->lut) {
+		unsigned int size = eb->args->buffer_count;
+		unsigned int count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
 		BUILD_BUG_ON_NOT_POWER_OF_2(PAGE_SIZE / sizeof(struct hlist_head));
 		while (count > 2*size)
 			count >>= 1;
-		eb = kzalloc(count*sizeof(struct hlist_head) +
-			     sizeof(struct eb_vmas),
-			     GFP_TEMPORARY);
-		if (eb == NULL)
-			return eb;
+		eb->lut = kzalloc(count*sizeof(struct hlist_head),
+				  GFP_TEMPORARY);
+		if (!eb->lut)
+			return -ENOMEM;
 
 		eb->and = count - 1;
-	} else
-		eb->and = -args->buffer_count;
+	} else {
+		eb->and = -eb->args->buffer_count;
+	}
 
-	eb->i915 = i915;
 	INIT_LIST_HEAD(&eb->vmas);
-	return eb;
+	return 0;
 }
 
 static void
-eb_reset(struct eb_vmas *eb)
+eb_reset(struct i915_execbuffer *eb)
 {
 	if (eb->and >= 0)
 		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
 }
 
 static struct i915_vma *
-eb_get_batch(struct eb_vmas *eb)
+eb_get_batch(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_list);
 
@@ -133,34 +141,30 @@ eb_get_batch(struct eb_vmas *eb)
 }
 
 static int
-eb_lookup_vmas(struct eb_vmas *eb,
-	       struct drm_i915_gem_exec_object2 *exec,
-	       const struct drm_i915_gem_execbuffer2 *args,
-	       struct i915_address_space *vm,
-	       struct drm_file *file)
+eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 	struct drm_i915_gem_object *obj;
 	struct list_head objects;
 	int i, ret;
 
 	INIT_LIST_HEAD(&objects);
-	spin_lock(&file->table_lock);
+	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
-	for (i = 0; i < args->buffer_count; i++) {
-		obj = to_intel_bo(idr_find(&file->object_idr, exec[i].handle));
+	for (i = 0; i < eb->args->buffer_count; i++) {
+		obj = to_intel_bo(idr_find(&eb->file->object_idr, eb->exec[i].handle));
 		if (obj == NULL) {
-			spin_unlock(&file->table_lock);
+			spin_unlock(&eb->file->table_lock);
 			DRM_DEBUG("Invalid object handle %d at index %d\n",
-				   exec[i].handle, i);
+				   eb->exec[i].handle, i);
 			ret = -ENOENT;
 			goto err;
 		}
 
 		if (!list_empty(&obj->obj_exec_link)) {
-			spin_unlock(&file->table_lock);
+			spin_unlock(&eb->file->table_lock);
 			DRM_DEBUG("Object %p [handle %d, index %d] appears more than once in object list\n",
-				   obj, exec[i].handle, i);
+				   obj, eb->exec[i].handle, i);
 			ret = -EINVAL;
 			goto err;
 		}
@@ -168,7 +172,7 @@ eb_lookup_vmas(struct eb_vmas *eb,
 		i915_gem_object_get(obj);
 		list_add_tail(&obj->obj_exec_link, &objects);
 	}
-	spin_unlock(&file->table_lock);
+	spin_unlock(&eb->file->table_lock);
 
 	i = 0;
 	while (!list_empty(&objects)) {
@@ -186,7 +190,7 @@ eb_lookup_vmas(struct eb_vmas *eb,
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
-		vma = i915_vma_instance(obj, vm, NULL);
+		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
 			ret = PTR_ERR(vma);
@@ -197,11 +201,13 @@ eb_lookup_vmas(struct eb_vmas *eb,
 		list_add_tail(&vma->exec_list, &eb->vmas);
 		list_del_init(&obj->obj_exec_link);
 
-		vma->exec_entry = &exec[i];
+		vma->exec_entry = &eb->exec[i];
 		if (eb->and < 0) {
 			eb->lut[i] = vma;
 		} else {
-			uint32_t handle = args->flags & I915_EXEC_HANDLE_LUT ? i : exec[i].handle;
+			u32 handle =
+				eb->args->flags & I915_EXEC_HANDLE_LUT ?
+				i : eb->exec[i].handle;
 			vma->exec_handle = handle;
 			hlist_add_head(&vma->exec_node,
 				       &eb->buckets[handle & eb->and]);
@@ -228,7 +234,7 @@ eb_lookup_vmas(struct eb_vmas *eb,
 	return ret;
 }
 
-static struct i915_vma *eb_get_vma(struct eb_vmas *eb, unsigned long handle)
+static struct i915_vma *eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 {
 	if (eb->and < 0) {
 		if (handle >= -eb->and)
@@ -248,7 +254,7 @@ static struct i915_vma *eb_get_vma(struct eb_vmas *eb, unsigned long handle)
 }
 
 static void
-i915_gem_execbuffer_unreserve_vma(struct i915_vma *vma)
+eb_unreserve_vma(struct i915_vma *vma)
 {
 	struct drm_i915_gem_exec_object2 *entry;
 
@@ -266,8 +272,10 @@ i915_gem_execbuffer_unreserve_vma(struct i915_vma *vma)
 	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
 }
 
-static void eb_destroy(struct eb_vmas *eb)
+static void eb_destroy(struct i915_execbuffer *eb)
 {
+	i915_gem_context_put(eb->ctx);
+
 	while (!list_empty(&eb->vmas)) {
 		struct i915_vma *vma;
 
@@ -275,11 +283,10 @@ static void eb_destroy(struct eb_vmas *eb)
 				       struct i915_vma,
 				       exec_list);
 		list_del_init(&vma->exec_list);
-		i915_gem_execbuffer_unreserve_vma(vma);
+		eb_unreserve_vma(vma);
 		vma->exec_entry = NULL;
 		i915_vma_put(vma);
 	}
-	kfree(eb);
 }
 
 static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
@@ -320,20 +327,11 @@ relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
 	return gen8_canonical_addr((int)reloc->delta + target_offset);
 }
 
-struct reloc_cache {
-	struct drm_i915_private *i915;
-	struct drm_mm_node node;
-	unsigned long vaddr;
-	unsigned int page;
-	bool use_64bit_reloc;
-};
-
 static void reloc_cache_init(struct reloc_cache *cache,
 			     struct drm_i915_private *i915)
 {
 	cache->page = -1;
 	cache->vaddr = 0;
-	cache->i915 = i915;
 	/* Must be a variable in the struct to allow GCC to unroll. */
 	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
 	cache->node.allocated = false;
@@ -351,7 +349,14 @@ static inline unsigned int unmask_flags(unsigned long p)
 
 #define KMAP 0x4 /* after CLFLUSH_FLAGS */
 
-static void reloc_cache_fini(struct reloc_cache *cache)
+static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
+{
+	struct drm_i915_private *i915 =
+		container_of(cache, struct i915_execbuffer, reloc_cache)->i915;
+	return &i915->ggtt;
+}
+
+static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
@@ -369,7 +374,7 @@ static void reloc_cache_fini(struct reloc_cache *cache)
 		wmb();
 		io_mapping_unmap_atomic((void __iomem *)vaddr);
 		if (cache->node.allocated) {
-			struct i915_ggtt *ggtt = &cache->i915->ggtt;
+			struct i915_ggtt *ggtt = cache_to_ggtt(cache);
 
 			ggtt->base.clear_range(&ggtt->base,
 					       cache->node.start,
@@ -379,6 +384,9 @@ static void reloc_cache_fini(struct reloc_cache *cache)
 			i915_vma_unpin((struct i915_vma *)cache->node.mm);
 		}
 	}
+
+	cache->vaddr = 0;
+	cache->page = -1;
 }
 
 static void *reloc_kmap(struct drm_i915_gem_object *obj,
@@ -417,7 +425,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
 			 struct reloc_cache *cache,
 			 int page)
 {
-	struct i915_ggtt *ggtt = &cache->i915->ggtt;
+	struct i915_ggtt *ggtt = cache_to_ggtt(cache);
 	unsigned long offset;
 	void *vaddr;
 
@@ -467,7 +475,8 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
 		offset += page << PAGE_SHIFT;
 	}
 
-	vaddr = (void __force *) io_mapping_map_atomic_wc(&cache->i915->ggtt.mappable, offset);
+	vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->mappable,
+							 offset);
 	cache->page = page;
 	cache->vaddr = (unsigned long)vaddr;
 
@@ -546,12 +555,10 @@ relocate_entry(struct drm_i915_gem_object *obj,
 }
 
 static int
-i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
-				   struct eb_vmas *eb,
-				   struct drm_i915_gem_relocation_entry *reloc,
-				   struct reloc_cache *cache)
+eb_relocate_entry(struct drm_i915_gem_object *obj,
+		  struct i915_execbuffer *eb,
+		  struct drm_i915_gem_relocation_entry *reloc)
 {
-	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 	struct drm_gem_object *target_obj;
 	struct drm_i915_gem_object *target_i915_obj;
 	struct i915_vma *target_vma;
@@ -570,8 +577,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
 	 * pipe_control writes because the gpu doesn't properly redirect them
 	 * through the ppgtt for non_secure batchbuffers. */
-	if (unlikely(IS_GEN6(dev_priv) &&
-	    reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
+	if (unlikely(IS_GEN6(eb->i915) &&
+		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
 		ret = i915_vma_bind(target_vma, target_i915_obj->cache_level,
 				    PIN_GLOBAL);
 		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
@@ -612,7 +619,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 
 	/* Check that the relocation address is valid... */
 	if (unlikely(reloc->offset >
-		     obj->base.size - (cache->use_64bit_reloc ? 8 : 4))) {
+		     obj->base.size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
 		DRM_DEBUG("Relocation beyond object bounds: "
 			  "obj %p target %d offset %d size %d.\n",
 			  obj, reloc->target_handle,
@@ -628,7 +635,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		return -EINVAL;
 	}
 
-	ret = relocate_entry(obj, reloc, cache, target_offset);
+	ret = relocate_entry(obj, reloc, &eb->reloc_cache, target_offset);
 	if (ret)
 		return ret;
 
@@ -637,19 +644,15 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	return 0;
 }
 
-static int
-i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
-				 struct eb_vmas *eb)
+static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
 	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
 	struct drm_i915_gem_relocation_entry __user *user_relocs;
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	struct reloc_cache cache;
 	int remain, ret = 0;
 
 	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
-	reloc_cache_init(&cache, eb->i915);
 
 	remain = entry->relocation_count;
 	while (remain) {
@@ -678,7 +681,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 		do {
 			u64 offset = r->presumed_offset;
 
-			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, &cache);
+			ret = eb_relocate_entry(vma->obj, eb, r);
 			if (ret)
 				goto out;
 
@@ -710,39 +713,35 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 	}
 
 out:
-	reloc_cache_fini(&cache);
+	reloc_cache_reset(&eb->reloc_cache);
 	return ret;
 #undef N_RELOC
 }
 
 static int
-i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
-				      struct eb_vmas *eb,
-				      struct drm_i915_gem_relocation_entry *relocs)
+eb_relocate_vma_slow(struct i915_vma *vma,
+		     struct i915_execbuffer *eb,
+		     struct drm_i915_gem_relocation_entry *relocs)
 {
 	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	struct reloc_cache cache;
 	int i, ret = 0;
 
-	reloc_cache_init(&cache, eb->i915);
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache);
+		ret = eb_relocate_entry(vma->obj, eb, &relocs[i]);
 		if (ret)
 			break;
 	}
-	reloc_cache_fini(&cache);
-
+	reloc_cache_reset(&eb->reloc_cache);
 	return ret;
 }
 
-static int
-i915_gem_execbuffer_relocate(struct eb_vmas *eb)
+static int eb_relocate(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 	int ret = 0;
 
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		ret = i915_gem_execbuffer_relocate_vma(vma, eb);
+		ret = eb_relocate_vma(vma, eb);
 		if (ret)
 			break;
 	}
@@ -757,9 +756,9 @@ static bool only_mappable_for_reloc(unsigned int flags)
 }
 
 static int
-i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
-				struct intel_engine_cs *engine,
-				bool *need_reloc)
+eb_reserve_vma(struct i915_vma *vma,
+	       struct intel_engine_cs *engine,
+	       bool *need_reloc)
 {
 	struct drm_i915_gem_object *obj = vma->obj;
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
@@ -878,33 +877,26 @@ eb_vma_misplaced(struct i915_vma *vma)
 	return false;
 }
 
-static int
-i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
-			    struct list_head *vmas,
-			    struct i915_gem_context *ctx,
-			    bool *need_relocs)
+static int eb_reserve(struct i915_execbuffer *eb)
 {
+	const bool has_fenced_gpu_access = INTEL_GEN(eb->i915) < 4;
 	struct drm_i915_gem_object *obj;
 	struct i915_vma *vma;
-	struct i915_address_space *vm;
 	struct list_head ordered_vmas;
 	struct list_head pinned_vmas;
-	bool has_fenced_gpu_access = INTEL_GEN(engine->i915) < 4;
 	int retry;
 
-	vm = list_first_entry(vmas, struct i915_vma, exec_list)->vm;
-
 	INIT_LIST_HEAD(&ordered_vmas);
 	INIT_LIST_HEAD(&pinned_vmas);
-	while (!list_empty(vmas)) {
+	while (!list_empty(&eb->vmas)) {
 		struct drm_i915_gem_exec_object2 *entry;
 		bool need_fence, need_mappable;
 
-		vma = list_first_entry(vmas, struct i915_vma, exec_list);
+		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
 		obj = vma->obj;
 		entry = vma->exec_entry;
 
-		if (ctx->flags & CONTEXT_NO_ZEROMAP)
+		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
 			entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
 
 		if (!has_fenced_gpu_access)
@@ -925,8 +917,8 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
 		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
 		obj->base.pending_write_domain = 0;
 	}
-	list_splice(&ordered_vmas, vmas);
-	list_splice(&pinned_vmas, vmas);
+	list_splice(&ordered_vmas, &eb->vmas);
+	list_splice(&pinned_vmas, &eb->vmas);
 
 	/* Attempt to pin all of the buffers into the GTT.
 	 * This is done in 3 phases:
@@ -945,27 +937,24 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
 		int ret = 0;
 
 		/* Unbind any ill-fitting objects or pin. */
-		list_for_each_entry(vma, vmas, exec_list) {
+		list_for_each_entry(vma, &eb->vmas, exec_list) {
 			if (!drm_mm_node_allocated(&vma->node))
 				continue;
 
 			if (eb_vma_misplaced(vma))
 				ret = i915_vma_unbind(vma);
 			else
-				ret = i915_gem_execbuffer_reserve_vma(vma,
-								      engine,
-								      need_relocs);
+				ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
 			if (ret)
 				goto err;
 		}
 
 		/* Bind fresh objects */
-		list_for_each_entry(vma, vmas, exec_list) {
+		list_for_each_entry(vma, &eb->vmas, exec_list) {
 			if (drm_mm_node_allocated(&vma->node))
 				continue;
 
-			ret = i915_gem_execbuffer_reserve_vma(vma, engine,
-							      need_relocs);
+			ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
 			if (ret)
 				goto err;
 		}
@@ -975,39 +964,30 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
 			return ret;
 
 		/* Decrement pin count for bound objects */
-		list_for_each_entry(vma, vmas, exec_list)
-			i915_gem_execbuffer_unreserve_vma(vma);
+		list_for_each_entry(vma, &eb->vmas, exec_list)
+			eb_unreserve_vma(vma);
 
-		ret = i915_gem_evict_vm(vm, true);
+		ret = i915_gem_evict_vm(eb->vm, true);
 		if (ret)
 			return ret;
 	} while (1);
 }
 
 static int
-i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
-				  struct drm_i915_gem_execbuffer2 *args,
-				  struct drm_file *file,
-				  struct intel_engine_cs *engine,
-				  struct eb_vmas *eb,
-				  struct drm_i915_gem_exec_object2 *exec,
-				  struct i915_gem_context *ctx)
+eb_relocate_slow(struct i915_execbuffer *eb)
 {
+	const unsigned int count = eb->args->buffer_count;
+	struct drm_device *dev = &eb->i915->drm;
 	struct drm_i915_gem_relocation_entry *reloc;
-	struct i915_address_space *vm;
 	struct i915_vma *vma;
-	bool need_relocs;
 	int *reloc_offset;
 	int i, total, ret;
-	unsigned count = args->buffer_count;
-
-	vm = list_first_entry(&eb->vmas, struct i915_vma, exec_list)->vm;
 
 	/* We may process another execbuffer during the unlock... */
 	while (!list_empty(&eb->vmas)) {
 		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
 		list_del_init(&vma->exec_list);
-		i915_gem_execbuffer_unreserve_vma(vma);
+		eb_unreserve_vma(vma);
 		i915_vma_put(vma);
 	}
 
@@ -1015,7 +995,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 
 	total = 0;
 	for (i = 0; i < count; i++)
-		total += exec[i].relocation_count;
+		total += eb->exec[i].relocation_count;
 
 	reloc_offset = drm_malloc_ab(count, sizeof(*reloc_offset));
 	reloc = drm_malloc_ab(total, sizeof(*reloc));
@@ -1032,10 +1012,10 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		u64 invalid_offset = (u64)-1;
 		int j;
 
-		user_relocs = u64_to_user_ptr(exec[i].relocs_ptr);
+		user_relocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
 
 		if (copy_from_user(reloc+total, user_relocs,
-				   exec[i].relocation_count * sizeof(*reloc))) {
+				   eb->exec[i].relocation_count * sizeof(*reloc))) {
 			ret = -EFAULT;
 			mutex_lock(&dev->struct_mutex);
 			goto err;
@@ -1050,7 +1030,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		 * happened we would make the mistake of assuming that the
 		 * relocations were valid.
 		 */
-		for (j = 0; j < exec[i].relocation_count; j++) {
+		for (j = 0; j < eb->exec[i].relocation_count; j++) {
 			if (__copy_to_user(&user_relocs[j].presumed_offset,
 					   &invalid_offset,
 					   sizeof(invalid_offset))) {
@@ -1061,7 +1041,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		}
 
 		reloc_offset[i] = total;
-		total += exec[i].relocation_count;
+		total += eb->exec[i].relocation_count;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
@@ -1072,20 +1052,18 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 
 	/* reacquire the objects */
 	eb_reset(eb);
-	ret = eb_lookup_vmas(eb, exec, args, vm, file);
+	ret = eb_lookup_vmas(eb);
 	if (ret)
 		goto err;
 
-	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
-	ret = i915_gem_execbuffer_reserve(engine, &eb->vmas, ctx,
-					  &need_relocs);
+	ret = eb_reserve(eb);
 	if (ret)
 		goto err;
 
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		int offset = vma->exec_entry - exec;
-		ret = i915_gem_execbuffer_relocate_vma_slow(vma, eb,
-							    reloc + reloc_offset[offset]);
+		int idx = vma->exec_entry - eb->exec;
+
+		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[idx]);
 		if (ret)
 			goto err;
 	}
@@ -1103,13 +1081,12 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 }
 
 static int
-i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
-				struct list_head *vmas)
+eb_move_to_gpu(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 	int ret;
 
-	list_for_each_entry(vma, vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
@@ -1119,9 +1096,9 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 			if (unlikely(!capture))
 				return -ENOMEM;
 
-			capture->next = req->capture_list;
+			capture->next = eb->request->capture_list;
 			capture->vma = vma;
-			req->capture_list = capture;
+			eb->request->capture_list = capture;
 		}
 
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
@@ -1133,22 +1110,22 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		}
 
 		ret = i915_gem_request_await_object
-			(req, obj, obj->base.pending_write_domain);
+			(eb->request, obj, obj->base.pending_write_domain);
 		if (ret)
 			return ret;
 	}
 
 	/* Unconditionally flush any chipset caches (for streaming writes). */
-	i915_gem_chipset_flush(req->engine->i915);
+	i915_gem_chipset_flush(eb->i915);
 
 	/* Unconditionally invalidate GPU caches and TLBs. */
-	return req->engine->emit_flush(req, EMIT_INVALIDATE);
+	return eb->engine->emit_flush(eb->request, EMIT_INVALIDATE);
 }
 
 static bool
 i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 {
-	if (exec->flags & __I915_EXEC_UNKNOWN_FLAGS)
+	if (exec->flags & __I915_EXEC_ILLEGAL_FLAGS)
 		return false;
 
 	/* Kernel clipping was a DRI1 misfeature */
@@ -1245,22 +1222,24 @@ validate_exec_list(struct drm_device *dev,
 	return 0;
 }
 
-static struct i915_gem_context *
-i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
-			  struct intel_engine_cs *engine, const u32 ctx_id)
+static int eb_select_context(struct i915_execbuffer *eb)
 {
+	unsigned int ctx_id = i915_execbuffer2_get_context_id(*eb->args);
 	struct i915_gem_context *ctx;
 
-	ctx = i915_gem_context_lookup(file->driver_priv, ctx_id);
-	if (IS_ERR(ctx))
-		return ctx;
+	ctx = i915_gem_context_lookup(eb->file->driver_priv, ctx_id);
+	if (unlikely(IS_ERR(ctx)))
+		return PTR_ERR(ctx);
 
-	if (i915_gem_context_is_banned(ctx)) {
+	if (unlikely(i915_gem_context_is_banned(ctx))) {
 		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
-		return ERR_PTR(-EIO);
+		return -EIO;
 	}
 
-	return ctx;
+	eb->ctx = i915_gem_context_get(ctx);
+	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
+
+	return 0;
 }
 
 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
@@ -1325,12 +1304,11 @@ static void eb_export_fence(struct drm_i915_gem_object *obj,
 }
 
 static void
-i915_gem_execbuffer_move_to_active(struct list_head *vmas,
-				   struct drm_i915_gem_request *req)
+eb_move_to_active(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		obj->base.write_domain = obj->base.pending_write_domain;
@@ -1340,8 +1318,8 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 			obj->base.pending_read_domains |= obj->base.read_domains;
 		obj->base.read_domains = obj->base.pending_read_domains;
 
-		i915_vma_move_to_active(vma, req, vma->exec_entry->flags);
-		eb_export_fence(obj, req, vma->exec_entry->flags);
+		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
+		eb_export_fence(obj, eb->request, vma->exec_entry->flags);
 	}
 }
 
@@ -1371,29 +1349,22 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 	return 0;
 }
 
-static struct i915_vma *
-i915_gem_execbuffer_parse(struct intel_engine_cs *engine,
-			  struct drm_i915_gem_exec_object2 *shadow_exec_entry,
-			  struct drm_i915_gem_object *batch_obj,
-			  struct eb_vmas *eb,
-			  u32 batch_start_offset,
-			  u32 batch_len,
-			  bool is_master)
+static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 {
 	struct drm_i915_gem_object *shadow_batch_obj;
 	struct i915_vma *vma;
 	int ret;
 
-	shadow_batch_obj = i915_gem_batch_pool_get(&engine->batch_pool,
-						   PAGE_ALIGN(batch_len));
+	shadow_batch_obj = i915_gem_batch_pool_get(&eb->engine->batch_pool,
+						   PAGE_ALIGN(eb->batch_len));
 	if (IS_ERR(shadow_batch_obj))
 		return ERR_CAST(shadow_batch_obj);
 
-	ret = intel_engine_cmd_parser(engine,
-				      batch_obj,
+	ret = intel_engine_cmd_parser(eb->engine,
+				      eb->batch->obj,
 				      shadow_batch_obj,
-				      batch_start_offset,
-				      batch_len,
+				      eb->batch_start_offset,
+				      eb->batch_len,
 				      is_master);
 	if (ret) {
 		if (ret == -EACCES) /* unhandled chained batch */
@@ -1407,9 +1378,8 @@ i915_gem_execbuffer_parse(struct intel_engine_cs *engine,
 	if (IS_ERR(vma))
 		goto out;
 
-	memset(shadow_exec_entry, 0, sizeof(*shadow_exec_entry));
-
-	vma->exec_entry = shadow_exec_entry;
+	vma->exec_entry =
+		memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
 	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
 	i915_gem_object_get(shadow_batch_obj);
 	list_add_tail(&vma->exec_list, &eb->vmas);
@@ -1428,46 +1398,33 @@ add_to_client(struct drm_i915_gem_request *req,
 }
 
 static int
-execbuf_submit(struct i915_execbuffer_params *params,
-	       struct drm_i915_gem_execbuffer2 *args,
-	       struct list_head *vmas)
+execbuf_submit(struct i915_execbuffer *eb)
 {
-	u64 exec_start, exec_len;
 	int ret;
 
-	ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas);
+	ret = eb_move_to_gpu(eb);
 	if (ret)
 		return ret;
 
-	ret = i915_switch_context(params->request);
+	ret = i915_switch_context(eb->request);
 	if (ret)
 		return ret;
 
-	if (args->flags & I915_EXEC_CONSTANTS_MASK) {
-		DRM_DEBUG("I915_EXEC_CONSTANTS_* unsupported\n");
-		return -EINVAL;
-	}
-
-	if (args->flags & I915_EXEC_GEN7_SOL_RESET) {
-		ret = i915_reset_gen7_sol_offsets(params->request);
+	if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
+		ret = i915_reset_gen7_sol_offsets(eb->request);
 		if (ret)
 			return ret;
 	}
 
-	exec_len   = args->batch_len;
-	exec_start = params->batch->node.start +
-		     params->args_batch_start_offset;
-
-	if (exec_len == 0)
-		exec_len = params->batch->size - params->args_batch_start_offset;
-
-	ret = params->engine->emit_bb_start(params->request,
-					    exec_start, exec_len,
-					    params->dispatch_flags);
+	ret = eb->engine->emit_bb_start(eb->request,
+					eb->batch->node.start +
+					eb->batch_start_offset,
+					eb->batch_len,
+					eb->dispatch_flags);
 	if (ret)
 		return ret;
 
-	i915_gem_execbuffer_move_to_active(vmas, params->request);
+	eb_move_to_active(eb);
 
 	return 0;
 }
@@ -1549,27 +1506,16 @@ eb_select_engine(struct drm_i915_private *dev_priv,
 }
 
 static int
-i915_gem_do_execbuffer(struct drm_device *dev, void *data,
+i915_gem_do_execbuffer(struct drm_device *dev,
 		       struct drm_file *file,
 		       struct drm_i915_gem_execbuffer2 *args,
 		       struct drm_i915_gem_exec_object2 *exec)
 {
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	struct i915_ggtt *ggtt = &dev_priv->ggtt;
-	struct eb_vmas *eb;
-	struct drm_i915_gem_exec_object2 shadow_exec_entry;
-	struct intel_engine_cs *engine;
-	struct i915_gem_context *ctx;
-	struct i915_address_space *vm;
-	struct i915_execbuffer_params params_master; /* XXX: will be removed later */
-	struct i915_execbuffer_params *params = &params_master;
-	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
-	u32 dispatch_flags;
+	struct i915_execbuffer eb;
 	struct dma_fence *in_fence = NULL;
 	struct sync_file *out_fence = NULL;
 	int out_fence_fd = -1;
 	int ret;
-	bool need_relocs;
 
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
@@ -1578,37 +1524,42 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		return ret;
 
-	dispatch_flags = 0;
+	eb.i915 = to_i915(dev);
+	eb.file = file;
+	eb.args = args;
+	eb.exec = exec;
+	eb.need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+	reloc_cache_init(&eb.reloc_cache, eb.i915);
+
+	eb.batch_start_offset = args->batch_start_offset;
+	eb.batch_len = args->batch_len;
+
+	eb.dispatch_flags = 0;
 	if (args->flags & I915_EXEC_SECURE) {
 		if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN))
 		    return -EPERM;
 
-		dispatch_flags |= I915_DISPATCH_SECURE;
+		eb.dispatch_flags |= I915_DISPATCH_SECURE;
 	}
 	if (args->flags & I915_EXEC_IS_PINNED)
-		dispatch_flags |= I915_DISPATCH_PINNED;
-
-	engine = eb_select_engine(dev_priv, file, args);
-	if (!engine)
-		return -EINVAL;
+		eb.dispatch_flags |= I915_DISPATCH_PINNED;
 
-	if (args->buffer_count < 1) {
-		DRM_DEBUG("execbuf with %d buffers\n", args->buffer_count);
+	eb.engine = eb_select_engine(eb.i915, file, args);
+	if (!eb.engine)
 		return -EINVAL;
-	}
 
 	if (args->flags & I915_EXEC_RESOURCE_STREAMER) {
-		if (!HAS_RESOURCE_STREAMER(dev_priv)) {
+		if (!HAS_RESOURCE_STREAMER(eb.i915)) {
 			DRM_DEBUG("RS is only allowed for Haswell, Gen8 and above\n");
 			return -EINVAL;
 		}
-		if (engine->id != RCS) {
+		if (eb.engine->id != RCS) {
 			DRM_DEBUG("RS is not available on %s\n",
-				 engine->name);
+				 eb.engine->name);
 			return -EINVAL;
 		}
 
-		dispatch_flags |= I915_DISPATCH_RS;
+		eb.dispatch_flags |= I915_DISPATCH_RS;
 	}
 
 	if (args->flags & I915_EXEC_FENCE_IN) {
@@ -1631,59 +1582,44 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * wakeref that we hold until the GPU has been idle for at least
 	 * 100ms.
 	 */
-	intel_runtime_pm_get(dev_priv);
+	intel_runtime_pm_get(eb.i915);
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		goto pre_mutex_err;
 
-	ctx = i915_gem_validate_context(dev, file, engine, ctx_id);
-	if (IS_ERR(ctx)) {
+	ret = eb_select_context(&eb);
+	if (ret) {
 		mutex_unlock(&dev->struct_mutex);
-		ret = PTR_ERR(ctx);
 		goto pre_mutex_err;
 	}
 
-	i915_gem_context_get(ctx);
-
-	if (ctx->ppgtt)
-		vm = &ctx->ppgtt->base;
-	else
-		vm = &ggtt->base;
-
-	memset(&params_master, 0x00, sizeof(params_master));
-
-	eb = eb_create(dev_priv, args);
-	if (eb == NULL) {
-		i915_gem_context_put(ctx);
+	if (eb_create(&eb)) {
+		i915_gem_context_put(eb.ctx);
 		mutex_unlock(&dev->struct_mutex);
 		ret = -ENOMEM;
 		goto pre_mutex_err;
 	}
 
 	/* Look up object handles */
-	ret = eb_lookup_vmas(eb, exec, args, vm, file);
+	ret = eb_lookup_vmas(&eb);
 	if (ret)
 		goto err;
 
 	/* take note of the batch buffer before we might reorder the lists */
-	params->batch = eb_get_batch(eb);
+	eb.batch = eb_get_batch(&eb);
 
 	/* Move the objects en-masse into the GTT, evicting if necessary. */
-	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
-	ret = i915_gem_execbuffer_reserve(engine, &eb->vmas, ctx,
-					  &need_relocs);
+	ret = eb_reserve(&eb);
 	if (ret)
 		goto err;
 
 	/* The objects are in their final locations, apply the relocations. */
-	if (need_relocs)
-		ret = i915_gem_execbuffer_relocate(eb);
+	if (eb.need_relocs)
+		ret = eb_relocate(&eb);
 	if (ret) {
 		if (ret == -EFAULT) {
-			ret = i915_gem_execbuffer_relocate_slow(dev, args, file,
-								engine,
-								eb, exec, ctx);
+			ret = eb_relocate_slow(&eb);
 			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 		}
 		if (ret)
@@ -1691,28 +1627,22 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	}
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (params->batch->obj->base.pending_write_domain) {
+	if (eb.batch->obj->base.pending_write_domain) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
 		goto err;
 	}
-	if (args->batch_start_offset > params->batch->size ||
-	    args->batch_len > params->batch->size - args->batch_start_offset) {
+	if (eb.batch_start_offset > eb.batch->size ||
+	    eb.batch_len > eb.batch->size - eb.batch_start_offset) {
 		DRM_DEBUG("Attempting to use out-of-bounds batch\n");
 		ret = -EINVAL;
 		goto err;
 	}
 
-	params->args_batch_start_offset = args->batch_start_offset;
-	if (engine->needs_cmd_parser && args->batch_len) {
+	if (eb.engine->needs_cmd_parser && eb.batch_len) {
 		struct i915_vma *vma;
 
-		vma = i915_gem_execbuffer_parse(engine, &shadow_exec_entry,
-						params->batch->obj,
-						eb,
-						args->batch_start_offset,
-						args->batch_len,
-						drm_is_current_master(file));
+		vma = eb_parse(&eb, drm_is_current_master(file));
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
 			goto err;
@@ -1728,19 +1658,21 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 			 * specifically don't want that set on batches the
 			 * command parser has accepted.
 			 */
-			dispatch_flags |= I915_DISPATCH_SECURE;
-			params->args_batch_start_offset = 0;
-			params->batch = vma;
+			eb.dispatch_flags |= I915_DISPATCH_SECURE;
+			eb.batch_start_offset = 0;
+			eb.batch = vma;
 		}
 	}
 
-	params->batch->obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
+	eb.batch->obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
+	if (eb.batch_len == 0)
+		eb.batch_len = eb.batch->size - eb.batch_start_offset;
 
 	/* snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
 	 * batch" bit. Hence we need to pin secure batches into the global gtt.
 	 * hsw should have this fixed, but bdw mucks it up again. */
-	if (dispatch_flags & I915_DISPATCH_SECURE) {
-		struct drm_i915_gem_object *obj = params->batch->obj;
+	if (eb.dispatch_flags & I915_DISPATCH_SECURE) {
+		struct drm_i915_gem_object *obj = eb.batch->obj;
 		struct i915_vma *vma;
 
 		/*
@@ -1759,25 +1691,24 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 			goto err;
 		}
 
-		params->batch = vma;
+		eb.batch = vma;
 	}
 
 	/* Allocate a request for this batch buffer nice and early. */
-	params->request = i915_gem_request_alloc(engine, ctx);
-	if (IS_ERR(params->request)) {
-		ret = PTR_ERR(params->request);
+	eb.request = i915_gem_request_alloc(eb.engine, eb.ctx);
+	if (IS_ERR(eb.request)) {
+		ret = PTR_ERR(eb.request);
 		goto err_batch_unpin;
 	}
 
 	if (in_fence) {
-		ret = i915_gem_request_await_dma_fence(params->request,
-						       in_fence);
+		ret = i915_gem_request_await_dma_fence(eb.request, in_fence);
 		if (ret < 0)
 			goto err_request;
 	}
 
 	if (out_fence_fd != -1) {
-		out_fence = sync_file_create(&params->request->fence);
+		out_fence = sync_file_create(&eb.request->fence);
 		if (!out_fence) {
 			ret = -ENOMEM;
 			goto err_request;
@@ -1790,26 +1721,13 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * inactive_list and lose its active reference. Hence we do not need
 	 * to explicitly hold another reference here.
 	 */
-	params->request->batch = params->batch;
-
-	/*
-	 * Save assorted stuff away to pass through to *_submission().
-	 * NB: This data should be 'persistent' and not local as it will
-	 * kept around beyond the duration of the IOCTL once the GPU
-	 * scheduler arrives.
-	 */
-	params->dev                     = dev;
-	params->file                    = file;
-	params->engine                    = engine;
-	params->dispatch_flags          = dispatch_flags;
-	params->ctx                     = ctx;
+	eb.request->batch = eb.batch;
 
-	trace_i915_gem_request_queue(params->request, dispatch_flags);
-
-	ret = execbuf_submit(params, args, &eb->vmas);
+	trace_i915_gem_request_queue(eb.request, eb.dispatch_flags);
+	ret = execbuf_submit(&eb);
 err_request:
-	__i915_add_request(params->request, ret == 0);
-	add_to_client(params->request, file);
+	__i915_add_request(eb.request, ret == 0);
+	add_to_client(eb.request, file);
 
 	if (out_fence) {
 		if (ret == 0) {
@@ -1829,19 +1747,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * needs to be adjusted to also track the ggtt batch vma properly as
 	 * active.
 	 */
-	if (dispatch_flags & I915_DISPATCH_SECURE)
-		i915_vma_unpin(params->batch);
+	if (eb.dispatch_flags & I915_DISPATCH_SECURE)
+		i915_vma_unpin(eb.batch);
 err:
 	/* the request owns the ref now */
-	i915_gem_context_put(ctx);
-	eb_destroy(eb);
-
+	eb_destroy(&eb);
 	mutex_unlock(&dev->struct_mutex);
 
 pre_mutex_err:
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
-	intel_runtime_pm_put(dev_priv);
+	intel_runtime_pm_put(eb.i915);
 	if (out_fence_fd != -1)
 		put_unused_fd(out_fence_fd);
 err_in_fence:
@@ -1912,7 +1828,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 	exec2.flags = I915_EXEC_RENDER;
 	i915_execbuffer2_set_context_id(exec2, 0);
 
-	ret = i915_gem_do_execbuffer(dev, data, file, &exec2, exec2_list);
+	ret = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list);
 	if (!ret) {
 		struct drm_i915_gem_exec_object __user *user_exec_list =
 			u64_to_user_ptr(args->buffers_ptr);
@@ -1971,7 +1887,7 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EFAULT;
 	}
 
-	ret = i915_gem_do_execbuffer(dev, data, file, args, exec2_list);
+	ret = i915_gem_do_execbuffer(dev, file, args, exec2_list);
 	if (!ret) {
 		/* Copy the new buffer offsets back to the user's exec list. */
 		struct drm_i915_gem_exec_object2 __user *user_exec_list =
-- 
2.11.0


* [PATCH 04/15] drm/i915: Use vma->exec_entry as our double-entry placeholder
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (2 preceding siblings ...)
  2017-03-16 13:19 ` [PATCH 03/15] drm/i915: Amalgamate execbuffer parameter structures Chris Wilson
@ 2017-03-16 13:19 ` Chris Wilson
  2017-03-16 13:19 ` [PATCH 05/15] drm/i915: Split vma exec_link/evict_link Chris Wilson
                   ` (14 subsequent siblings)
  18 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx

This has the benefit of not requiring us to manipulate the
vma->exec_link list when tearing down the execbuffer, and it provides a
marginally cheaper test for detecting the user error.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_evict.c      | 17 ++-----
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 77 ++++++++++++++++--------------
 drivers/gpu/drm/i915/i915_vma.c            |  1 -
 3 files changed, 44 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 2da3a94fc9f3..ed34f54baef9 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -59,9 +59,6 @@ mark_free(struct drm_mm_scan *scan,
 	if (i915_vma_is_pinned(vma))
 		return false;
 
-	if (WARN_ON(!list_empty(&vma->exec_list)))
-		return false;
-
 	if (flags & PIN_NONFAULT && !list_empty(&vma->obj->userfault_link))
 		return false;
 
@@ -160,8 +157,6 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
 		ret = drm_mm_scan_remove_block(&scan, &vma->node);
 		BUG_ON(ret);
-
-		INIT_LIST_HEAD(&vma->exec_list);
 	}
 
 	/* Can we unpin some objects such as idle hw contents,
@@ -210,17 +205,12 @@ i915_gem_evict_something(struct i915_address_space *vm,
 		if (drm_mm_scan_remove_block(&scan, &vma->node))
 			__i915_vma_pin(vma);
 		else
-			list_del_init(&vma->exec_list);
+			list_del(&vma->exec_list);
 	}
 
 	/* Unbinding will emit any required flushes */
 	ret = 0;
-	while (!list_empty(&eviction_list)) {
-		vma = list_first_entry(&eviction_list,
-				       struct i915_vma,
-				       exec_list);
-
-		list_del_init(&vma->exec_list);
+	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
@@ -316,7 +306,7 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 		}
 
 		/* Overlap of objects in the same batch? */
-		if (i915_vma_is_pinned(vma) || !list_empty(&vma->exec_list)) {
+		if (i915_vma_is_pinned(vma)) {
 			ret = -ENOSPC;
 			if (vma->exec_entry &&
 			    vma->exec_entry->flags & EXEC_OBJECT_PINNED)
@@ -337,7 +327,6 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 	}
 
 	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
-		list_del_init(&vma->exec_list);
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 05e579c5cf46..cc624e2d794d 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -109,13 +109,40 @@ eb_create(struct i915_execbuffer *eb)
 		eb->and = -eb->args->buffer_count;
 	}
 
-	INIT_LIST_HEAD(&eb->vmas);
 	return 0;
 }
 
+static inline void
+__eb_unreserve_vma(struct i915_vma *vma,
+		   const struct drm_i915_gem_exec_object2 *entry)
+{
+	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
+		i915_vma_unpin_fence(vma);
+
+	if (entry->flags & __EXEC_OBJECT_HAS_PIN)
+		__i915_vma_unpin(vma);
+}
+
+static void
+eb_unreserve_vma(struct i915_vma *vma)
+{
+	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+
+	__eb_unreserve_vma(vma, entry);
+	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
+}
+
 static void
 eb_reset(struct i915_execbuffer *eb)
 {
+	struct i915_vma *vma;
+
+	list_for_each_entry(vma, &eb->vmas, exec_list) {
+		eb_unreserve_vma(vma);
+		i915_vma_put(vma);
+		vma->exec_entry = NULL;
+	}
+
 	if (eb->and >= 0)
 		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
 }
@@ -147,6 +174,8 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 	struct list_head objects;
 	int i, ret;
 
+	INIT_LIST_HEAD(&eb->vmas);
+
 	INIT_LIST_HEAD(&objects);
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
@@ -253,40 +282,23 @@ static struct i915_vma *eb_get_vma(struct i915_execbuffer *eb, unsigned long han
 	}
 }
 
-static void
-eb_unreserve_vma(struct i915_vma *vma)
-{
-	struct drm_i915_gem_exec_object2 *entry;
-
-	if (!drm_mm_node_allocated(&vma->node))
-		return;
-
-	entry = vma->exec_entry;
-
-	if (entry->flags & __EXEC_OBJECT_HAS_FENCE)
-		i915_vma_unpin_fence(vma);
-
-	if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-		__i915_vma_unpin(vma);
-
-	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
-}
-
 static void eb_destroy(struct i915_execbuffer *eb)
 {
-	i915_gem_context_put(eb->ctx);
+	struct i915_vma *vma;
 
-	while (!list_empty(&eb->vmas)) {
-		struct i915_vma *vma;
+	list_for_each_entry(vma, &eb->vmas, exec_list) {
+		if (!vma->exec_entry)
+			continue;
 
-		vma = list_first_entry(&eb->vmas,
-				       struct i915_vma,
-				       exec_list);
-		list_del_init(&vma->exec_list);
-		eb_unreserve_vma(vma);
+		__eb_unreserve_vma(vma, vma->exec_entry);
 		vma->exec_entry = NULL;
 		i915_vma_put(vma);
 	}
+
+	i915_gem_context_put(eb->ctx);
+
+	if (eb->buckets)
+		kfree(eb->buckets);
 }
 
 static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
@@ -984,13 +996,7 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	int i, total, ret;
 
 	/* We may process another execbuffer during the unlock... */
-	while (!list_empty(&eb->vmas)) {
-		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
-		list_del_init(&vma->exec_list);
-		eb_unreserve_vma(vma);
-		i915_vma_put(vma);
-	}
-
+	eb_reset(eb);
 	mutex_unlock(&dev->struct_mutex);
 
 	total = 0;
@@ -1051,7 +1057,6 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	}
 
 	/* reacquire the objects */
-	eb_reset(eb);
 	ret = eb_lookup_vmas(eb);
 	if (ret)
 		goto err;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 1aba47024656..6cf32da682ec 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -85,7 +85,6 @@ vma_create(struct drm_i915_gem_object *obj,
 	if (vma == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&vma->exec_list);
 	for (i = 0; i < ARRAY_SIZE(vma->last_read); i++)
 		init_request_active(&vma->last_read[i], i915_vma_retire);
 	init_request_active(&vma->last_fence, NULL);
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 39+ messages in thread

* [PATCH 05/15] drm/i915: Split vma exec_link/evict_link
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (3 preceding siblings ...)
  2017-03-16 13:19 ` [PATCH 04/15] drm/i915: Use vma->exec_entry as our double-entry placeholder Chris Wilson
@ 2017-03-16 13:19 ` Chris Wilson
  2017-03-16 13:19 ` [PATCH 06/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf Chris Wilson
                   ` (13 subsequent siblings)
  18 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx

Currently the vma has a single link member that is used both for holding
its place in the execbuf reservation list and for its place on any eviction
list. This dual use is quite tricky and error-prone.
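
For illustration (kernel context and <linux/list.h> assumed; the names here
are hypothetical, not the patch's), a minimal sketch of the idea: a single
struct list_head can only sit on one list at a time, so the vma carries one
link per list it may appear on.

struct sketch_vma {
	struct list_head exec_link;	/* place on the execbuf reservation list */
	struct list_head evict_link;	/* place on a local eviction list */
};

static void sketch_track(struct sketch_vma *vma,
			 struct list_head *reservation_list,
			 struct list_head *eviction_list)
{
	/* Both memberships can now coexist without corrupting either list. */
	list_add_tail(&vma->exec_link, reservation_list);
	list_add(&vma->evict_link, eviction_list);
}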

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_evict.c      | 14 ++++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 32 +++++++++++++++---------------
 drivers/gpu/drm/i915/i915_vma.h            |  7 +++++--
 3 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index ed34f54baef9..fd5c75517143 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -62,7 +62,7 @@ mark_free(struct drm_mm_scan *scan,
 	if (flags & PIN_NONFAULT && !list_empty(&vma->obj->userfault_link))
 		return false;
 
-	list_add(&vma->exec_list, unwind);
+	list_add(&vma->evict_link, unwind);
 	return drm_mm_scan_add_block(scan, &vma->node);
 }
 
@@ -154,7 +154,7 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	} while (*++phase);
 
 	/* Nothing found, clean up and bail out! */
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		ret = drm_mm_scan_remove_block(&scan, &vma->node);
 		BUG_ON(ret);
 	}
@@ -201,16 +201,16 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	 * calling unbind (which may remove the active reference
 	 * of any of our objects, thus corrupting the list).
 	 */
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		if (drm_mm_scan_remove_block(&scan, &vma->node))
 			__i915_vma_pin(vma);
 		else
-			list_del(&vma->exec_list);
+			list_del(&vma->evict_link);
 	}
 
 	/* Unbinding will emit any required flushes */
 	ret = 0;
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
@@ -323,10 +323,10 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 		 * reference) another in our eviction list.
 		 */
 		__i915_vma_pin(vma);
-		list_add(&vma->exec_list, &eviction_list);
+		list_add(&vma->evict_link, &eviction_list);
 	}
 
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index cc624e2d794d..0c73642825d0 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -137,7 +137,7 @@ eb_reset(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		eb_unreserve_vma(vma);
 		i915_vma_put(vma);
 		vma->exec_entry = NULL;
@@ -150,7 +150,7 @@ eb_reset(struct i915_execbuffer *eb)
 static struct i915_vma *
 eb_get_batch(struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_list);
+	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_link);
 
 	/*
 	 * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -227,7 +227,7 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 		}
 
 		/* Transfer ownership from the objects list to the vmas list. */
-		list_add_tail(&vma->exec_list, &eb->vmas);
+		list_add_tail(&vma->exec_link, &eb->vmas);
 		list_del_init(&obj->obj_exec_link);
 
 		vma->exec_entry = &eb->exec[i];
@@ -286,7 +286,7 @@ static void eb_destroy(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		if (!vma->exec_entry)
 			continue;
 
@@ -752,7 +752,7 @@ static int eb_relocate(struct i915_execbuffer *eb)
 	struct i915_vma *vma;
 	int ret = 0;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		ret = eb_relocate_vma(vma, eb);
 		if (ret)
 			break;
@@ -904,7 +904,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		struct drm_i915_gem_exec_object2 *entry;
 		bool need_fence, need_mappable;
 
-		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
+		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_link);
 		obj = vma->obj;
 		entry = vma->exec_entry;
 
@@ -919,12 +919,12 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		need_mappable = need_fence || need_reloc_mappable(vma);
 
 		if (entry->flags & EXEC_OBJECT_PINNED)
-			list_move_tail(&vma->exec_list, &pinned_vmas);
+			list_move_tail(&vma->exec_link, &pinned_vmas);
 		else if (need_mappable) {
 			entry->flags |= __EXEC_OBJECT_NEEDS_MAP;
-			list_move(&vma->exec_list, &ordered_vmas);
+			list_move(&vma->exec_link, &ordered_vmas);
 		} else
-			list_move_tail(&vma->exec_list, &ordered_vmas);
+			list_move_tail(&vma->exec_link, &ordered_vmas);
 
 		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
 		obj->base.pending_write_domain = 0;
@@ -949,7 +949,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		int ret = 0;
 
 		/* Unbind any ill-fitting objects or pin. */
-		list_for_each_entry(vma, &eb->vmas, exec_list) {
+		list_for_each_entry(vma, &eb->vmas, exec_link) {
 			if (!drm_mm_node_allocated(&vma->node))
 				continue;
 
@@ -962,7 +962,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		}
 
 		/* Bind fresh objects */
-		list_for_each_entry(vma, &eb->vmas, exec_list) {
+		list_for_each_entry(vma, &eb->vmas, exec_link) {
 			if (drm_mm_node_allocated(&vma->node))
 				continue;
 
@@ -976,7 +976,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 			return ret;
 
 		/* Decrement pin count for bound objects */
-		list_for_each_entry(vma, &eb->vmas, exec_list)
+		list_for_each_entry(vma, &eb->vmas, exec_link)
 			eb_unreserve_vma(vma);
 
 		ret = i915_gem_evict_vm(eb->vm, true);
@@ -1065,7 +1065,7 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	if (ret)
 		goto err;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		int idx = vma->exec_entry - eb->exec;
 
 		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[idx]);
@@ -1091,7 +1091,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 	struct i915_vma *vma;
 	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
@@ -1313,7 +1313,7 @@ eb_move_to_active(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		obj->base.write_domain = obj->base.pending_write_domain;
@@ -1387,7 +1387,7 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 		memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
 	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
 	i915_gem_object_get(shadow_batch_obj);
-	list_add_tail(&vma->exec_list, &eb->vmas);
+	list_add_tail(&vma->exec_link, &eb->vmas);
 
 out:
 	i915_gem_object_unpin_pages(shadow_batch_obj);
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 2e03f81dddbe..4d827300d1a8 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -100,8 +100,11 @@ struct i915_vma {
 	struct list_head obj_link; /* Link in the object's VMA list */
 	struct rb_node obj_node;
 
-	/** This vma's place in the batchbuffer or on the eviction list */
-	struct list_head exec_list;
+	/** This vma's place in the execbuf reservation list */
+	struct list_head exec_link;
+
+	/** This vma's place in the eviction list */
+	struct list_head evict_link;
 
 	/**
 	 * Used for performing relocations during execbuffer insertion.
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 39+ messages in thread

* [PATCH 06/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (4 preceding siblings ...)
  2017-03-16 13:19 ` [PATCH 05/15] drm/i915: Split vma exec_link/evict_link Chris Wilson
@ 2017-03-16 13:19 ` Chris Wilson
  2017-03-16 13:19 ` [PATCH 07/15] drm/i915: Store a direct lookup from object handle to vma Chris Wilson
                   ` (12 subsequent siblings)
  18 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx

i915_gem_stolen_list_info() sneakily takes advantage of obj->obj_exec_link
to save itself from having to allocate. Enough of the subterfuge: just
allocate an array of pointers and sort that instead of the list.
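
For illustration (kernel context and <linux/sort.h> assumed; the struct and
field names are hypothetical), the pattern is to gather pointers into an
array and hand sort() a comparator that dereferences the array slots:

struct sketch_obj {
	u64 start;	/* hypothetical sort key */
};

/* sort() passes pointers to the array slots, which themselves hold
 * pointers, hence the double dereference.
 */
static int sketch_cmp(const void *A, const void *B)
{
	const struct sketch_obj *a = *(const struct sketch_obj **)A;
	const struct sketch_obj *b = *(const struct sketch_obj **)B;

	if (a->start < b->start)
		return -1;
	if (a->start > b->start)
		return 1;
	return 0;
}

/* usage: sort(objects, count, sizeof(*objects), sketch_cmp, NULL); */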

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 60 +++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 505c402cacaa..27f314b1d683 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -27,7 +27,7 @@
  */
 
 #include <linux/debugfs.h>
-#include <linux/list_sort.h>
+#include <linux/sort.h>
 #include "intel_drv.h"
 
 static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
@@ -204,13 +204,12 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
 		seq_printf(m, " (frontbuffer: 0x%03x)", frontbuffer_bits);
 }
 
-static int obj_rank_by_stolen(void *priv,
-			      struct list_head *A, struct list_head *B)
+static int obj_rank_by_stolen(const void *A, const void *B)
 {
-	struct drm_i915_gem_object *a =
-		container_of(A, struct drm_i915_gem_object, obj_exec_link);
-	struct drm_i915_gem_object *b =
-		container_of(B, struct drm_i915_gem_object, obj_exec_link);
+	const struct drm_i915_gem_object *a =
+		*(const struct drm_i915_gem_object **)A;
+	const struct drm_i915_gem_object *b =
+		*(const struct drm_i915_gem_object **)B;
 
 	if (a->stolen->start < b->stolen->start)
 		return -1;
@@ -223,49 +222,60 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data)
 {
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
 	struct drm_device *dev = &dev_priv->drm;
+	struct drm_i915_gem_object **objects;
 	struct drm_i915_gem_object *obj;
 	u64 total_obj_size, total_gtt_size;
-	LIST_HEAD(stolen);
-	int count, ret;
+	unsigned long total, count, n;
+	int ret;
+
+	total = READ_ONCE(dev_priv->mm.object_count);
+	objects = drm_malloc_ab(total, sizeof(*objects));
+	if (!objects)
+		return -ENOMEM;
 
 	ret = mutex_lock_interruptible(&dev->struct_mutex);
 	if (ret)
-		return ret;
+		goto out;
 
 	total_obj_size = total_gtt_size = count = 0;
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
+		if (count == total)
+			break;
+
 		if (obj->stolen == NULL)
 			continue;
 
-		list_add(&obj->obj_exec_link, &stolen);
-
+		objects[count++] = obj;
 		total_obj_size += obj->base.size;
 		total_gtt_size += i915_gem_obj_total_ggtt_size(obj);
-		count++;
+
 	}
 	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) {
+		if (count == total)
+			break;
+
 		if (obj->stolen == NULL)
 			continue;
 
-		list_add(&obj->obj_exec_link, &stolen);
-
+		objects[count++] = obj;
 		total_obj_size += obj->base.size;
-		count++;
 	}
-	list_sort(NULL, &stolen, obj_rank_by_stolen);
+
+	sort(objects, count, sizeof(*objects), obj_rank_by_stolen, NULL);
+
 	seq_puts(m, "Stolen:\n");
-	while (!list_empty(&stolen)) {
-		obj = list_first_entry(&stolen, typeof(*obj), obj_exec_link);
+	for (n = 0; n < count; n++) {
 		seq_puts(m, "   ");
-		describe_obj(m, obj);
+		describe_obj(m, objects[n]);
 		seq_putc(m, '\n');
-		list_del_init(&obj->obj_exec_link);
 	}
-	mutex_unlock(&dev->struct_mutex);
-
-	seq_printf(m, "Total %d objects, %llu bytes, %llu GTT size\n",
+	seq_printf(m, "Total %lu objects, %llu bytes, %llu GTT size\n",
 		   count, total_obj_size, total_gtt_size);
-	return 0;
+
+	mutex_unlock(&dev->struct_mutex);
+out:
+	drm_free_large(objects);
+	return ret;
 }
 
 struct file_stats {
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 39+ messages in thread

* [PATCH 07/15] drm/i915: Store a direct lookup from object handle to vma
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (5 preceding siblings ...)
  2017-03-16 13:19 ` [PATCH 06/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf Chris Wilson
@ 2017-03-16 13:19 ` Chris Wilson
  2017-03-22  9:33   ` [PATCH v2] " Chris Wilson
  2017-03-16 13:19 ` [PATCH 08/15] drm/i915: Pass vma to relocate entry Chris Wilson
                   ` (11 subsequent siblings)
  18 siblings, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx

The advent of full-ppgtt led to an extra indirection between the object
and its binding. That extra indirection has a noticeable impact on how
fast we can convert from the user handles to our internal vma for
execbuffer. In order to bypass the extra indirection, we use a
resizable hashtable to jump from the user handle to the per-ctx vma.
rhashtable was considered, but we don't need the online resizing feature
and the extra complexity proved to undermine its usefulness. Instead, we
simply reallocate the hashtable on demand in a background task and
serialize it before iterating.

In non-full-ppgtt modes, multiple files and multiple contexts can share
the same vma. This leads to having multiple possible handle->vma links,
so we only use the first to establish the fast path. The majority of
buffers are not shared and so we should still be able to realise
speedups with multiple clients.
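
As a minimal sketch of the fast path (kernel context and <linux/hash.h>
assumed; the structure and function names are illustrative, not the
patch's), the handle is hashed into a power-of-two array of hlist buckets
and the short chain is walked for a matching handle:

struct sketch_ht {
	struct hlist_head *buckets;	/* 1 << bits heads */
	unsigned int bits;
};

struct sketch_entry {
	struct hlist_node node;
	u32 handle;
};

static struct sketch_entry *
sketch_lookup(const struct sketch_ht *ht, u32 handle)
{
	struct hlist_head *head = &ht->buckets[hash_32(handle, ht->bits)];
	struct sketch_entry *e;

	hlist_for_each_entry(e, head, node)
		if (e->handle == handle)
			return e;

	return NULL;	/* miss: fall back to the idr slow path */
}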

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |   5 +
 drivers/gpu/drm/i915/i915_drv.h               |   2 +-
 drivers/gpu/drm/i915/i915_gem.c               |   5 +-
 drivers/gpu/drm/i915/i915_gem_context.c       |  73 ++++++++
 drivers/gpu/drm/i915/i915_gem_context.h       |   8 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c    | 245 +++++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_gtt.h           |   1 +
 drivers/gpu/drm/i915/i915_gem_object.h        |   4 +-
 drivers/gpu/drm/i915/i915_vma.c               |  20 +++
 drivers/gpu/drm/i915/i915_vma.h               |   8 +-
 drivers/gpu/drm/i915/selftests/mock_context.c |  11 +-
 11 files changed, 271 insertions(+), 111 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 27f314b1d683..123f5ea54748 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1986,6 +1986,11 @@ static int i915_context_status(struct seq_file *m, void *unused)
 			seq_putc(m, '\n');
 		}
 
+		seq_printf(m, "\tvma hashtable size=%u (actual %u), count=%u\n",
+			   ctx->vma.ht_size,
+			   1 << ctx->vma.ht_bits,
+			   ctx->vma.ht_count);
+
 		seq_putc(m, '\n');
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3c9551147e28..bae251806fb6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -37,7 +37,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/backlight.h>
-#include <linux/hashtable.h>
+#include <linux/hash.h>
 #include <linux/intel-iommu.h>
 #include <linux/kref.h>
 #include <linux/pm_qos.h>
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d87983ba536f..1b6539a91625 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3105,6 +3105,10 @@ void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
 		if (vma->vm->file == fpriv)
 			i915_vma_close(vma);
 
+	vma = obj->vma_hashed;
+	if (vma && vma->ctx->file_priv == fpriv)
+		i915_vma_unlink_ctx(vma);
+
 	if (i915_gem_object_is_active(obj) &&
 	    !i915_gem_object_has_active_reference(obj)) {
 		i915_gem_object_set_active_reference(obj);
@@ -4053,7 +4057,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
 	INIT_LIST_HEAD(&obj->global_link);
 	INIT_LIST_HEAD(&obj->userfault_link);
-	INIT_LIST_HEAD(&obj->obj_exec_link);
 	INIT_LIST_HEAD(&obj->vma_list);
 	INIT_LIST_HEAD(&obj->batch_pool_link);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index baceca14f5e0..9a9a8afff1e9 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -85,6 +85,7 @@
  *
  */
 
+#include <linux/log2.h>
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
@@ -92,6 +93,9 @@
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
 
+/* Initial size (as log2) to preallocate the handle->object hashtable */
+#define VMA_HT_BITS 2u /* 4 x 2 pointers, 64 bytes minimum */
+
 static int get_context_size(struct drm_i915_private *dev_priv)
 {
 	int ret;
@@ -119,6 +123,64 @@ static int get_context_size(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
+static void resize_vma_ht(struct work_struct *work)
+{
+	struct i915_gem_context *ctx =
+		container_of(work, typeof(*ctx), vma.resize);
+	unsigned int size, bits, new_bits, i;
+	struct hlist_head *new_ht;
+
+	bits = 1 + ilog2(4*ctx->vma.ht_count/3);
+	new_bits = min_t(unsigned int,
+			 max(bits, VMA_HT_BITS),
+			 sizeof(unsigned int)*8);
+	if (new_bits == ctx->vma.ht_bits)
+		goto out;
+
+	new_ht = kzalloc(sizeof(*new_ht)<<new_bits, GFP_KERNEL | __GFP_NOWARN);
+	if (!new_ht)
+		new_ht = vzalloc(sizeof(*new_ht)<<new_bits);
+	if (!new_ht)
+		/* pretend the resize succeeded and stop calling us for a bit! */
+		goto out;
+
+	size = 1 << ctx->vma.ht_bits;
+	for (i = 0; i < size; i++) {
+		struct i915_vma *vma;
+		struct hlist_node *tmp;
+
+		hlist_for_each_entry_safe(vma, tmp, &ctx->vma.ht[i], ctx_node)
+			hlist_add_head(&vma->ctx_node,
+				       &new_ht[hash_32(vma->ctx_handle,
+						       new_bits)]);
+	}
+	kvfree(ctx->vma.ht);
+	ctx->vma.ht = new_ht;
+	ctx->vma.ht_bits = new_bits;
+	smp_wmb();
+out:
+	ctx->vma.ht_size = 1 << bits;
+}
+
+static void decouple_vma(struct i915_gem_context *ctx)
+{
+	unsigned int i, size;
+
+	if (ctx->vma.ht_size & 1)
+		cancel_work_sync(&ctx->vma.resize);
+
+	size = 1 << ctx->vma.ht_bits;
+	for (i = 0; i < size; i++) {
+		struct i915_vma *vma;
+
+		hlist_for_each_entry(vma, &ctx->vma.ht[i], ctx_node) {
+			vma->obj->vma_hashed = NULL;
+			vma->ctx = NULL;
+		}
+	}
+	kvfree(ctx->vma.ht);
+}
+
 void i915_gem_context_free(struct kref *ctx_ref)
 {
 	struct i915_gem_context *ctx = container_of(ctx_ref, typeof(*ctx), ref);
@@ -128,6 +190,7 @@ void i915_gem_context_free(struct kref *ctx_ref)
 	trace_i915_context_free(ctx);
 	GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
 
+	decouple_vma(ctx);
 	i915_ppgtt_put(ctx->ppgtt);
 
 	for (i = 0; i < I915_NUM_ENGINES; i++) {
@@ -145,6 +208,7 @@ void i915_gem_context_free(struct kref *ctx_ref)
 
 	kfree(ctx->name);
 	put_pid(ctx->pid);
+
 	list_del(&ctx->link);
 
 	ida_simple_remove(&ctx->i915->context_hw_ida, ctx->hw_id);
@@ -266,6 +330,15 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	list_add_tail(&ctx->link, &dev_priv->context_list);
 	ctx->i915 = dev_priv;
 
+	ctx->vma.ht_bits = VMA_HT_BITS;
+	ctx->vma.ht_size = 1 << ctx->vma.ht_bits;
+	ctx->vma.ht = kzalloc(sizeof(*ctx->vma.ht)*ctx->vma.ht_size,
+			      GFP_KERNEL);
+	if (!ctx->vma.ht)
+		goto err_out;
+
+	INIT_WORK(&ctx->vma.resize, resize_vma_ht);
+
 	if (dev_priv->hw_context_size) {
 		struct drm_i915_gem_object *obj;
 		struct i915_vma *vma;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 81268c9770a6..ecdf3e92dac2 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -143,6 +143,14 @@ struct i915_gem_context {
 	/** ggtt_offset_bias: placement restriction for context objects */
 	u32 ggtt_offset_bias;
 
+	struct {
+		struct work_struct resize;
+		struct hlist_head *ht;
+		unsigned int ht_bits;
+		unsigned int ht_size;
+		unsigned int ht_count;
+	} vma;
+
 	/** engine: per-engine logical HW state */
 	struct intel_context {
 		struct i915_vma *state;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0c73642825d0..590a93c48574 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -75,38 +75,33 @@ struct i915_execbuffer {
 		unsigned int page;
 		bool use_64bit_reloc;
 	} reloc_cache;
-	int and;
-	union {
-		struct i915_vma **lut;
-		struct hlist_head *buckets;
-	};
+	int lut_mask;
+	struct hlist_head *buckets;
 };
 
 static int
 eb_create(struct i915_execbuffer *eb)
 {
-	eb->lut = NULL;
-	if (eb->args->flags & I915_EXEC_HANDLE_LUT) {
-		unsigned int size = eb->args->buffer_count;
-		size *= sizeof(struct i915_vma *);
-		eb->lut = kmalloc(size,
-				  GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
-	}
-
-	if (!eb->lut) {
-		unsigned int size = eb->args->buffer_count;
-		unsigned int count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
-		BUILD_BUG_ON_NOT_POWER_OF_2(PAGE_SIZE / sizeof(struct hlist_head));
-		while (count > 2*size)
-			count >>= 1;
-		eb->lut = kzalloc(count*sizeof(struct hlist_head),
-				  GFP_TEMPORARY);
-		if (!eb->lut)
-			return -ENOMEM;
-
-		eb->and = count - 1;
+	if ((eb->args->flags & I915_EXEC_HANDLE_LUT) == 0) {
+		unsigned int size = 1 + ilog2(eb->args->buffer_count);
+
+		do {
+			eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
+					     GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+			if (eb->buckets)
+				break;
+		} while (--size);
+
+		if (unlikely(!eb->buckets)) {
+			eb->buckets = kzalloc(sizeof(struct hlist_head),
+					      GFP_TEMPORARY);
+			if (unlikely(!eb->buckets))
+				return -ENOMEM;
+		}
+
+		eb->lut_mask = size;
 	} else {
-		eb->and = -eb->args->buffer_count;
+		eb->lut_mask = -eb->args->buffer_count;
 	}
 
 	return 0;
@@ -143,73 +138,104 @@ eb_reset(struct i915_execbuffer *eb)
 		vma->exec_entry = NULL;
 	}
 
-	if (eb->and >= 0)
-		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
+	if (eb->lut_mask >= 0)
+		memset(eb->buckets, 0,
+		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
 }
 
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
+#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+
+static bool
+eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
 {
-	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_link);
+	if (unlikely(vma->exec_entry)) {
+		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+			  eb->exec[i].handle, i);
+		return false;
+	}
+	list_add_tail(&vma->exec_link, &eb->vmas);
 
-	/*
-	 * SNA is doing fancy tricks with compressing batch buffers, which leads
-	 * to negative relocation deltas. Usually that works out ok since the
-	 * relocate address is still positive, except when the batch is placed
-	 * very low in the GTT. Ensure this doesn't happen.
-	 *
-	 * Note that actual hangs have only been observed on gen7, but for
-	 * paranoia do it everywhere.
-	 */
-	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
-		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	vma->exec_entry = &eb->exec[i];
+	if (eb->lut_mask >= 0) {
+		vma->exec_handle = eb->exec[i].handle;
+		hlist_add_head(&vma->exec_node,
+			       &eb->buckets[hash_32(vma->exec_handle,
+						    eb->lut_mask)]);
+	}
 
-	return vma;
+	i915_vma_get(vma);
+	eb->exec[i].rsvd2 = (uintptr_t)vma;
+	return true;
+}
+
+static inline struct hlist_head *ht_head(struct i915_gem_context *ctx,
+					 u32 handle)
+{
+	return &ctx->vma.ht[hash_32(handle, ctx->vma.ht_bits)];
 }
 
 static int
 eb_lookup_vmas(struct i915_execbuffer *eb)
 {
-	struct drm_i915_gem_object *obj;
-	struct list_head objects;
-	int i, ret;
+	const int count = eb->args->buffer_count;
+	struct i915_vma *vma;
+	int slow_pass = -1;
+	int i;
 
 	INIT_LIST_HEAD(&eb->vmas);
 
-	INIT_LIST_HEAD(&objects);
+	if (unlikely(eb->ctx->vma.ht_size & 1))
+		flush_work(&eb->ctx->vma.resize);
+	for (i = 0; i < count; i++) {
+		eb->exec[i].rsvd2 = 0;
+
+		hlist_for_each_entry(vma,
+				     ht_head(eb->ctx, eb->exec[i].handle),
+				     ctx_node) {
+			if (vma->ctx_handle != eb->exec[i].handle)
+				continue;
+
+			if (!eb_add_vma(eb, vma, i))
+				return -EINVAL;
+
+			goto next_vma;
+		}
+
+		if (slow_pass < 0)
+			slow_pass = i;
+next_vma: ;
+	}
+
+	if (slow_pass < 0)
+		return 0;
+
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
-	for (i = 0; i < eb->args->buffer_count; i++) {
-		obj = to_intel_bo(idr_find(&eb->file->object_idr, eb->exec[i].handle));
-		if (obj == NULL) {
-			spin_unlock(&eb->file->table_lock);
-			DRM_DEBUG("Invalid object handle %d at index %d\n",
-				   eb->exec[i].handle, i);
-			ret = -ENOENT;
-			goto err;
-		}
+	for (i = slow_pass; i < count; i++) {
+		struct drm_i915_gem_object *obj;
 
-		if (!list_empty(&obj->obj_exec_link)) {
+		if (eb->exec[i].rsvd2)
+			continue;
+
+		obj = to_intel_bo(idr_find(&eb->file->object_idr,
+					   eb->exec[i].handle));
+		if (unlikely(!obj)) {
 			spin_unlock(&eb->file->table_lock);
-			DRM_DEBUG("Object %p [handle %d, index %d] appears more than once in object list\n",
-				   obj, eb->exec[i].handle, i);
-			ret = -EINVAL;
-			goto err;
+			DRM_DEBUG("Invalid object handle %d at index %d\n",
+				  eb->exec[i].handle, i);
+			return -ENOENT;
 		}
 
-		i915_gem_object_get(obj);
-		list_add_tail(&obj->obj_exec_link, &objects);
+		eb->exec[i].rsvd2 = 1 | (uintptr_t)obj;
 	}
 	spin_unlock(&eb->file->table_lock);
 
-	i = 0;
-	while (!list_empty(&objects)) {
-		struct i915_vma *vma;
+	for (i = slow_pass; i < count; i++) {
+		struct drm_i915_gem_object *obj;
 
-		obj = list_first_entry(&objects,
-				       struct drm_i915_gem_object,
-				       obj_exec_link);
+		if ((eb->exec[i].rsvd2 & 1) == 0)
+			continue;
 
 		/*
 		 * NOTE: We can leak any vmas created here when something fails
@@ -219,61 +245,72 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
+		obj = to_ptr(struct drm_i915_gem_object, eb->exec[i].rsvd2 & ~1);
 		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
-			ret = PTR_ERR(vma);
-			goto err;
+			return PTR_ERR(vma);
 		}
 
-		/* Transfer ownership from the objects list to the vmas list. */
-		list_add_tail(&vma->exec_link, &eb->vmas);
-		list_del_init(&obj->obj_exec_link);
-
-		vma->exec_entry = &eb->exec[i];
-		if (eb->and < 0) {
-			eb->lut[i] = vma;
-		} else {
-			u32 handle =
-				eb->args->flags & I915_EXEC_HANDLE_LUT ?
-				i : eb->exec[i].handle;
-			vma->exec_handle = handle;
-			hlist_add_head(&vma->exec_node,
-				       &eb->buckets[handle & eb->and]);
+		/* First come, first served */
+		if (!vma->ctx) {
+			vma->ctx = eb->ctx;
+			vma->ctx_handle = eb->exec[i].handle;
+			hlist_add_head(&vma->ctx_node,
+				       ht_head(eb->ctx, eb->exec[i].handle));
+			eb->ctx->vma.ht_count++;
+			if (i915_vma_is_ggtt(vma)) {
+				GEM_BUG_ON(obj->vma_hashed);
+				obj->vma_hashed = vma;
+			}
 		}
-		++i;
+
+		if (!eb_add_vma(eb, vma, i))
+			return -EINVAL;
+	}
+	if (4*eb->ctx->vma.ht_count > 3*eb->ctx->vma.ht_size ||
+	    4*eb->ctx->vma.ht_count < eb->ctx->vma.ht_size) {
+		eb->ctx->vma.ht_size |= 1;
+		queue_work(system_highpri_wq, &eb->ctx->vma.resize);
 	}
 
 	return 0;
+}
 
+static struct i915_vma *
+eb_get_batch(struct i915_execbuffer *eb)
+{
+	struct i915_vma *vma;
+
+	vma = to_ptr(struct i915_vma, eb->exec[eb->args->buffer_count-1].rsvd2);
 
-err:
-	while (!list_empty(&objects)) {
-		obj = list_first_entry(&objects,
-				       struct drm_i915_gem_object,
-				       obj_exec_link);
-		list_del_init(&obj->obj_exec_link);
-		i915_gem_object_put(obj);
-	}
 	/*
-	 * Objects already transfered to the vmas list will be unreferenced by
-	 * eb_destroy.
+	 * SNA is doing fancy tricks with compressing batch buffers, which leads
+	 * to negative relocation deltas. Usually that works out ok since the
+	 * relocate address is still positive, except when the batch is placed
+	 * very low in the GTT. Ensure this doesn't happen.
+	 *
+	 * Note that actual hangs have only been observed on gen7, but for
+	 * paranoia do it everywhere.
 	 */
+	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
+		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
 
-	return ret;
+	return vma;
 }
 
-static struct i915_vma *eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
+static struct i915_vma *
+eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 {
-	if (eb->and < 0) {
-		if (handle >= -eb->and)
+	if (eb->lut_mask < 0) {
+		if (handle >= -eb->lut_mask)
 			return NULL;
-		return eb->lut[handle];
+		return to_ptr(struct i915_vma, eb->exec[handle].rsvd2);
 	} else {
 		struct hlist_head *head;
 		struct i915_vma *vma;
 
-		head = &eb->buckets[handle & eb->and];
+		head = &eb->buckets[hash_32(handle, eb->lut_mask)];
 		hlist_for_each_entry(vma, head, exec_node) {
 			if (vma->exec_handle == handle)
 				return vma;
@@ -297,7 +334,7 @@ static void eb_destroy(struct i915_execbuffer *eb)
 
 	i915_gem_context_put(eb->ctx);
 
-	if (eb->buckets)
+	if (eb->lut_mask >= 0)
 		kfree(eb->buckets);
 }
 
@@ -915,7 +952,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 			entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
 		need_fence =
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
-			i915_gem_object_is_tiled(obj);
+			i915_gem_object_is_tiled(vma->obj);
 		need_mappable = need_fence || need_reloc_mappable(vma);
 
 		if (entry->flags & EXEC_OBJECT_PINNED)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index fb15684c1d83..948b2e68131a 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -196,6 +196,7 @@ struct i915_ggtt_view {
 	};
 };
 
+struct i915_gem_context;
 enum i915_cache_level;
 
 struct i915_vma;
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 174cf923c236..5093e065b9a6 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -71,6 +71,7 @@ struct drm_i915_gem_object {
 	/** List of VMAs backed by this object */
 	struct list_head vma_list;
 	struct rb_root vma_tree;
+	struct i915_vma *vma_hashed;
 
 	/** Stolen memory for this object, instead of being backed by shmem. */
 	struct drm_mm_node *stolen;
@@ -85,9 +86,6 @@ struct drm_i915_gem_object {
 	 */
 	struct list_head userfault_link;
 
-	/** Used in execbuf to temporarily hold a ref */
-	struct list_head obj_exec_link;
-
 	struct list_head batch_pool_link;
 	I915_SELFTEST_DECLARE(struct list_head st_link);
 
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 6cf32da682ec..507463f144c9 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -590,11 +590,31 @@ void i915_vma_destroy(struct i915_vma *vma)
 	kmem_cache_free(to_i915(vma->obj->base.dev)->vmas, vma);
 }
 
+void i915_vma_unlink_ctx(struct i915_vma *vma)
+{
+	struct i915_gem_context *ctx = vma->ctx;
+
+	if (ctx->vma.ht_size & 1) {
+		cancel_work_sync(&ctx->vma.resize);
+		ctx->vma.ht_size &= ~1;
+	}
+
+	__hlist_del(&vma->ctx_node);
+	ctx->vma.ht_count--;
+
+	if (i915_vma_is_ggtt(vma))
+		vma->obj->vma_hashed = NULL;
+	vma->ctx = NULL;
+}
+
 void i915_vma_close(struct i915_vma *vma)
 {
 	GEM_BUG_ON(i915_vma_is_closed(vma));
 	vma->flags |= I915_VMA_CLOSED;
 
+	if (vma->ctx)
+		i915_vma_unlink_ctx(vma);
+
 	list_del(&vma->obj_link);
 	rb_erase(&vma->obj_node, &vma->obj->vma_tree);
 
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 4d827300d1a8..88543fafcffc 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -99,6 +99,7 @@ struct i915_vma {
 
 	struct list_head obj_link; /* Link in the object's VMA list */
 	struct rb_node obj_node;
+	struct hlist_node obj_hash;
 
 	/** This vma's place in the execbuf reservation list */
 	struct list_head exec_link;
@@ -110,8 +111,12 @@ struct i915_vma {
 	 * Used for performing relocations during execbuffer insertion.
 	 */
 	struct hlist_node exec_node;
-	unsigned long exec_handle;
 	struct drm_i915_gem_exec_object2 *exec_entry;
+	u32 exec_handle;
+
+	struct i915_gem_context *ctx;
+	struct hlist_node ctx_node;
+	u32 ctx_handle;
 };
 
 struct i915_vma *
@@ -235,6 +240,7 @@ bool i915_vma_misplaced(const struct i915_vma *vma,
 			u64 size, u64 alignment, u64 flags);
 void __i915_vma_set_map_and_fenceable(struct i915_vma *vma);
 int __must_check i915_vma_unbind(struct i915_vma *vma);
+void i915_vma_unlink_ctx(struct i915_vma *vma);
 void i915_vma_close(struct i915_vma *vma);
 void i915_vma_destroy(struct i915_vma *vma);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index 8d3a90c3f8ac..4e4615d5e003 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -40,10 +40,17 @@ mock_context(struct drm_i915_private *i915,
 	INIT_LIST_HEAD(&ctx->link);
 	ctx->i915 = i915;
 
+	ctx->vma.ht_bits = VMA_HT_BITS;
+	ctx->vma.ht_size = 1 << ctx->vma.ht_bits;
+	ctx->vma.ht = kzalloc(sizeof(*ctx->vma.ht)*ctx->vma.ht_size,
+			      GFP_KERNEL);
+	if (!ctx->vma.ht)
+		goto err_free;
+
 	ret = ida_simple_get(&i915->context_hw_ida,
 			     0, MAX_CONTEXT_HW_ID, GFP_KERNEL);
 	if (ret < 0)
-		goto err_free;
+		goto err_vma_ht;
 	ctx->hw_id = ret;
 
 	if (name) {
@@ -58,6 +65,8 @@ mock_context(struct drm_i915_private *i915,
 
 	return ctx;
 
+err_vma_ht:
+	kvfree(ctx->vma.ht);
 err_free:
 	kfree(ctx);
 	return NULL;
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 39+ messages in thread

* [PATCH 08/15] drm/i915: Pass vma to relocate entry
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (6 preceding siblings ...)
  2017-03-16 13:19 ` [PATCH 07/15] drm/i915: Store a direct lookup from object handle to vma Chris Wilson
@ 2017-03-16 13:19 ` Chris Wilson
  2017-03-21 14:17   ` Joonas Lahtinen
  2017-03-16 13:20 ` [PATCH 09/15] drm/i915: Eliminate lots of iterations over the execobjects array Chris Wilson
                   ` (10 subsequent siblings)
  18 siblings, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:19 UTC (permalink / raw)
  To: intel-gfx

We can simplify our tracking of pending writes in an execbuf to a single
bit in vma->exec_entry->flags, but that requires the relocation function
to know the object's vma. Pass it along.
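
As a sketch (illustrative, condensed from the diff below), the pending-write
bookkeeping reduces to setting one bit on the target's execobject entry
while processing relocations, which later passes simply test:

	/* while walking the relocations */
	if (reloc->write_domain)
		target->exec_entry->flags |= EXEC_OBJECT_WRITE;

	/* later, when serialising and retiring the request */
	const bool writes = vma->exec_entry->flags & EXEC_OBJECT_WRITE;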

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 100 ++++++++++++-----------------
 1 file changed, 41 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 590a93c48574..3cfad68fc452 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -604,42 +604,25 @@ relocate_entry(struct drm_i915_gem_object *obj,
 }
 
 static int
-eb_relocate_entry(struct drm_i915_gem_object *obj,
+eb_relocate_entry(struct i915_vma *vma,
 		  struct i915_execbuffer *eb,
 		  struct drm_i915_gem_relocation_entry *reloc)
 {
-	struct drm_gem_object *target_obj;
-	struct drm_i915_gem_object *target_i915_obj;
-	struct i915_vma *target_vma;
-	uint64_t target_offset;
+	struct i915_vma *target;
+	u64 target_offset;
 	int ret;
 
 	/* we've already hold a reference to all valid objects */
-	target_vma = eb_get_vma(eb, reloc->target_handle);
-	if (unlikely(target_vma == NULL))
+	target = eb_get_vma(eb, reloc->target_handle);
+	if (unlikely(!target))
 		return -ENOENT;
-	target_i915_obj = target_vma->obj;
-	target_obj = &target_vma->obj->base;
-
-	target_offset = gen8_canonical_addr(target_vma->node.start);
-
-	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
-	 * pipe_control writes because the gpu doesn't properly redirect them
-	 * through the ppgtt for non_secure batchbuffers. */
-	if (unlikely(IS_GEN6(eb->i915) &&
-		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
-		ret = i915_vma_bind(target_vma, target_i915_obj->cache_level,
-				    PIN_GLOBAL);
-		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
-			return ret;
-	}
 
 	/* Validate that the target is in a valid r/w GPU domain */
 	if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) {
 		DRM_DEBUG("reloc with multiple write domains: "
-			  "obj %p target %d offset %d "
+			  "target %d offset %d "
 			  "read %08x write %08x",
-			  obj, reloc->target_handle,
+			  reloc->target_handle,
 			  (int) reloc->offset,
 			  reloc->read_domains,
 			  reloc->write_domain);
@@ -648,43 +631,56 @@ eb_relocate_entry(struct drm_i915_gem_object *obj,
 	if (unlikely((reloc->write_domain | reloc->read_domains)
 		     & ~I915_GEM_GPU_DOMAINS)) {
 		DRM_DEBUG("reloc with read/write non-GPU domains: "
-			  "obj %p target %d offset %d "
+			  "target %d offset %d "
 			  "read %08x write %08x",
-			  obj, reloc->target_handle,
+			  reloc->target_handle,
 			  (int) reloc->offset,
 			  reloc->read_domains,
 			  reloc->write_domain);
 		return -EINVAL;
 	}
 
-	target_obj->pending_read_domains |= reloc->read_domains;
-	target_obj->pending_write_domain |= reloc->write_domain;
+	if (reloc->write_domain)
+		target->exec_entry->flags |= EXEC_OBJECT_WRITE;
+
+	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
+	 * pipe_control writes because the gpu doesn't properly redirect them
+	 * through the ppgtt for non_secure batchbuffers.
+	 */
+	if (unlikely(IS_GEN6(eb->i915) &&
+		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
+		ret = i915_vma_bind(target, target->obj->cache_level,
+				    PIN_GLOBAL);
+		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
+			return ret;
+	}
 
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
+	target_offset = gen8_canonical_addr(target->node.start);
 	if (target_offset == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
 	if (unlikely(reloc->offset >
-		     obj->base.size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
+		     vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
 		DRM_DEBUG("Relocation beyond object bounds: "
-			  "obj %p target %d offset %d size %d.\n",
-			  obj, reloc->target_handle,
-			  (int) reloc->offset,
-			  (int) obj->base.size);
+			  "target %d offset %d size %d.\n",
+			  reloc->target_handle,
+			  (int)reloc->offset,
+			  (int)vma->size);
 		return -EINVAL;
 	}
 	if (unlikely(reloc->offset & 3)) {
 		DRM_DEBUG("Relocation not 4-byte aligned: "
-			  "obj %p target %d offset %d.\n",
-			  obj, reloc->target_handle,
-			  (int) reloc->offset);
+			  "target %d offset %d.\n",
+			  reloc->target_handle,
+			  (int)reloc->offset);
 		return -EINVAL;
 	}
 
-	ret = relocate_entry(obj, reloc, &eb->reloc_cache, target_offset);
+	ret = relocate_entry(vma->obj, reloc, &eb->reloc_cache, target_offset);
 	if (ret)
 		return ret;
 
@@ -730,7 +726,7 @@ static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
 		do {
 			u64 offset = r->presumed_offset;
 
-			ret = eb_relocate_entry(vma->obj, eb, r);
+			ret = eb_relocate_entry(vma, eb, r);
 			if (ret)
 				goto out;
 
@@ -776,7 +772,7 @@ eb_relocate_vma_slow(struct i915_vma *vma,
 	int i, ret = 0;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = eb_relocate_entry(vma->obj, eb, &relocs[i]);
+		ret = eb_relocate_entry(vma, eb, &relocs[i]);
 		if (ret)
 			break;
 	}
@@ -809,7 +805,6 @@ eb_reserve_vma(struct i915_vma *vma,
 	       struct intel_engine_cs *engine,
 	       bool *need_reloc)
 {
-	struct drm_i915_gem_object *obj = vma->obj;
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	uint64_t flags;
 	int ret;
@@ -863,11 +858,6 @@ eb_reserve_vma(struct i915_vma *vma,
 		*need_reloc = true;
 	}
 
-	if (entry->flags & EXEC_OBJECT_WRITE) {
-		obj->base.pending_read_domains = I915_GEM_DOMAIN_RENDER;
-		obj->base.pending_write_domain = I915_GEM_DOMAIN_RENDER;
-	}
-
 	return 0;
 }
 
@@ -929,7 +919,6 @@ eb_vma_misplaced(struct i915_vma *vma)
 static int eb_reserve(struct i915_execbuffer *eb)
 {
 	const bool has_fenced_gpu_access = INTEL_GEN(eb->i915) < 4;
-	struct drm_i915_gem_object *obj;
 	struct i915_vma *vma;
 	struct list_head ordered_vmas;
 	struct list_head pinned_vmas;
@@ -942,7 +931,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		bool need_fence, need_mappable;
 
 		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_link);
-		obj = vma->obj;
 		entry = vma->exec_entry;
 
 		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
@@ -962,9 +950,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 			list_move(&vma->exec_link, &ordered_vmas);
 		} else
 			list_move_tail(&vma->exec_link, &ordered_vmas);
-
-		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
-		obj->base.pending_write_domain = 0;
 	}
 	list_splice(&ordered_vmas, &eb->vmas);
 	list_splice(&pinned_vmas, &eb->vmas);
@@ -1152,7 +1137,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 		}
 
 		ret = i915_gem_request_await_object
-			(eb->request, obj, obj->base.pending_write_domain);
+			(eb->request, obj, vma->exec_entry->flags & EXEC_OBJECT_WRITE);
 		if (ret)
 			return ret;
 	}
@@ -1353,12 +1338,10 @@ eb_move_to_active(struct i915_execbuffer *eb)
 	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
-		obj->base.write_domain = obj->base.pending_write_domain;
-		if (obj->base.write_domain)
-			vma->exec_entry->flags |= EXEC_OBJECT_WRITE;
-		else
-			obj->base.pending_read_domains |= obj->base.read_domains;
-		obj->base.read_domains = obj->base.pending_read_domains;
+		obj->base.write_domain = 0;
+		if (vma->exec_entry->flags & EXEC_OBJECT_WRITE)
+			obj->base.read_domains = 0;
+		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
 
 		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
 		eb_export_fence(obj, eb->request, vma->exec_entry->flags);
@@ -1669,7 +1652,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	}
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (eb.batch->obj->base.pending_write_domain) {
+	if (eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
 		goto err;
@@ -1706,7 +1689,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		}
 	}
 
-	eb.batch->obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
 	if (eb.batch_len == 0)
 		eb.batch_len = eb.batch->size - eb.batch_start_offset;
 
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 39+ messages in thread

* [PATCH 09/15] drm/i915: Eliminate lots of iterations over the execobjects array
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (7 preceding siblings ...)
  2017-03-16 13:19 ` [PATCH 08/15] drm/i915: Pass vma to relocate entry Chris Wilson
@ 2017-03-16 13:20 ` Chris Wilson
  2017-03-16 13:20 ` [PATCH 10/15] drm/i915: First try the previous execbuffer location Chris Wilson
                   ` (9 subsequent siblings)
  18 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:20 UTC (permalink / raw)
  To: intel-gfx

The major scaling bottleneck in execbuffer is the processing of the
execobjects. Creating an auxiliary list is inefficient when compared to
using the execobject array we already have allocated.

Reservation is then split into phases. As we look up each VMA, we try to
bind it back into its active location. Only if that fails do we add it to
the unbound list for phase 2. In phase 2, we try to place all the objects
that could not fit into their previous locations, falling back to retrying
all objects and evicting the VM in case of severe fragmentation. (This is
the same as before, except that phase 1 is now done inline with looking up
the VMA to avoid an extra iteration over the execobject array; in the ideal
case we eliminate the separate reservation phase.) During the reservation
phase, we only evict from the VM between passes (rather than, as currently,
whenever we try to fit each new VMA). In testing with Unreal Engine's
Atlantis demo, which stresses the eviction logic on gen7-class hardware,
this speeds up the framerate by a factor of 2.
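
A rough sketch of that control flow (pseudo-kernel C; the eb layout and the
sketch_* helpers are hypothetical, only i915_gem_evict_vm() is from the
patch):

/* Phase 1 runs inline with the vma lookup: anything that cannot keep its
 * old location is parked on eb->unbound. Phase 2 below then retries the
 * leftovers, escalating between passes.
 */
static int sketch_reserve(struct sketch_eb *eb)
{
	struct sketch_vma *vma;
	int pass, err;

	for (pass = 0; ; pass++) {
		err = 0;
		list_for_each_entry(vma, &eb->unbound, exec_link) {
			err = sketch_pin(eb, vma); /* find space and pin */
			if (err)
				break;
		}
		if (err != -ENOSPC)
			return err;

		switch (pass) {
		case 0:
			/* Unreserve everything and sort the hardest-to-fit
			 * objects to the front before trying again.
			 */
			sketch_unreserve_and_sort(eb);
			break;
		case 1:
			/* Severe fragmentation: empty the whole VM. */
			err = i915_gem_evict_vm(eb->vm);
			if (err)
				return err;
			break;
		default:
			return -ENOSPC;
		}
	}
}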

The second loop amalgamation is between move_to_gpu and move_to_active.
As we always submit the request, even if incomplete, we can use the
current request to track the active VMAs as we perform the flushes and
synchronisation required.

The next big advancement is to avoid copying back to the user any
execobjects and relocations that are not changed.

v2: Add a Theory of Operation spiel.
v3: Fall back to slow relocations in preparation for flushing userptrs.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h                 |    2 +-
 drivers/gpu/drm/i915/i915_gem_evict.c           |   95 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c      | 1683 +++++++++++++----------
 drivers/gpu/drm/i915/i915_vma.c                 |    2 +-
 drivers/gpu/drm/i915/i915_vma.h                 |    1 +
 drivers/gpu/drm/i915/selftests/i915_gem_evict.c |    4 +-
 6 files changed, 1019 insertions(+), 768 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bae251806fb6..4bb397d84923 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3534,7 +3534,7 @@ int __must_check i915_gem_evict_something(struct i915_address_space *vm,
 int __must_check i915_gem_evict_for_node(struct i915_address_space *vm,
 					 struct drm_mm_node *node,
 					 unsigned int flags);
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle);
+int i915_gem_evict_vm(struct i915_address_space *vm);
 
 /* belongs in i915_gem_gtt.h */
 static inline void i915_gem_chipset_flush(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index fd5c75517143..c45d58e8a53d 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -50,6 +50,30 @@ static bool ggtt_is_idle(struct drm_i915_private *dev_priv)
 	return true;
 }
 
+static int ggtt_flush(struct drm_i915_private *i915)
+{
+	int err;
+
+	/* Not everything in the GGTT is tracked via vma (otherwise we
+	 * could evict as required with minimal stalling) so we are forced
+	 * to idle the GPU and explicitly retire outstanding requests in
+	 * the hopes that we can then remove contexts and the like only
+	 * bound by their active reference.
+	 */
+	err = i915_gem_switch_to_kernel_context(i915);
+	if (err)
+		return err;
+
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_INTERRUPTIBLE |
+				     I915_WAIT_LOCKED);
+	if (err)
+		return err;
+
+	i915_gem_retire_requests(i915);
+	return 0;
+}
+
 static bool
 mark_free(struct drm_mm_scan *scan,
 	  struct i915_vma *vma,
@@ -175,23 +199,10 @@ i915_gem_evict_something(struct i915_address_space *vm,
 		return intel_has_pending_fb_unpin(dev_priv) ? -EAGAIN : -ENOSPC;
 	}
 
-	/* Not everything in the GGTT is tracked via vma (otherwise we
-	 * could evict as required with minimal stalling) so we are forced
-	 * to idle the GPU and explicitly retire outstanding requests in
-	 * the hopes that we can then remove contexts and the like only
-	 * bound by their active reference.
-	 */
-	ret = i915_gem_switch_to_kernel_context(dev_priv);
-	if (ret)
-		return ret;
-
-	ret = i915_gem_wait_for_idle(dev_priv,
-				     I915_WAIT_INTERRUPTIBLE |
-				     I915_WAIT_LOCKED);
+	ret = ggtt_flush(dev_priv);
 	if (ret)
 		return ret;
 
-	i915_gem_retire_requests(dev_priv);
 	goto search_again;
 
 found:
@@ -338,10 +349,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 /**
  * i915_gem_evict_vm - Evict all idle vmas from a vm
  * @vm: Address space to cleanse
- * @do_idle: Boolean directing whether to idle first.
  *
- * This function evicts all idles vmas from a vm. If all unpinned vmas should be
- * evicted the @do_idle needs to be set to true.
+ * This function evicts all vmas from a vm.
  *
  * This is used by the execbuf code as a last-ditch effort to defragment the
  * address space.
@@ -349,38 +358,50 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
  * To clarify: This is for freeing up virtual address space, not for freeing
  * memory in e.g. the shrinker.
  */
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle)
+int i915_gem_evict_vm(struct i915_address_space *vm)
 {
+	struct list_head *phases[] = {
+		&vm->inactive_list,
+		&vm->active_list,
+		NULL
+	}, **phase;
+	struct list_head eviction_list;
 	struct i915_vma *vma, *next;
 	int ret;
 
 	lockdep_assert_held(&vm->i915->drm.struct_mutex);
 	trace_i915_gem_evict_vm(vm);
 
-	if (do_idle) {
-		struct drm_i915_private *dev_priv = vm->i915;
-
-		if (i915_is_ggtt(vm)) {
-			ret = i915_gem_switch_to_kernel_context(dev_priv);
-			if (ret)
-				return ret;
-		}
-
-		ret = i915_gem_wait_for_idle(dev_priv,
-					     I915_WAIT_INTERRUPTIBLE |
-					     I915_WAIT_LOCKED);
+	/* Switch back to the default context in order to unpin
+	 * the existing context objects. However, such objects only
+	 * pin themselves inside the global GTT and performing the
+	 * switch otherwise is ineffective.
+	 */
+	if (i915_is_ggtt(vm)) {
+		ret = ggtt_flush(vm->i915);
 		if (ret)
 			return ret;
-
-		i915_gem_retire_requests(dev_priv);
-		WARN_ON(!list_empty(&vm->active_list));
 	}
 
-	list_for_each_entry_safe(vma, next, &vm->inactive_list, vm_link)
-		if (!i915_vma_is_pinned(vma))
-			WARN_ON(i915_vma_unbind(vma));
+	INIT_LIST_HEAD(&eviction_list);
+	phase = phases;
+	do {
+		list_for_each_entry(vma, *phase, vm_link) {
+			if (i915_vma_is_pinned(vma))
+				continue;
+
+			__i915_vma_pin(vma);
+			list_add(&vma->evict_link, &eviction_list);
+		}
+	} while (*++phase);
 
-	return 0;
+	ret = 0;
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
+		__i915_vma_unpin(vma);
+		if (ret == 0)
+			ret = i915_vma_unbind(vma);
+	}
+	return ret;
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 3cfad68fc452..5ce4f2e47eb9 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -42,17 +42,134 @@
 
 #define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
 
-#define  __EXEC_OBJECT_HAS_PIN		(1<<31)
-#define  __EXEC_OBJECT_HAS_FENCE	(1<<30)
-#define  __EXEC_OBJECT_NEEDS_MAP	(1<<29)
-#define  __EXEC_OBJECT_NEEDS_BIAS	(1<<28)
+#define  __EXEC_OBJECT_HAS_PIN		BIT(31)
+#define  __EXEC_OBJECT_HAS_FENCE	BIT(30)
+#define  __EXEC_OBJECT_NEEDS_MAP	BIT(29)
+#define  __EXEC_OBJECT_NEEDS_BIAS	BIT(28)
 #define  __EXEC_OBJECT_INTERNAL_FLAGS (0xf<<28) /* all of the above */
+#define __EB_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
+
+#define __EXEC_HAS_RELOC	BIT(31)
+#define __EXEC_VALIDATED	BIT(30)
+#define UPDATE			PIN_OFFSET_FIXED
 
 #define BATCH_OFFSET_BIAS (256*1024)
 
 #define __I915_EXEC_ILLEGAL_FLAGS \
 	(__I915_EXEC_UNKNOWN_FLAGS | I915_EXEC_CONSTANTS_MASK)
 
+/**
+ * DOC: User command execution
+ *
+ * Userspace submits commands to be executed on the GPU as an instruction
+ * stream within a GEM object we call a batchbuffer. These instructions may
+ * refer to other GEM objects containing auxiliary state such as kernels,
+ * samplers, render targets and even secondary batchbuffers. Userspace does
+ * not know where in the GPU memory these objects reside and so before the
+ * batchbuffer is passed to the GPU for execution, those addresses in the
+ * batchbuffer and auxiliary objects are updated. This is known as relocation,
+ * or patching. To try and avoid having to relocate each object on the next
+ * execution, userspace is told the location of those objects in this pass,
+ * but this remains just a hint as the kernel may choose a new location for
+ * any object in the future.
+ *
+ * Processing an execbuf ioctl is conceptually split up into a few phases.
+ *
+ * 1. Validation - Ensure all the pointers, handles and flags are valid.
+ * 2. Reservation - Assign GPU address space for every object
+ * 3. Relocation - Update any addresses to point to the final locations
+ * 4. Serialisation - Order the request with respect to its dependencies
+ * 5. Construction - Construct a request to execute the batchbuffer
+ * 6. Submission (and, at some point in the future, execution)
+ *
+ * Reserving resources for the execbuf is the most complicated phase. We
+ * neither want to have to migrate the object in the address space, nor do
+ * we want to have to update any relocations pointing to this object. Ideally,
+ * we want to leave the object where it is and for all the existing relocations
+ * to match. If the object is given a new address, or if userspace thinks the
+ * object is elsewhere, we have to parse all the relocation entries and update
+ * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
+ * all the target addresses in all of its objects match the value in the
+ * relocation entries and that they all match the presumed offsets given by the
+ * list of execbuffer objects. Using this knowledge, we know that if we haven't
+ * moved any buffers, all the relocation entries are valid and we can skip
+ * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
+ * hang.) A userspace-side sketch of this contract follows this comment block.
+ *
+ * The reservation is done in multiple phases. First we try to keep any
+ * object already bound in its current location - so long as it meets the
+ * constraints imposed by the new execbuffer. Any object left unbound after the
+ * first pass is then fitted into any available idle space. If an object does
+ * not fit, all objects are removed from the reservation and the process rerun
+ * after sorting the objects into a priority order (more difficult to fit
+ * objects are tried first). Failing that, the entire VM is cleared and we try
+ * to fit the execbuf one last time before concluding that it simply will not
+ * fit.
+ *
+ * A small complication to all of this is that we allow userspace not only to
+ * specify an alignment and a size for the object in the address space, but
+ * we also allow userspace to specify the exact offset. Such objects are
+ * simpler to place (the location is known a priori); all we have to do is
+ * make sure the space is available.
+ *
+ * Once all the objects are in place, patching up the buried pointers to point
+ * to the final locations is a fairly simple job of walking over the relocation
+ * entry arrays, looking up the right address and rewriting the value into
+ * the object. Simple! ... The relocation entries are stored in user memory
+ * and so to access them we have to copy them into a local buffer. That copy
+ * has to avoid taking any pagefaults as they may lead back to a GEM object
+ * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
+ * the relocation into multiple passes. First we try to do everything within an
+ * atomic context (avoid the pagefaults) which requires that we never wait. If
+ * we detect that we may wait, or if we need to fault, then we have to fall back
+ * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm
+ * bells yet?) Dropping the mutex means that we lose all the state we have
+ * built up so far for the execbuf and we must reset any global data. However,
+ * we do leave the objects pinned in their final locations - which is a
+ * potential issue for concurrent execbufs. Once we have left the mutex, we can
+ * allocate and copy all the relocation entries into a large array at our
+ * leisure, reacquire the mutex, reclaim all the objects and other state and
+ * then proceed to update any incorrect addresses with the objects.
+ *
+ * As we process the relocation entries, we maintain a record of whether the
+ * object is being written to. Using NO_RELOC, we expect userspace to provide
+ * this information instead. We also check whether we can skip the relocation
+ * by comparing the expected value inside the relocation entry with the target's
+ * final address. If they differ, we have to map the current object and rewrite
+ * the 4 or 8 byte pointer within.
+ *
+ * Serialising an execbuf is quite simple according to the rules of the GEM
+ * ABI. Execution within each context is ordered by the order of submission.
+ * Writes to any GEM object are in order of submission and are exclusive. Reads
+ * from a GEM object are unordered with respect to other reads, but ordered by
+ * writes. A write submitted after a read cannot occur before the read, and
+ * similarly any read submitted after a write cannot occur before the write.
+ * Writes are ordered between engines such that only one write occurs at any
+ * time (completing any reads beforehand) - using semaphores where available
+ * and CPU serialisation otherwise. Other GEM accesses obey the same rules: any
+ * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
+ * reads before starting, and any read (either using set-domain or pread) must
+ * flush all GPU writes before starting. (Note we only employ a barrier before;
+ * we currently rely on userspace not concurrently starting a new execution
+ * whilst reading or writing to an object. This may be an advantage or not,
+ * depending on how much you trust userspace not to shoot itself in the
+ * foot.) Serialisation may just result in the request being inserted into
+ * a DAG awaiting its turn, but the simplest option is to wait on the CPU until
+ * all dependencies are resolved.
+ *
+ * After all of that, it is just a matter of closing the request and handing it
+ * to the hardware (well, leaving it in a queue to be executed). However, we
+ * also offer the ability for batchbuffers to be run with elevated privileges so
+ * that they can access otherwise hidden registers. (Used to adjust L3 cache
+ * etc.) Before any batch is given extra privileges we must first check that it
+ * contains no nefarious instructions: we check that each instruction is from
+ * our whitelist and that all registers are also from an allowed list. We first
+ * copy the user's batchbuffer to a shadow (so that the user doesn't have
+ * access to it, either by the CPU or GPU as we scan it) and then parse each
+ * instruction. If everything is OK, we set a flag telling the hardware to run
+ * the batchbuffer in trusted mode; otherwise the ioctl is rejected.
+ */
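For orientation, here is a minimal userspace-side sketch of the contract described
above. It is illustrative only and not part of this patch: the handles, the
relocation offset and the choice of flags are placeholders, and it assumes an open
DRM fd plus the usual libdrm/i915 uapi headers on the include path.

#include <stdint.h>
#include <xf86drm.h>	/* drmIoctl() */
#include <i915_drm.h>	/* i915 uapi, via libdrm's include path */

/* Submit a two-object execbuf: one render target and one batch.
 * The batch must be the last entry in the object array.
 */
static int submit(int fd, uint32_t target, uint32_t batch, uint32_t batch_len)
{
	struct drm_i915_gem_relocation_entry reloc = {
		.target_handle = 0,		/* index into obj[], see HANDLE_LUT */
		.offset = 4 * sizeof(uint32_t),	/* dword in the batch holding the pointer */
		.presumed_offset = 0,		/* guess; kernel rewrites it if wrong */
		.read_domains = I915_GEM_DOMAIN_RENDER,
		.write_domain = I915_GEM_DOMAIN_RENDER,
	};
	struct drm_i915_gem_exec_object2 obj[2] = {
		{ .handle = target },
		{ .handle = batch,
		  .relocation_count = 1,
		  .relocs_ptr = (uintptr_t)&reloc },
	};
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = (uintptr_t)obj,
		.buffer_count = 2,
		.batch_len = batch_len,
		/* HANDLE_LUT: target_handle is an index into obj[].
		 * NO_RELOC: the presumed offsets match obj[].offset, so the
		 * kernel only processes relocations if something moves.
		 */
		.flags = I915_EXEC_RENDER |
			 I915_EXEC_HANDLE_LUT |
			 I915_EXEC_NO_RELOC,
	};

	return drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
}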
+
 struct i915_execbuffer {
 	struct drm_i915_private *i915;
 	struct drm_file *file;
@@ -63,28 +180,62 @@ struct i915_execbuffer {
 	struct i915_address_space *vm;
 	struct i915_vma *batch;
 	struct drm_i915_gem_request *request;
-	u32 batch_start_offset;
-	u32 batch_len;
-	unsigned int dispatch_flags;
-	struct drm_i915_gem_exec_object2 shadow_exec_entry;
-	bool need_relocs;
-	struct list_head vmas;
+	unsigned int buffer_count;
+	struct list_head unbound;
+	struct list_head relocs;
 	struct reloc_cache {
 		struct drm_mm_node node;
 		unsigned long vaddr;
 		unsigned int page;
 		bool use_64bit_reloc;
+		bool has_llc;
+		bool has_fence;
 	} reloc_cache;
-	int lut_mask;
+	u64 invalid_flags;
+	u32 context_flags;
+	u32 dispatch_flags;
+	u32 batch_start_offset;
+	u32 batch_len;
+	int lut_size;
 	struct hlist_head *buckets;
 };
 
+#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+
+/* Used to convert any address to canonical form.
+ * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
+ * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
+ * addresses to be in a canonical form:
+ * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
+ * canonical form [63:48] == [47]."
+ */
+#define GEN8_HIGH_ADDRESS_BIT 47
+static inline u64 gen8_canonical_addr(u64 address)
+{
+	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+}
+
+static inline u64 gen8_noncanonical_addr(u64 address)
+{
+	return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
+}
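As a quick sanity check of the two helpers above, the same arithmetic written as a
freestanding (non-kernel) program; the constants are arbitrary:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Same operation as gen8_canonical_addr(): sign-extend bit 47. */
	uint64_t addr = 0x0000800000000000ull;		/* bit 47 set */
	uint64_t canonical = (uint64_t)((int64_t)(addr << 16) >> 16);

	assert(canonical == 0xffff800000000000ull);	/* bits 63:48 copy bit 47 */
	/* gen8_noncanonical_addr(): masking bits 63:48 recovers the offset. */
	assert((canonical & ((1ull << 48) - 1)) == addr);
	return 0;
}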
+
 static int
 eb_create(struct i915_execbuffer *eb)
 {
 	if ((eb->args->flags & I915_EXEC_HANDLE_LUT) == 0) {
-		unsigned int size = 1 + ilog2(eb->args->buffer_count);
-
+		unsigned int size = 1 + ilog2(eb->buffer_count);
+
+		/* Without a 1:1 association between relocation handles and
+		 * the execobject[] index, we instead create a hashtable.
+		 * We size it dynamically based on available memory, starting
+		 * with a 1:1 associative hash and scaling back until
+		 * the allocation succeeds.
+		 *
+		 * Later on we use a positive lut_size to indicate we are
+		 * using this hashtable, and a negative value to indicate a
+		 * direct lookup.
+		 */
 		do {
 			eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
 					     GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
@@ -99,90 +250,349 @@ eb_create(struct i915_execbuffer *eb)
 				return -ENOMEM;
 		}
 
-		eb->lut_mask = size;
+		eb->lut_size = size;
 	} else {
-		eb->lut_mask = -eb->args->buffer_count;
+		eb->lut_size = -eb->buffer_count;
 	}
 
 	return 0;
 }
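As a worked example of the sizing above (assuming the usual kernel ilog2(), which
rounds down): with 1000 execobjects and no I915_EXEC_HANDLE_LUT, size = 1 +
ilog2(1000) = 10, so the first attempt allocates 1 << 10 = 1024 hash buckets and is
scaled back on allocation failure; with I915_EXEC_HANDLE_LUT set, lut_size becomes
-1000 and eb_get_vma() indexes eb->exec[] directly instead of hashing.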
 
+static bool
+eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
+		 const struct i915_vma *vma)
+{
+	if ((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0)
+		return true;
+
+	if (vma->node.size < entry->pad_to_size)
+		return true;
+
+	if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
+		return true;
+
+	if (entry->flags & EXEC_OBJECT_PINNED &&
+	    vma->node.start != entry->offset)
+		return true;
+
+	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+	    vma->node.start < BATCH_OFFSET_BIAS)
+		return true;
+
+	if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
+	    (vma->node.start + vma->node.size - 1) >> 32)
+		return true;
+
+	return false;
+}
+
+static void
+eb_pin_vma(struct i915_execbuffer *eb,
+	   struct drm_i915_gem_exec_object2 *entry,
+	   struct i915_vma *vma)
+{
+	u64 flags;
+
+	flags = vma->node.start;
+	flags |= PIN_USER | PIN_NONBLOCK | PIN_OFFSET_FIXED;
+	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
+		flags |= PIN_GLOBAL;
+	if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
+		return;
+
+	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
+		if (unlikely(i915_vma_get_fence(vma))) {
+			i915_vma_unpin(vma);
+			return;
+		}
+
+		if (i915_vma_pin_fence(vma))
+			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+	}
+
+	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+}
+
 static inline void
 __eb_unreserve_vma(struct i915_vma *vma,
 		   const struct drm_i915_gem_exec_object2 *entry)
 {
+	GEM_BUG_ON((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0);
+
 	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
 		i915_vma_unpin_fence(vma);
 
-	if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-		__i915_vma_unpin(vma);
+	__i915_vma_unpin(vma);
 }
 
-static void
-eb_unreserve_vma(struct i915_vma *vma)
+static inline void
+eb_unreserve_vma(struct i915_vma *vma,
+		 struct drm_i915_gem_exec_object2 *entry)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-
-	__eb_unreserve_vma(vma, entry);
-	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
+	if (entry->flags & __EXEC_OBJECT_HAS_PIN) {
+		__eb_unreserve_vma(vma, entry);
+		entry->flags &= ~__EB_RESERVED;
+	}
 }
 
-static void
-eb_reset(struct i915_execbuffer *eb)
+static int
+eb_add_vma(struct i915_execbuffer *eb,
+	   struct drm_i915_gem_exec_object2 *entry,
+	   struct i915_vma *vma)
 {
-	struct i915_vma *vma;
+	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		eb_unreserve_vma(vma);
-		i915_vma_put(vma);
-		vma->exec_entry = NULL;
+	GEM_BUG_ON(i915_vma_is_closed(vma));
+
+	if ((eb->args->flags & __EXEC_VALIDATED) == 0) {
+		if (unlikely(entry->flags & eb->invalid_flags))
+			return -EINVAL;
+
+		if (unlikely(entry->alignment && !is_power_of_2(entry->alignment)))
+			return -EINVAL;
+
+		/* Offset can be used as input (EXEC_OBJECT_PINNED), reject
+		 * any non-page-aligned or non-canonical addresses.
+		 */
+		if (entry->flags & EXEC_OBJECT_PINNED) {
+			if (unlikely(entry->offset !=
+				     gen8_canonical_addr(entry->offset & PAGE_MASK)))
+				return -EINVAL;
+		}
+
+		/* From drm_mm perspective address space is continuous,
+		 * so from this point we're always using non-canonical
+		 * form internally.
+		 */
+		entry->offset = gen8_noncanonical_addr(entry->offset);
+
+		/* pad_to_size was once a reserved field, so sanitize it */
+		if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) {
+			if (unlikely(offset_in_page(entry->pad_to_size)))
+				return -EINVAL;
+		} else {
+			entry->pad_to_size = 0;
+		}
+
+		if (unlikely(vma->exec_entry)) {
+			DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+				  entry->handle, (int)(entry - eb->exec));
+			return -EINVAL;
+		}
 	}
 
-	if (eb->lut_mask >= 0)
-		memset(eb->buckets, 0,
-		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
-}
+	vma->exec_entry = entry;
+	entry->rsvd2 = (uintptr_t)vma;
+	i915_vma_get(vma);
 
-#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+	if (eb->lut_size >= 0) {
+		vma->exec_handle = entry->handle;
+		hlist_add_head(&vma->exec_node,
+			       &eb->buckets[hash_32(entry->handle,
+						    eb->lut_size)]);
+	}
 
-static bool
-eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
+	if (entry->relocation_count)
+		list_add_tail(&vma->reloc_link, &eb->relocs);
+
+	if (!eb->reloc_cache.has_fence) {
+		entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
+	} else {
+		if (entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
+		    i915_gem_object_is_tiled(vma->obj))
+			entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
+	}
+
+	if ((entry->flags & EXEC_OBJECT_PINNED) == 0)
+		entry->flags |= eb->context_flags;
+
+	ret = 0;
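+	/* If the vma is already bound, try to reuse its current placement. */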
+	if (vma->node.size)
+		eb_pin_vma(eb, entry, vma);
+	if (eb_vma_misplaced(entry, vma)) {
+		eb_unreserve_vma(vma, entry);
+
+		list_add_tail(&vma->exec_link, &eb->unbound);
+		if (drm_mm_node_allocated(&vma->node))
+			ret = i915_vma_unbind(vma);
+	} else {
+		if (entry->offset != vma->node.start) {
+			entry->offset = vma->node.start | UPDATE;
+			eb->args->flags |= __EXEC_HAS_RELOC;
+		}
+	}
+	return ret;
+}
+
+static inline int use_cpu_reloc(const struct reloc_cache *cache,
+				const struct drm_i915_gem_object *obj)
 {
-	if (unlikely(vma->exec_entry)) {
-		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
-			  eb->exec[i].handle, i);
+	if (!i915_gem_object_has_struct_page(obj))
 		return false;
+
+	if (DBG_USE_CPU_RELOC)
+		return DBG_USE_CPU_RELOC > 0;
+
+	return (cache->has_llc ||
+		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_level != I915_CACHE_NONE);
+}
+
+static int
+eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
+{
+	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	u64 flags;
+	int ret;
+
+	flags = PIN_USER | PIN_NONBLOCK;
+	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
+		flags |= PIN_GLOBAL;
+
+	if (!drm_mm_node_allocated(&vma->node)) {
+		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
+		 * limit address to the first 4GBs for unflagged objects.
+		 */
+		if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0)
+			flags |= PIN_ZONE_4G;
+
+		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+			flags |= PIN_MAPPABLE;
+
+		if (entry->flags & EXEC_OBJECT_PINNED) {
+			flags |= entry->offset | PIN_OFFSET_FIXED;
+			/* force overlapping PINNED checks */
+			flags &= ~PIN_NONBLOCK;
+		} else if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
+			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
 	}
-	list_add_tail(&vma->exec_link, &eb->vmas);
 
-	vma->exec_entry = &eb->exec[i];
-	if (eb->lut_mask >= 0) {
-		vma->exec_handle = eb->exec[i].handle;
-		hlist_add_head(&vma->exec_node,
-			       &eb->buckets[hash_32(vma->exec_handle,
-						    eb->lut_mask)]);
+	ret = i915_vma_pin(vma, entry->pad_to_size, entry->alignment, flags);
+	if (ret)
+		return ret;
+
+	if (entry->offset != vma->node.start) {
+		entry->offset = vma->node.start | UPDATE;
+		eb->args->flags |= __EXEC_HAS_RELOC;
 	}
+	entry->flags |= __EXEC_OBJECT_HAS_PIN;
 
-	i915_vma_get(vma);
-	eb->exec[i].rsvd2 = (uintptr_t)vma;
-	return true;
+	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
+		ret = i915_vma_get_fence(vma);
+		if (ret)
+			return ret;
+
+		if (i915_vma_pin_fence(vma))
+			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+	}
+
+	GEM_BUG_ON(eb_vma_misplaced(entry, vma));
+	return 0;
 }
 
-static inline struct hlist_head *ht_head(struct i915_gem_context *ctx,
-					 u32 handle)
+static int eb_reserve(struct i915_execbuffer *eb)
+{
+	const unsigned int count = eb->buffer_count;
+	struct list_head last;
+	struct i915_vma *vma;
+	unsigned int i, pass;
+	int ret;
+
+	/* Attempt to pin all of the buffers into the GTT.
+	 * This is done in 3 phases:
+	 *
+	 * 1a. Unbind all objects that do not match the GTT constraints for
+	 *     the execbuffer (fenceable, mappable, alignment etc).
+	 * 1b. Increment pin count for already bound objects.
+	 * 2.  Bind new objects.
+	 * 3.  Decrement pin count.
+	 *
+	 * This avoids unnecessary unbinding of later objects in order to make
+	 * room for the earlier objects *unless* we need to defragment.
+	 */
+
+	pass = 0;
+	ret = 0;
+	do {
+		list_for_each_entry(vma, &eb->unbound, exec_link) {
+			ret = eb_reserve_vma(eb, vma);
+			if (ret)
+				break;
+		}
+		if (ret != -ENOSPC || pass++)
+			return ret;
+
+		/* Resort *all* the objects into priority order */
+		INIT_LIST_HEAD(&eb->unbound);
+		INIT_LIST_HEAD(&last);
+		for (i = 0; i < count; i++) {
+			struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+
+			vma = to_ptr(struct i915_vma, entry->rsvd2);
+			eb_unreserve_vma(vma, entry);
+
+			if (entry->flags & EXEC_OBJECT_PINNED)
+				list_add(&vma->exec_link, &eb->unbound);
+			else if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+				list_add_tail(&vma->exec_link, &eb->unbound);
+			else
+				list_add_tail(&vma->exec_link, &last);
+		}
+		list_splice_tail(&last, &eb->unbound);
+
+		/* Too fragmented, unbind everything and retry */
+		ret = i915_gem_evict_vm(eb->vm);
+		if (ret)
+			return ret;
+	} while (1);
+}
+
+static inline struct hlist_head *
+ht_head(const struct i915_gem_context *ctx, u32 handle)
 {
 	return &ctx->vma.ht[hash_32(handle, ctx->vma.ht_bits)];
 }
 
+static int eb_batch_index(const struct i915_execbuffer *eb)
+{
+	return eb->buffer_count - 1;
+}
+
+static int eb_select_context(struct i915_execbuffer *eb)
+{
+	struct i915_gem_context *ctx;
+
+	ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1);
+	if (unlikely(IS_ERR(ctx)))
+		return PTR_ERR(ctx);
+
+	if (unlikely(i915_gem_context_is_banned(ctx))) {
+		DRM_DEBUG("Context %u tried to submit while banned\n",
+			  ctx->user_handle);
+		return -EIO;
+	}
+
+	eb->ctx = i915_gem_context_get(ctx);
+	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
+
+	eb->context_flags = 0;
+	if (ctx->flags & CONTEXT_NO_ZEROMAP)
+		eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
+
+	return 0;
+}
+
 static int
 eb_lookup_vmas(struct i915_execbuffer *eb)
 {
-	const int count = eb->args->buffer_count;
+	const int count = eb->buffer_count;
 	struct i915_vma *vma;
+	struct idr *idr;
 	int slow_pass = -1;
-	int i;
+	int i, ret;
 
-	INIT_LIST_HEAD(&eb->vmas);
+	INIT_LIST_HEAD(&eb->relocs);
+	INIT_LIST_HEAD(&eb->unbound);
 
 	if (unlikely(eb->ctx->vma.ht_size & 1))
 		flush_work(&eb->ctx->vma.resize);
@@ -195,8 +605,9 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 			if (vma->ctx_handle != eb->exec[i].handle)
 				continue;
 
-			if (!eb_add_vma(eb, vma, i))
-				return -EINVAL;
+			ret = eb_add_vma(eb, &eb->exec[i], vma);
+			if (unlikely(ret))
+				return ret;
 
 			goto next_vma;
 		}
@@ -207,24 +618,25 @@ next_vma: ;
 	}
 
 	if (slow_pass < 0)
-		return 0;
+		goto out;
 
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
+	idr = &eb->file->object_idr;
 	for (i = slow_pass; i < count; i++) {
 		struct drm_i915_gem_object *obj;
 
 		if (eb->exec[i].rsvd2)
 			continue;
 
-		obj = to_intel_bo(idr_find(&eb->file->object_idr,
-					   eb->exec[i].handle));
+		obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
 		if (unlikely(!obj)) {
 			spin_unlock(&eb->file->table_lock);
 			DRM_DEBUG("Invalid object handle %d at index %d\n",
 				  eb->exec[i].handle, i);
-			return -ENOENT;
+			ret = -ENOENT;
+			goto err;
 		}
 
 		eb->exec[i].rsvd2 = 1 | (uintptr_t)obj;
@@ -245,11 +657,12 @@ next_vma: ;
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
-		obj = to_ptr(struct drm_i915_gem_object, eb->exec[i].rsvd2 & ~1);
+		obj = to_ptr(typeof(*obj), eb->exec[i].rsvd2 & ~1);
 		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
-			return PTR_ERR(vma);
+			ret = PTR_ERR(vma);
+			goto err;
 		}
 
 		/* First come, first served */
@@ -265,8 +678,9 @@ next_vma: ;
 			}
 		}
 
-		if (!eb_add_vma(eb, vma, i))
-			return -EINVAL;
+		ret = eb_add_vma(eb, &eb->exec[i], vma);
+		if (unlikely(ret))
+			goto err;
 	}
 	if (4*eb->ctx->vma.ht_count > 3*eb->ctx->vma.ht_size ||
 	    4*eb->ctx->vma.ht_count < eb->ctx->vma.ht_size) {
@@ -274,15 +688,10 @@ next_vma: ;
 		queue_work(system_highpri_wq, &eb->ctx->vma.resize);
 	}
 
-	return 0;
-}
-
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-
-	vma = to_ptr(struct i915_vma, eb->exec[eb->args->buffer_count-1].rsvd2);
+out:
+	/* take note of the batch buffer before we might reorder the lists */
+	i = eb_batch_index(eb);
+	eb->batch = to_ptr(struct i915_vma, eb->exec[i].rsvd2);
 
 	/*
 	 * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -293,24 +702,34 @@ eb_get_batch(struct i915_execbuffer *eb)
 	 * Note that actual hangs have only been observed on gen7, but for
 	 * paranoia do it everywhere.
 	 */
-	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
-		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if ((eb->exec[i].flags & EXEC_OBJECT_PINNED) == 0)
+		eb->exec[i].flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if (eb->reloc_cache.has_fence)
+		eb->exec[i].flags |= EXEC_OBJECT_NEEDS_FENCE;
 
-	return vma;
+	eb->args->flags |= __EXEC_VALIDATED;
+	return eb_reserve(eb);
+
+err:
+	for (i = slow_pass; i < count; i++) {
+		if (eb->exec[i].rsvd2 & 1)
+			eb->exec[i].rsvd2 = 0;
+	}
+	return ret;
 }
 
 static struct i915_vma *
-eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
+eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle)
 {
-	if (eb->lut_mask < 0) {
-		if (handle >= -eb->lut_mask)
+	if (eb->lut_size < 0) {
+		if (handle >= -eb->lut_size)
 			return NULL;
 		return to_ptr(struct i915_vma, eb->exec[handle].rsvd2);
 	} else {
 		struct hlist_head *head;
 		struct i915_vma *vma;
 
-		head = &eb->buckets[hash_32(handle, eb->lut_mask)];
+		head = &eb->buckets[hash_32(handle, eb->lut_size)];
 		hlist_for_each_entry(vma, head, exec_node) {
 			if (vma->exec_handle == handle)
 				return vma;
@@ -319,61 +738,60 @@ eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 	}
 }
 
-static void eb_destroy(struct i915_execbuffer *eb)
+static void
+eb_reset(const struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma;
+	const unsigned int count = eb->buffer_count;
+	unsigned int i;
 
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		if (!vma->exec_entry)
-			continue;
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 
-		__eb_unreserve_vma(vma, vma->exec_entry);
+		eb_unreserve_vma(vma, entry);
 		vma->exec_entry = NULL;
 		i915_vma_put(vma);
 	}
 
-	i915_gem_context_put(eb->ctx);
-
-	if (eb->lut_mask >= 0)
-		kfree(eb->buckets);
+	if (eb->lut_size >= 0)
+		memset(eb->buckets, 0,
+		       sizeof(struct hlist_head) << eb->lut_size);
 }
 
-static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
+static void eb_release_vma(const struct i915_execbuffer *eb)
 {
-	if (!i915_gem_object_has_struct_page(obj))
-		return false;
+	const unsigned int count = eb->buffer_count;
+	unsigned int i;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (!eb->exec)
+		return;
 
-	return (HAS_LLC(to_i915(obj->base.dev)) ||
-		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
-		obj->cache_level != I915_CACHE_NONE);
-}
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 
-/* Used to convert any address to canonical form.
- * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
- * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
- * addresses to be in a canonical form:
- * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
- * canonical form [63:48] == [47]."
- */
-#define GEN8_HIGH_ADDRESS_BIT 47
-static inline uint64_t gen8_canonical_addr(uint64_t address)
-{
-	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+		if (!vma || !vma->exec_entry)
+			continue;
+
+		GEM_BUG_ON(vma->exec_entry != entry);
+		if (entry->flags & __EXEC_OBJECT_HAS_PIN)
+			__eb_unreserve_vma(vma, entry);
+		vma->exec_entry = NULL;
+		i915_vma_put(vma);
+	}
 }
 
-static inline uint64_t gen8_noncanonical_addr(uint64_t address)
+static void eb_destroy(const struct i915_execbuffer *eb)
 {
-	return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
+	if (eb->lut_size >= 0)
+		kfree(eb->buckets);
 }
 
-static inline uint64_t
+static inline u64
 relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
-		  uint64_t target_offset)
+		  const struct i915_vma *target)
 {
-	return gen8_canonical_addr((int)reloc->delta + target_offset);
+	return gen8_canonical_addr((int)reloc->delta + target->node.start);
 }
 
 static void reloc_cache_init(struct reloc_cache *cache,
@@ -382,6 +800,8 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->has_llc = HAS_LLC(i915);
+	cache->has_fence = INTEL_GEN(i915) < 4;
 	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
 	cache->node.allocated = false;
 }
@@ -484,7 +904,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
 		struct i915_vma *vma;
 		int ret;
 
-		if (use_cpu_reloc(obj))
+		if (use_cpu_reloc(cache, obj))
 			return NULL;
 
 		ret = i915_gem_object_set_to_gtt_domain(obj, true);
@@ -573,25 +993,26 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 		*addr = value;
 }
 
-static int
-relocate_entry(struct drm_i915_gem_object *obj,
+static u64
+relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
-	       struct reloc_cache *cache,
-	       u64 target_offset)
+	       struct i915_execbuffer *eb,
+	       const struct i915_vma *target)
 {
+	struct drm_i915_gem_object *obj = vma->obj;
 	u64 offset = reloc->offset;
-	bool wide = cache->use_64bit_reloc;
+	u64 target_offset = relocation_target(reloc, target);
+	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
-	target_offset = relocation_target(reloc, target_offset);
 repeat:
-	vaddr = reloc_vaddr(obj, cache, offset >> PAGE_SHIFT);
+	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
 		return PTR_ERR(vaddr);
 
 	clflush_write32(vaddr + offset_in_page(offset),
 			lower_32_bits(target_offset),
-			cache->vaddr);
+			eb->reloc_cache.vaddr);
 
 	if (wide) {
 		offset += sizeof(u32);
@@ -600,16 +1021,15 @@ relocate_entry(struct drm_i915_gem_object *obj,
 		goto repeat;
 	}
 
-	return 0;
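+	/* Set bit 0 so that a genuine zero address is distinguishable from
+	 * the "no update required" return value; the caller masks it off
+	 * again before writing the offset back to userspace.
+	 */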
+	return gen8_canonical_addr(target->node.start) | 1;
 }
 
-static int
-eb_relocate_entry(struct i915_vma *vma,
-		  struct i915_execbuffer *eb,
-		  struct drm_i915_gem_relocation_entry *reloc)
+static u64
+eb_relocate_entry(struct i915_execbuffer *eb,
+		  struct i915_vma *vma,
+		  const struct drm_i915_gem_relocation_entry *reloc)
 {
 	struct i915_vma *target;
-	u64 target_offset;
 	int ret;
 
 	/* we've already hold a reference to all valid objects */
@@ -640,26 +1060,28 @@ eb_relocate_entry(struct i915_vma *vma,
 		return -EINVAL;
 	}
 
-	if (reloc->write_domain)
+	if (reloc->write_domain) {
 		target->exec_entry->flags |= EXEC_OBJECT_WRITE;
 
-	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
-	 * pipe_control writes because the gpu doesn't properly redirect them
-	 * through the ppgtt for non_secure batchbuffers.
-	 */
-	if (unlikely(IS_GEN6(eb->i915) &&
-		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
-		ret = i915_vma_bind(target, target->obj->cache_level,
-				    PIN_GLOBAL);
-		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
-			return ret;
+		/* Sandybridge PPGTT errata: We need a global gtt mapping
+		 * for MI and pipe_control writes because the gpu doesn't
+		 * properly redirect them through the ppgtt for non_secure
+		 * batchbuffers.
+		 */
+		if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
+		    IS_GEN6(eb->i915)) {
+			ret = i915_vma_bind(target, target->obj->cache_level,
+					    PIN_GLOBAL);
+			if (WARN_ONCE(ret,
+				      "Unexpected failure to bind target VMA!"))
+				return ret;
+		}
 	}
 
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	target_offset = gen8_canonical_addr(target->node.start);
-	if (target_offset == reloc->presumed_offset)
+	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
@@ -680,34 +1102,33 @@ eb_relocate_entry(struct i915_vma *vma,
 		return -EINVAL;
 	}
 
-	ret = relocate_entry(vma->obj, reloc, &eb->reloc_cache, target_offset);
-	if (ret)
-		return ret;
-
 	/* and update the user's relocation entry */
-	reloc->presumed_offset = target_offset;
-	return 0;
+	return relocate_entry(vma, reloc, eb, target);
 }
 
-static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
+static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
-	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
-	struct drm_i915_gem_relocation_entry __user *user_relocs;
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	int remain, ret = 0;
-
-	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
+	struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
+	struct drm_i915_gem_relocation_entry __user *urelocs;
+	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	unsigned int remain;
 
+	urelocs = u64_to_user_ptr(entry->relocs_ptr);
 	remain = entry->relocation_count;
-	while (remain) {
-		struct drm_i915_gem_relocation_entry *r = stack_reloc;
-		unsigned long unwritten;
-		unsigned int count;
+	if (unlikely(remain > ULONG_MAX / sizeof(*urelocs)))
+		return -EINVAL;
 
-		count = min_t(unsigned int, remain, ARRAY_SIZE(stack_reloc));
-		remain -= count;
+	/*
+	 * We must check that the entire relocation array is safe
+	 * to read; if the array turns out not to be writable, the user merely
+	 * loses the updated relocation values.
+	 */
 
+	do {
+		struct drm_i915_gem_relocation_entry *r = stack;
+		unsigned int count =
+			min_t(unsigned int, remain, ARRAY_SIZE(stack));
 		/* This is the fast path and we cannot handle a pagefault
 		 * whilst holding the struct mutex lest the user pass in the
 		 * relocations contained within a mmaped bo. For in such a case
@@ -716,66 +1137,66 @@ static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
 		 * this is bad and so lockdep complains vehemently.
 		 */
 		pagefault_disable();
-		unwritten = __copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]));
-		pagefault_enable();
-		if (unlikely(unwritten)) {
-			ret = -EFAULT;
+		if (__copy_from_user_inatomic(r, urelocs, count*sizeof(r[0]))) {
+			pagefault_enable();
+			remain = -EFAULT;
 			goto out;
 		}
+		pagefault_enable();
 
+		remain -= count;
 		do {
-			u64 offset = r->presumed_offset;
+			u64 offset = eb_relocate_entry(eb, vma, r);
 
-			ret = eb_relocate_entry(vma, eb, r);
-			if (ret)
+			if (likely(offset == 0)) {
+			} else if ((s64)offset < 0) {
+				remain = (s64)offset;
 				goto out;
-
-			if (r->presumed_offset != offset) {
+			} else {
+				/* Note that reporting an error now
+				 * leaves everything in an inconsistent
+				 * state as we have *already* changed
+				 * the relocation value inside the
+				 * object. As we have not changed the
+				 * reloc.presumed_offset (and will not
+				 * change the execobject.offset), on the
+				 * next call we may not rewrite the value
+				 * inside the object, leaving it
+				 * dangling and causing a GPU hang.
+				 */
 				pagefault_disable();
-				unwritten = __put_user(r->presumed_offset,
-						       &user_relocs->presumed_offset);
+				__put_user(offset & ~1,
+					   &urelocs[r-stack].presumed_offset);
 				pagefault_enable();
-				if (unlikely(unwritten)) {
-					/* Note that reporting an error now
-					 * leaves everything in an inconsistent
-					 * state as we have *already* changed
-					 * the relocation value inside the
-					 * object. As we have not changed the
-					 * reloc.presumed_offset or will not
-					 * change the execobject.offset, on the
-					 * call we may not rewrite the value
-					 * inside the object, leaving it
-					 * dangling and causing a GPU hang.
-					 */
-					ret = -EFAULT;
-					goto out;
-				}
 			}
-
-			user_relocs++;
-			r++;
-		} while (--count);
-	}
-
+		} while (r++, --count);
+		urelocs += ARRAY_SIZE(stack);
+	} while (remain);
 out:
 	reloc_cache_reset(&eb->reloc_cache);
-	return ret;
+	return remain;
 #undef N_RELOC
 }
 
 static int
-eb_relocate_vma_slow(struct i915_vma *vma,
-		     struct i915_execbuffer *eb,
-		     struct drm_i915_gem_relocation_entry *relocs)
+eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma)
 {
 	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	int i, ret = 0;
+	struct drm_i915_gem_relocation_entry *relocs =
+		to_ptr(typeof(*relocs), entry->relocs_ptr);
+	unsigned int i;
+	int ret;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = eb_relocate_entry(vma, eb, &relocs[i]);
-		if (ret)
-			break;
+		u64 offset = eb_relocate_entry(eb, vma, &relocs[i]);
+
+		if ((s64)offset < 0) {
+			ret = (s64)offset;
+			goto err;
+		}
 	}
+	ret = 0;
+err:
 	reloc_cache_reset(&eb->reloc_cache);
 	return ret;
 }
@@ -783,299 +1204,188 @@ eb_relocate_vma_slow(struct i915_vma *vma,
 static int eb_relocate(struct i915_execbuffer *eb)
 {
 	struct i915_vma *vma;
-	int ret = 0;
 
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		ret = eb_relocate_vma(vma, eb);
+	/* The objects are in their final locations, apply the relocations. */
+	list_for_each_entry(vma, &eb->relocs, reloc_link) {
+		int ret = eb_relocate_vma(eb, vma);
 		if (ret)
-			break;
+			return ret;
 	}
 
-	return ret;
-}
-
-static bool only_mappable_for_reloc(unsigned int flags)
-{
-	return (flags & (EXEC_OBJECT_NEEDS_FENCE | __EXEC_OBJECT_NEEDS_MAP)) ==
-		__EXEC_OBJECT_NEEDS_MAP;
+	return 0;
 }
 
-static int
-eb_reserve_vma(struct i915_vma *vma,
-	       struct intel_engine_cs *engine,
-	       bool *need_reloc)
+static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	uint64_t flags;
-	int ret;
-
-	flags = PIN_USER;
-	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
-		flags |= PIN_GLOBAL;
+	const unsigned long relocs_max =
+		ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry);
+	const char __user *addr, *end;
+	unsigned long size;
+	char __maybe_unused c;
+
+	size = entry->relocation_count;
+	if (size == 0)
+		return 0;
 
-	if (!drm_mm_node_allocated(&vma->node)) {
-		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
-		 * limit address to the first 4GBs for unflagged objects.
-		 */
-		if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0)
-			flags |= PIN_ZONE_4G;
-		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
-			flags |= PIN_GLOBAL | PIN_MAPPABLE;
-		if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
-			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
-		if (entry->flags & EXEC_OBJECT_PINNED)
-			flags |= entry->offset | PIN_OFFSET_FIXED;
-		if ((flags & PIN_MAPPABLE) == 0)
-			flags |= PIN_HIGH;
-	}
-
-	ret = i915_vma_pin(vma,
-			   entry->pad_to_size,
-			   entry->alignment,
-			   flags);
-	if ((ret == -ENOSPC || ret == -E2BIG) &&
-	    only_mappable_for_reloc(entry->flags))
-		ret = i915_vma_pin(vma,
-				   entry->pad_to_size,
-				   entry->alignment,
-				   flags & ~PIN_MAPPABLE);
-	if (ret)
-		return ret;
+	if (size > relocs_max)
+		return -EINVAL;
 
-	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+	addr = u64_to_user_ptr(entry->relocs_ptr);
+	size *= sizeof(struct drm_i915_gem_relocation_entry);
+	if (!access_ok(VERIFY_WRITE, addr, size))
+		return -EFAULT;
 
-	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
-		ret = i915_vma_get_fence(vma);
+	end = addr + size;
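+	/* Touch one byte per page to validate (and prefault) the whole array. */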
+	for (; addr < end; addr += PAGE_SIZE) {
+		int ret = __get_user(c, addr);
 		if (ret)
 			return ret;
-
-		if (i915_vma_pin_fence(vma))
-			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
-
-	if (entry->offset != vma->node.start) {
-		entry->offset = vma->node.start;
-		*need_reloc = true;
-	}
-
-	return 0;
+	return __get_user(c, end - 1);
 }
 
-static bool
-need_reloc_mappable(struct i915_vma *vma)
+static int
+eb_copy_relocations(const struct i915_execbuffer *eb)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-
-	if (entry->relocation_count == 0)
-		return false;
-
-	if (!i915_vma_is_ggtt(vma))
-		return false;
+	const unsigned int count = eb->buffer_count;
+	unsigned int i;
+	int ret;
 
-	/* See also use_cpu_reloc() */
-	if (HAS_LLC(to_i915(vma->obj->base.dev)))
-		return false;
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_relocation_entry __user *urelocs;
+		struct drm_i915_gem_relocation_entry *relocs;
+		unsigned int nreloc = eb->exec[i].relocation_count, j;
+		unsigned long size;
 
-	if (vma->obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-		return false;
+		if (nreloc == 0)
+			continue;
 
-	return true;
-}
+		ret = check_relocations(&eb->exec[i]);
+		if (ret)
+			goto err;
 
-static bool
-eb_vma_misplaced(struct i915_vma *vma)
-{
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+		urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
+		size = nreloc * sizeof(*relocs);
 
-	WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
-		!i915_vma_is_ggtt(vma));
+		relocs = drm_malloc_gfp(size, 1, GFP_TEMPORARY);
+		if (!relocs) {
+			ret = -ENOMEM;
+			goto err;
+		}
 
-	if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
-		return true;
+		/* copy_from_user is limited to 4GiB */
+		j = 0;
+		do {
+			u32 len = min_t(u64, 1ull<<31, size);
 
-	if (vma->node.size < entry->pad_to_size)
-		return true;
+			if (__copy_from_user(relocs + j, urelocs + j, len)) {
+				ret = -EFAULT;
+				goto err;
+			}
 
-	if (entry->flags & EXEC_OBJECT_PINNED &&
-	    vma->node.start != entry->offset)
-		return true;
+			size -= len;
+			BUILD_BUG_ON_NOT_POWER_OF_2(sizeof(*relocs));
+			j += len / sizeof(*relocs);
+		} while (size);
 
-	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
-	    vma->node.start < BATCH_OFFSET_BIAS)
-		return true;
+		/* As we do not update the known relocation offsets after
+		 * relocating (due to the complexities in lock handling),
+		 * we need to mark them as invalid now so that we force the
+		 * relocation processing next time. Just in case the target
+		 * object is evicted and then rebound into its old
+		 * presumed_offset before the next execbuffer - if that
+		 * happened we would make the mistake of assuming that the
+		 * relocations were valid.
+		 */
+		user_access_begin();
+		for (j = 0; j < nreloc; j++)
+			unsafe_put_user(-1,
+					&urelocs[j].presumed_offset,
+					end_user);
+end_user:
+		user_access_end();
 
-	/* avoid costly ping-pong once a batch bo ended up non-mappable */
-	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
-	    !i915_vma_is_map_and_fenceable(vma))
-		return !only_mappable_for_reloc(entry->flags);
+		eb->exec[i].relocs_ptr = (uintptr_t)relocs;
+	}
 
-	if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
-	    (vma->node.start + vma->node.size - 1) >> 32)
-		return true;
+	return 0;
 
-	return false;
+err:
+	while (i--) {
+		struct drm_i915_gem_relocation_entry *relocs =
+			to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr);
+		if (eb->exec[i].relocation_count)
+			drm_free_large(relocs);
+	}
+	return ret;
 }
 
-static int eb_reserve(struct i915_execbuffer *eb)
+static int eb_prefault_relocations(const struct i915_execbuffer *eb)
 {
-	const bool has_fenced_gpu_access = INTEL_GEN(eb->i915) < 4;
-	struct i915_vma *vma;
-	struct list_head ordered_vmas;
-	struct list_head pinned_vmas;
-	int retry;
-
-	INIT_LIST_HEAD(&ordered_vmas);
-	INIT_LIST_HEAD(&pinned_vmas);
-	while (!list_empty(&eb->vmas)) {
-		struct drm_i915_gem_exec_object2 *entry;
-		bool need_fence, need_mappable;
-
-		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_link);
-		entry = vma->exec_entry;
-
-		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
-			entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
-
-		if (!has_fenced_gpu_access)
-			entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
-		need_fence =
-			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
-			i915_gem_object_is_tiled(vma->obj);
-		need_mappable = need_fence || need_reloc_mappable(vma);
-
-		if (entry->flags & EXEC_OBJECT_PINNED)
-			list_move_tail(&vma->exec_link, &pinned_vmas);
-		else if (need_mappable) {
-			entry->flags |= __EXEC_OBJECT_NEEDS_MAP;
-			list_move(&vma->exec_link, &ordered_vmas);
-		} else
-			list_move_tail(&vma->exec_link, &ordered_vmas);
-	}
-	list_splice(&ordered_vmas, &eb->vmas);
-	list_splice(&pinned_vmas, &eb->vmas);
+	const unsigned int count = eb->buffer_count;
+	unsigned int i;
 
-	/* Attempt to pin all of the buffers into the GTT.
-	 * This is done in 3 phases:
-	 *
-	 * 1a. Unbind all objects that do not match the GTT constraints for
-	 *     the execbuffer (fenceable, mappable, alignment etc).
-	 * 1b. Increment pin count for already bound objects.
-	 * 2.  Bind new objects.
-	 * 3.  Decrement pin count.
-	 *
-	 * This avoid unnecessary unbinding of later objects in order to make
-	 * room for the earlier objects *unless* we need to defragment.
-	 */
-	retry = 0;
-	do {
-		int ret = 0;
-
-		/* Unbind any ill-fitting objects or pin. */
-		list_for_each_entry(vma, &eb->vmas, exec_link) {
-			if (!drm_mm_node_allocated(&vma->node))
-				continue;
-
-			if (eb_vma_misplaced(vma))
-				ret = i915_vma_unbind(vma);
-			else
-				ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
-			if (ret)
-				goto err;
-		}
-
-		/* Bind fresh objects */
-		list_for_each_entry(vma, &eb->vmas, exec_link) {
-			if (drm_mm_node_allocated(&vma->node))
-				continue;
-
-			ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
-			if (ret)
-				goto err;
-		}
-
-err:
-		if (ret != -ENOSPC || retry++)
-			return ret;
+	if (unlikely(i915.prefault_disable))
+		return 0;
 
-		/* Decrement pin count for bound objects */
-		list_for_each_entry(vma, &eb->vmas, exec_link)
-			eb_unreserve_vma(vma);
+	for (i = 0; i < count; i++) {
+		int ret;
 
-		ret = i915_gem_evict_vm(eb->vm, true);
+		ret = check_relocations(&eb->exec[i]);
 		if (ret)
 			return ret;
-	} while (1);
+	}
+
+	return 0;
 }
 
-static int
-eb_relocate_slow(struct i915_execbuffer *eb)
+static int eb_relocate_slow(struct i915_execbuffer *eb)
 {
-	const unsigned int count = eb->args->buffer_count;
 	struct drm_device *dev = &eb->i915->drm;
-	struct drm_i915_gem_relocation_entry *reloc;
+	bool have_copy = false;
 	struct i915_vma *vma;
-	int *reloc_offset;
-	int i, total, ret;
+	int ret = 0;
+
+repeat:
+	if (signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		goto out;
+	}
 
 	/* We may process another execbuffer during the unlock... */
 	eb_reset(eb);
 	mutex_unlock(&dev->struct_mutex);
 
-	total = 0;
-	for (i = 0; i < count; i++)
-		total += eb->exec[i].relocation_count;
-
-	reloc_offset = drm_malloc_ab(count, sizeof(*reloc_offset));
-	reloc = drm_malloc_ab(total, sizeof(*reloc));
-	if (reloc == NULL || reloc_offset == NULL) {
-		drm_free_large(reloc);
-		drm_free_large(reloc_offset);
-		mutex_lock(&dev->struct_mutex);
-		return -ENOMEM;
-	}
-
-	total = 0;
-	for (i = 0; i < count; i++) {
-		struct drm_i915_gem_relocation_entry __user *user_relocs;
-		u64 invalid_offset = (u64)-1;
-		int j;
-
-		user_relocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
-
-		if (copy_from_user(reloc+total, user_relocs,
-				   eb->exec[i].relocation_count * sizeof(*reloc))) {
-			ret = -EFAULT;
-			mutex_lock(&dev->struct_mutex);
-			goto err;
-		}
-
-		/* As we do not update the known relocation offsets after
-		 * relocating (due to the complexities in lock handling),
-		 * we need to mark them as invalid now so that we force the
-		 * relocation processing next time. Just in case the target
-		 * object is evicted and then rebound into its old
-		 * presumed_offset before the next execbuffer - if that
-		 * happened we would make the mistake of assuming that the
-		 * relocations were valid.
-		 */
-		for (j = 0; j < eb->exec[i].relocation_count; j++) {
-			if (__copy_to_user(&user_relocs[j].presumed_offset,
-					   &invalid_offset,
-					   sizeof(invalid_offset))) {
-				ret = -EFAULT;
-				mutex_lock(&dev->struct_mutex);
-				goto err;
-			}
-		}
-
-		reloc_offset[i] = total;
-		total += eb->exec[i].relocation_count;
+	/* We take 3 passes through the slowpath.
+	 *
+	 * 1 - we try to just prefault all the user relocation entries and
+	 * then attempt to reuse the atomic pagefault disabled fast path again.
+	 *
+	 * 2 - we copy the user entries to a local buffer here outside of the
+	 * lock and allow ourselves to wait upon any rendering before
+	 * performing the relocations.
+	 *
+	 * 3 - we already have a local copy of the relocation entries, but
+	 * were interrupted (EAGAIN) whilst waiting for the objects, try again.
+	 */
+	if (ret == 0) {
+		ret = eb_prefault_relocations(eb);
+	} else if (!have_copy) {
+		ret = eb_copy_relocations(eb);
+		have_copy = ret == 0;
+	} else {
+		cond_resched();
+		ret = 0;
+	}
+	if (ret) {
+		mutex_lock(&dev->struct_mutex);
+		goto out;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret) {
 		mutex_lock(&dev->struct_mutex);
-		goto err;
+		goto out;
 	}
 
 	/* reacquire the objects */
@@ -1083,16 +1393,18 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	if (ret)
 		goto err;
 
-	ret = eb_reserve(eb);
-	if (ret)
-		goto err;
-
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		int idx = vma->exec_entry - eb->exec;
-
-		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[idx]);
-		if (ret)
-			goto err;
+	list_for_each_entry(vma, &eb->relocs, reloc_link) {
+		if (!have_copy) {
+			pagefault_disable();
+			ret = eb_relocate_vma(eb, vma);
+			pagefault_enable();
+			if (ret)
+				goto repeat;
+		} else {
+			ret = eb_relocate_vma_slow(eb, vma);
+			if (ret)
+				goto err;
+		}
 	}
 
 	/* Leave the user relocations as are, this is the painfully slow path,
@@ -1102,21 +1414,67 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	 */
 
 err:
-	drm_free_large(reloc);
-	drm_free_large(reloc_offset);
-	return ret;
+	if (ret == -EAGAIN)
+		goto repeat;
+
+out:
+	if (have_copy) {
+		const unsigned int count = eb->buffer_count;
+		unsigned int i;
+
+		for (i = 0; i < count; i++) {
+			const struct drm_i915_gem_exec_object2 *entry =
+				&eb->exec[i];
+			struct drm_i915_gem_relocation_entry *relocs;
+
+			if (entry->relocation_count == 0)
+				continue;
+
+			relocs = to_ptr(typeof(*relocs), entry->relocs_ptr);
+			drm_free_large(relocs);
+		}
+	}
+
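+	/* A positive return indicates that we fell back to the copied
+	 * relocations, i.e. the user's presumed offsets were marked invalid.
+	 */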
+	return ret ?: have_copy;
+}
+
+static void eb_export_fence(struct drm_i915_gem_object *obj,
+			    struct drm_i915_gem_request *req,
+			    unsigned int flags)
+{
+	struct reservation_object *resv = obj->resv;
+
+	/* Ignore errors from failing to allocate the new fence, we can't
+	 * handle an error right now. Worst case should be missed
+	 * synchronisation leading to rendering corruption.
+	 */
+	reservation_object_lock(resv, NULL);
+	if (flags & EXEC_OBJECT_WRITE)
+		reservation_object_add_excl_fence(resv, &req->fence);
+	else if (reservation_object_reserve_shared(resv) == 0)
+		reservation_object_add_shared_fence(resv, &req->fence);
+	reservation_object_unlock(resv);
+}
+
+static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
+{
+	return !(obj->cache_level == I915_CACHE_NONE ||
+		 obj->cache_level == I915_CACHE_WT);
 }
 
 static int
 eb_move_to_gpu(struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma;
+	const unsigned int count = eb->buffer_count;
+	unsigned int i;
 	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
+	for (i = 0; i < count; i++) {
+		const struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 		struct drm_i915_gem_object *obj = vma->obj;
 
-		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
+		if (entry->flags & EXEC_OBJECT_CAPTURE) {
 			struct i915_gem_capture_list *capture;
 
 			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
@@ -1128,8 +1486,8 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 			eb->request->capture_list = capture;
 		}
 
-		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
-			continue;
+		if (entry->flags & EXEC_OBJECT_ASYNC)
+			goto skip_flushes;
 
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
 			i915_gem_clflush_object(obj, 0);
@@ -1137,10 +1495,33 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 		}
 
 		ret = i915_gem_request_await_object
-			(eb->request, obj, vma->exec_entry->flags & EXEC_OBJECT_WRITE);
+			(eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
 		if (ret)
 			return ret;
+
+skip_flushes:
+		obj->base.write_domain = 0;
+		if (entry->flags & EXEC_OBJECT_WRITE) {
+			obj->base.read_domains = 0;
+			if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
+				obj->cache_dirty = true;
+			intel_fb_obj_invalidate(obj, ORIGIN_CS);
+		}
+		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
+
+		i915_vma_move_to_active(vma, eb->request, entry->flags);
+		__eb_unreserve_vma(vma, entry);
+		vma->exec_entry = NULL;
+	}
+
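+	/* Second pass: attach the request fence to each object's reservation
+	 * object and drop the references taken during lookup.
+	 */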
+	for (i = 0; i < count; i++) {
+		const struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
+
+		eb_export_fence(vma->obj, eb->request, entry->flags);
+		i915_vma_put(vma);
 	}
+	eb->exec = NULL;
 
 	/* Unconditionally flush any chipset caches (for streaming writes). */
 	i915_gem_chipset_flush(eb->i915);
@@ -1172,114 +1553,10 @@ i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 	return true;
 }
 
-static int
-validate_exec_list(struct drm_device *dev,
-		   struct drm_i915_gem_exec_object2 *exec,
-		   int count)
-{
-	unsigned relocs_total = 0;
-	unsigned relocs_max = UINT_MAX / sizeof(struct drm_i915_gem_relocation_entry);
-	unsigned invalid_flags;
-	int i;
-
-	/* INTERNAL flags must not overlap with external ones */
-	BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & ~__EXEC_OBJECT_UNKNOWN_FLAGS);
-
-	invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
-	if (USES_FULL_PPGTT(dev))
-		invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
-
-	for (i = 0; i < count; i++) {
-		char __user *ptr = u64_to_user_ptr(exec[i].relocs_ptr);
-		int length; /* limited by fault_in_pages_readable() */
-
-		if (exec[i].flags & invalid_flags)
-			return -EINVAL;
-
-		/* Offset can be used as input (EXEC_OBJECT_PINNED), reject
-		 * any non-page-aligned or non-canonical addresses.
-		 */
-		if (exec[i].flags & EXEC_OBJECT_PINNED) {
-			if (exec[i].offset !=
-			    gen8_canonical_addr(exec[i].offset & PAGE_MASK))
-				return -EINVAL;
-		}
-
-		/* From drm_mm perspective address space is continuous,
-		 * so from this point we're always using non-canonical
-		 * form internally.
-		 */
-		exec[i].offset = gen8_noncanonical_addr(exec[i].offset);
-
-		if (exec[i].alignment && !is_power_of_2(exec[i].alignment))
-			return -EINVAL;
-
-		/* pad_to_size was once a reserved field, so sanitize it */
-		if (exec[i].flags & EXEC_OBJECT_PAD_TO_SIZE) {
-			if (offset_in_page(exec[i].pad_to_size))
-				return -EINVAL;
-		} else {
-			exec[i].pad_to_size = 0;
-		}
-
-		/* First check for malicious input causing overflow in
-		 * the worst case where we need to allocate the entire
-		 * relocation tree as a single array.
-		 */
-		if (exec[i].relocation_count > relocs_max - relocs_total)
-			return -EINVAL;
-		relocs_total += exec[i].relocation_count;
-
-		length = exec[i].relocation_count *
-			sizeof(struct drm_i915_gem_relocation_entry);
-		/*
-		 * We must check that the entire relocation array is safe
-		 * to read, but since we may need to update the presumed
-		 * offsets during execution, check for full write access.
-		 */
-		if (!access_ok(VERIFY_WRITE, ptr, length))
-			return -EFAULT;
-
-		if (likely(!i915.prefault_disable)) {
-			if (fault_in_pages_readable(ptr, length))
-				return -EFAULT;
-		}
-	}
-
-	return 0;
-}
-
-static int eb_select_context(struct i915_execbuffer *eb)
-{
-	unsigned int ctx_id = i915_execbuffer2_get_context_id(*eb->args);
-	struct i915_gem_context *ctx;
-
-	ctx = i915_gem_context_lookup(eb->file->driver_priv, ctx_id);
-	if (unlikely(IS_ERR(ctx)))
-		return PTR_ERR(ctx);
-
-	if (unlikely(i915_gem_context_is_banned(ctx))) {
-		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
-		return -EIO;
-	}
-
-	eb->ctx = i915_gem_context_get(ctx);
-	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
-
-	return 0;
-}
-
-static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
-{
-	return !(obj->cache_level == I915_CACHE_NONE ||
-		 obj->cache_level == I915_CACHE_WT);
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
 			     struct drm_i915_gem_request *req,
 			     unsigned int flags)
 {
-	struct drm_i915_gem_object *obj = vma->obj;
 	const unsigned int idx = req->engine->id;
 
 	lockdep_assert_held(&req->i915->drm.struct_mutex);
@@ -1293,17 +1570,17 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 	 * *last*.
 	 */
 	if (!i915_vma_is_active(vma))
-		obj->active_count++;
+		vma->obj->active_count++;
 	i915_vma_set_active(vma, idx);
 	i915_gem_active_set(&vma->last_read[idx], req);
 	list_move_tail(&vma->vm_link, &vma->vm->active_list);
 
 	if (flags & EXEC_OBJECT_WRITE) {
+		struct drm_i915_gem_object *obj = vma->obj;
+
 		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
 			i915_gem_active_set(&obj->frontbuffer_write, req);
 
-		/* update for the implicit flush after a batch */
-		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
 		if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
 			obj->cache_dirty = true;
 	}
@@ -1312,42 +1589,6 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 		i915_gem_active_set(&vma->last_fence, req);
 }
 
-static void eb_export_fence(struct drm_i915_gem_object *obj,
-			    struct drm_i915_gem_request *req,
-			    unsigned int flags)
-{
-	struct reservation_object *resv = obj->resv;
-
-	/* Ignore errors from failing to allocate the new fence, we can't
-	 * handle an error right now. Worst case should be missed
-	 * synchronisation leading to rendering corruption.
-	 */
-	reservation_object_lock(resv, NULL);
-	if (flags & EXEC_OBJECT_WRITE)
-		reservation_object_add_excl_fence(resv, &req->fence);
-	else if (reservation_object_reserve_shared(resv) == 0)
-		reservation_object_add_shared_fence(resv, &req->fence);
-	reservation_object_unlock(resv);
-}
-
-static void
-eb_move_to_active(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-
-	list_for_each_entry(vma, &eb->vmas, exec_link) {
-		struct drm_i915_gem_object *obj = vma->obj;
-
-		obj->base.write_domain = 0;
-		if (vma->exec_entry->flags & EXEC_OBJECT_WRITE)
-			obj->base.read_domains = 0;
-		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
-
-		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
-		eb_export_fence(obj, eb->request, vma->exec_entry->flags);
-	}
-}
-
 static int
 i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 {
@@ -1359,16 +1600,16 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 		return -EINVAL;
 	}
 
-	cs = intel_ring_begin(req, 4 * 3);
+	cs = intel_ring_begin(req, 4 * 2 + 2);
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
+	*cs++ = MI_LOAD_REGISTER_IMM(4);
 	for (i = 0; i < 4; i++) {
-		*cs++ = MI_LOAD_REGISTER_IMM(1);
 		*cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i));
 		*cs++ = 0;
 	}
-
+	*cs++ = MI_NOOP;
 	intel_ring_advance(req, cs);
 
 	return 0;
@@ -1404,10 +1645,11 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 		goto out;
 
 	vma->exec_entry =
-		memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
+		memset(&eb->exec[eb->buffer_count++],
+		       0, sizeof(*vma->exec_entry));
 	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
-	i915_gem_object_get(shadow_batch_obj);
-	list_add_tail(&vma->exec_link, &eb->vmas);
+	vma->exec_entry->rsvd2 = (uintptr_t)vma;
+	i915_vma_get(vma);
 
 out:
 	i915_gem_object_unpin_pages(shadow_batch_obj);
@@ -1423,7 +1665,7 @@ add_to_client(struct drm_i915_gem_request *req,
 }
 
 static int
-execbuf_submit(struct i915_execbuffer *eb)
+eb_submit(struct i915_execbuffer *eb)
 {
 	int ret;
 
@@ -1449,8 +1691,6 @@ execbuf_submit(struct i915_execbuffer *eb)
 	if (ret)
 		return ret;
 
-	eb_move_to_active(eb);
-
 	return 0;
 }
 
@@ -1542,20 +1782,21 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	int out_fence_fd = -1;
 	int ret;
 
-	if (!i915_gem_check_execbuffer(args))
-		return -EINVAL;
-
-	ret = validate_exec_list(dev, exec, args->buffer_count);
-	if (ret)
-		return ret;
+	BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & ~__EXEC_OBJECT_UNKNOWN_FLAGS);
 
 	eb.i915 = to_i915(dev);
 	eb.file = file;
 	eb.args = args;
+	if ((args->flags & I915_EXEC_NO_RELOC) == 0)
+		args->flags |= __EXEC_HAS_RELOC;
 	eb.exec = exec;
-	eb.need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+	eb.ctx = NULL;
+	eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
+	if (USES_FULL_PPGTT(eb.i915))
+		eb.invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
 	reloc_cache_init(&eb.reloc_cache, eb.i915);
 
+	eb.buffer_count = args->buffer_count;
 	eb.batch_start_offset = args->batch_start_offset;
 	eb.batch_len = args->batch_len;
 
@@ -1601,6 +1842,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		}
 	}
 
+	if (eb_create(&eb))
+		return -ENOMEM;
+
 	/* Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
 	 * process. Upon first dispatch, we acquire another prolonged
@@ -1608,60 +1852,41 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 * 100ms.
 	 */
 	intel_runtime_pm_get(eb.i915);
-
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
-		goto pre_mutex_err;
+		goto err_rpm;
 
 	ret = eb_select_context(&eb);
-	if (ret) {
-		mutex_unlock(&dev->struct_mutex);
-		goto pre_mutex_err;
-	}
+	if (unlikely(ret))
+		goto err_unlock;
 
-	if (eb_create(&eb)) {
-		i915_gem_context_put(eb.ctx);
-		mutex_unlock(&dev->struct_mutex);
-		ret = -ENOMEM;
-		goto pre_mutex_err;
-	}
-
-	/* Look up object handles */
 	ret = eb_lookup_vmas(&eb);
-	if (ret)
-		goto err;
-
-	/* take note of the batch buffer before we might reorder the lists */
-	eb.batch = eb_get_batch(&eb);
-
-	/* Move the objects en-masse into the GTT, evicting if necessary. */
-	ret = eb_reserve(&eb);
-	if (ret)
-		goto err;
-
-	/* The objects are in their final locations, apply the relocations. */
-	if (eb.need_relocs)
+	if (likely(!ret && args->flags & __EXEC_HAS_RELOC))
 		ret = eb_relocate(&eb);
-	if (ret) {
-		if (ret == -EFAULT) {
-			ret = eb_relocate_slow(&eb);
-			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
-		}
-		if (ret)
-			goto err;
-	}
+	if (ret == -EAGAIN || ret == -EFAULT)
+		ret = eb_relocate_slow(&eb);
+	if (ret && args->flags & I915_EXEC_NO_RELOC)
+		/* If the user expects the execobject.offset and
+		 * reloc.presumed_offset to be an exact match,
+		 * as for using NO_RELOC, then we cannot update
+		 * the execobject.offset until we have completed
+		 * relocation.
+		 */
+		args->flags &= ~__EXEC_HAS_RELOC;
+	if (ret < 0)
+		goto err_vma;
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE) {
+	if (unlikely(eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE)) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
-		goto err;
+		goto err_vma;
 	}
 	if (eb.batch_start_offset > eb.batch->size ||
 	    eb.batch_len > eb.batch->size - eb.batch_start_offset) {
 		DRM_DEBUG("Attempting to use out-of-bounds batch\n");
 		ret = -EINVAL;
-		goto err;
+		goto err_vma;
 	}
 
 	if (eb.engine->needs_cmd_parser && eb.batch_len) {
@@ -1670,7 +1895,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		vma = eb_parse(&eb, drm_is_current_master(file));
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
-			goto err;
+			goto err_vma;
 		}
 
 		if (vma) {
@@ -1696,7 +1921,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 * batch" bit. Hence we need to pin secure batches into the global gtt.
 	 * hsw should have this fixed, but bdw mucks it up again. */
 	if (eb.dispatch_flags & I915_DISPATCH_SECURE) {
-		struct drm_i915_gem_object *obj = eb.batch->obj;
 		struct i915_vma *vma;
 
 		/*
@@ -1709,10 +1933,10 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		 *   fitting due to fragmentation.
 		 * So this is actually safe.
 		 */
-		vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
+		vma = i915_gem_object_ggtt_pin(eb.batch->obj, NULL, 0, 0, 0);
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
-			goto err;
+			goto err_vma;
 		}
 
 		eb.batch = vma;
@@ -1748,7 +1972,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.request->batch = eb.batch;
 
 	trace_i915_gem_request_queue(eb.request, eb.dispatch_flags);
-	ret = execbuf_submit(&eb);
+	ret = eb_submit(&eb);
 err_request:
 	__i915_add_request(eb.request, ret == 0);
 	add_to_client(eb.request, file);
@@ -1765,23 +1989,16 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	}
 
 err_batch_unpin:
-	/*
-	 * FIXME: We crucially rely upon the active tracking for the (ppgtt)
-	 * batch vma for correctness. For less ugly and less fragility this
-	 * needs to be adjusted to also track the ggtt batch vma properly as
-	 * active.
-	 */
 	if (eb.dispatch_flags & I915_DISPATCH_SECURE)
 		i915_vma_unpin(eb.batch);
-err:
-	/* the request owns the ref now */
-	eb_destroy(&eb);
+err_vma:
+	eb_release_vma(&eb);
+	i915_gem_context_put(eb.ctx);
+err_unlock:
 	mutex_unlock(&dev->struct_mutex);
-
-pre_mutex_err:
-	/* intel_gpu_busy should also get a ref, so it will free when the device
-	 * is really idle. */
+err_rpm:
 	intel_runtime_pm_put(eb.i915);
+	eb_destroy(&eb);
 	if (out_fence_fd != -1)
 		put_unused_fd(out_fence_fd);
 err_in_fence:
@@ -1808,9 +2025,27 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
+	exec2.buffers_ptr = args->buffers_ptr;
+	exec2.buffer_count = args->buffer_count;
+	exec2.batch_start_offset = args->batch_start_offset;
+	exec2.batch_len = args->batch_len;
+	exec2.DR1 = args->DR1;
+	exec2.DR4 = args->DR4;
+	exec2.num_cliprects = args->num_cliprects;
+	exec2.cliprects_ptr = args->cliprects_ptr;
+	exec2.flags = I915_EXEC_RENDER;
+	i915_execbuffer2_set_context_id(exec2, 0);
+
+	if (!i915_gem_check_execbuffer(&exec2))
+		return -EINVAL;
+
 	/* Copy in the exec list from userland */
-	exec_list = drm_malloc_ab(sizeof(*exec_list), args->buffer_count);
-	exec2_list = drm_malloc_ab(sizeof(*exec2_list), args->buffer_count);
+	exec_list = drm_malloc_gfp(args->buffer_count,
+				   sizeof(*exec_list),
+				   __GFP_NOWARN | GFP_TEMPORARY);
+	exec2_list = drm_malloc_gfp(args->buffer_count + 1,
+				    sizeof(*exec2_list),
+				    __GFP_NOWARN | GFP_TEMPORARY);
 	if (exec_list == NULL || exec2_list == NULL) {
 		DRM_DEBUG("Failed to allocate exec list for %d buffers\n",
 			  args->buffer_count);
@@ -1841,36 +2076,23 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 			exec2_list[i].flags = 0;
 	}
 
-	exec2.buffers_ptr = args->buffers_ptr;
-	exec2.buffer_count = args->buffer_count;
-	exec2.batch_start_offset = args->batch_start_offset;
-	exec2.batch_len = args->batch_len;
-	exec2.DR1 = args->DR1;
-	exec2.DR4 = args->DR4;
-	exec2.num_cliprects = args->num_cliprects;
-	exec2.cliprects_ptr = args->cliprects_ptr;
-	exec2.flags = I915_EXEC_RENDER;
-	i915_execbuffer2_set_context_id(exec2, 0);
-
 	ret = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list);
-	if (!ret) {
+	if (exec2.flags & __EXEC_HAS_RELOC) {
 		struct drm_i915_gem_exec_object __user *user_exec_list =
 			u64_to_user_ptr(args->buffers_ptr);
 
 		/* Copy the new buffer offsets back to the user's exec list. */
 		for (i = 0; i < args->buffer_count; i++) {
+			if ((exec2_list[i].offset & UPDATE) == 0)
+				continue;
+
 			exec2_list[i].offset =
-				gen8_canonical_addr(exec2_list[i].offset);
-			ret = __copy_to_user(&user_exec_list[i].offset,
-					     &exec2_list[i].offset,
-					     sizeof(user_exec_list[i].offset));
-			if (ret) {
-				ret = -EFAULT;
-				DRM_DEBUG("failed to copy %d exec entries "
-					  "back to user (%d)\n",
-					  args->buffer_count, ret);
+				gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
+			exec2_list[i].offset &= PIN_OFFSET_MASK;
+			if (__copy_to_user(&user_exec_list[i].offset,
+					   &exec2_list[i].offset,
+					   sizeof(user_exec_list[i].offset)))
 				break;
-			}
 		}
 	}
 
@@ -1884,56 +2106,63 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		     struct drm_file *file)
 {
 	struct drm_i915_gem_execbuffer2 *args = data;
-	struct drm_i915_gem_exec_object2 *exec2_list = NULL;
+	struct drm_i915_gem_exec_object2 *exec2_list;
 	int ret;
 
 	if (args->buffer_count < 1 ||
-	    args->buffer_count > UINT_MAX / sizeof(*exec2_list)) {
+	    args->buffer_count >= UINT_MAX / sizeof(*exec2_list) - 1) {
 		DRM_DEBUG("execbuf2 with %d buffers\n", args->buffer_count);
 		return -EINVAL;
 	}
 
-	exec2_list = drm_malloc_gfp(args->buffer_count,
+	if (!i915_gem_check_execbuffer(args))
+		return -EINVAL;
+
+	exec2_list = drm_malloc_gfp(args->buffer_count + 1,
 				    sizeof(*exec2_list),
-				    GFP_TEMPORARY);
+				    __GFP_NOWARN | GFP_TEMPORARY);
 	if (exec2_list == NULL) {
 		DRM_DEBUG("Failed to allocate exec list for %d buffers\n",
 			  args->buffer_count);
 		return -ENOMEM;
 	}
-	ret = copy_from_user(exec2_list,
-			     u64_to_user_ptr(args->buffers_ptr),
-			     sizeof(*exec2_list) * args->buffer_count);
-	if (ret != 0) {
-		DRM_DEBUG("copy %d exec entries failed %d\n",
-			  args->buffer_count, ret);
+	if (copy_from_user(exec2_list,
+			   u64_to_user_ptr(args->buffers_ptr),
+			   sizeof(*exec2_list) * args->buffer_count)) {
+		DRM_DEBUG("copy %d exec entries failed\n", args->buffer_count);
 		drm_free_large(exec2_list);
 		return -EFAULT;
 	}
 
 	ret = i915_gem_do_execbuffer(dev, file, args, exec2_list);
-	if (!ret) {
-		/* Copy the new buffer offsets back to the user's exec list. */
+
+	/* Now that we have begun execution of the batchbuffer, we ignore
+	 * any new error after this point. Also given that we have already
+	 * updated the associated relocations, we try to write out the current
+	 * object locations irrespective of any error.
+	 */
+	if (args->flags & __EXEC_HAS_RELOC) {
 		struct drm_i915_gem_exec_object2 __user *user_exec_list =
-				   u64_to_user_ptr(args->buffers_ptr);
+			u64_to_user_ptr(args->buffers_ptr);
 		int i;
 
+		/* Copy the new buffer offsets back to the user's exec list. */
+		user_access_begin();
 		for (i = 0; i < args->buffer_count; i++) {
+			if ((exec2_list[i].offset & UPDATE) == 0)
+				continue;
+
 			exec2_list[i].offset =
-				gen8_canonical_addr(exec2_list[i].offset);
-			ret = __copy_to_user(&user_exec_list[i].offset,
-					     &exec2_list[i].offset,
-					     sizeof(user_exec_list[i].offset));
-			if (ret) {
-				ret = -EFAULT;
-				DRM_DEBUG("failed to copy %d exec entries "
-					  "back to user\n",
-					  args->buffer_count);
-				break;
-			}
+				gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
+			unsafe_put_user(exec2_list[i].offset,
+					&user_exec_list[i].offset,
+					end_user);
 		}
+end_user:
+		user_access_end();
 	}
 
+	args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS;
 	drm_free_large(exec2_list);
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 507463f144c9..5097dcc3987a 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -463,7 +463,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 			  size, obj->base.size,
 			  flags & PIN_MAPPABLE ? "mappable" : "total",
 			  end);
-		return -E2BIG;
+		return -ENOSPC;
 	}
 
 	ret = i915_gem_object_pin_pages(obj);
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 88543fafcffc..062addfee6ef 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -103,6 +103,7 @@ struct i915_vma {
 
 	/** This vma's place in the execbuf reservation list */
 	struct list_head exec_link;
+	struct list_head reloc_link;
 
 	/** This vma's place in the eviction list */
 	struct list_head evict_link;
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 14e9c2fbc4e6..5ea373221f49 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -304,7 +304,7 @@ static int igt_evict_vm(void *arg)
 		goto cleanup;
 
 	/* Everything is pinned, nothing should happen */
-	err = i915_gem_evict_vm(&ggtt->base, false);
+	err = i915_gem_evict_vm(&ggtt->base);
 	if (err) {
 		pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n",
 		       err);
@@ -313,7 +313,7 @@ static int igt_evict_vm(void *arg)
 
 	unpin_ggtt(i915);
 
-	err = i915_gem_evict_vm(&ggtt->base, false);
+	err = i915_gem_evict_vm(&ggtt->base);
 	if (err) {
 		pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n",
 		       err);
-- 
2.11.0


* [PATCH 10/15] drm/i915: First try the previous execbuffer location
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (8 preceding siblings ...)
  2017-03-16 13:20 ` [PATCH 09/15] drm/i915: Eliminate lots of iterations over the execobjects array Chris Wilson
@ 2017-03-16 13:20 ` Chris Wilson
  2017-03-21 13:54   ` Joonas Lahtinen
  2017-03-16 13:20 ` [PATCH 11/15] drm/i915: Wait upon userptr get-user-pages within execbuffer Chris Wilson
                   ` (8 subsequent siblings)
  18 siblings, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:20 UTC (permalink / raw)
  To: intel-gfx

When choosing a slot for an execbuffer, we ideally want to use the same
address as last time (so that we don't have to rebind it) and the same
address as expected by the user (so that we don't have to fixup any
relocations pointing to it). If we first try to bind the object at the
incoming execbuffer->offset supplied by the user, or at its currently
bound offset, we should hopefully achieve the goal of avoiding both the
rebind cost and the relocation penalty. However, if the object is not
currently bound there, we don't want to arbitrarily unbind another
object from our chosen position, and so we choose to rebind/relocate
the incoming object instead. After we
report the new position back to the user, on the next pass the
relocations should have settled down.
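
As a rough sketch of the intended policy (illustrative only; the helper
name is invented and the real logic lives in eb_pin_vma() in the diff
below), the preferred slot boils down to:

static u64 preferred_offset(const struct i915_vma *vma,
                            const struct drm_i915_gem_exec_object2 *entry)
{
        /* Reuse the current binding if we still have one... */
        if (vma->node.size)
                return vma->node.start;

        /* ...otherwise trust the offset userspace last saw. */
        return entry->offset & PIN_OFFSET_MASK;
}

Either choice is then pinned with PIN_NOEVICT | PIN_OFFSET_FIXED so that
we never displace an already bound object just to honour the hint.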

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 ++++++++----
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  6 ++++++
 drivers/gpu/drm/i915/i915_gem_gtt.h        |  1 +
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 5ce4f2e47eb9..0f141a5800f2 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -293,10 +293,15 @@ eb_pin_vma(struct i915_execbuffer *eb,
 {
 	u64 flags;
 
-	flags = vma->node.start;
-	flags |= PIN_USER | PIN_NONBLOCK | PIN_OFFSET_FIXED;
+	if (vma->node.size)
+		flags = vma->node.start;
+	else
+		flags = entry->offset & PIN_OFFSET_MASK;
+
+	flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED;
 	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
 		flags |= PIN_GLOBAL;
+
 	if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
 		return;
 
@@ -407,8 +412,7 @@ eb_add_vma(struct i915_execbuffer *eb,
 		entry->flags |= eb->context_flags;
 
 	ret = 0;
-	if (vma->node.size)
-		eb_pin_vma(eb, entry, vma);
+	eb_pin_vma(eb, entry, vma);
 	if (eb_vma_misplaced(entry, vma)) {
 		eb_unreserve_vma(vma, entry);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index cee9c4fec52a..9160733cbb09 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3288,6 +3288,9 @@ int i915_gem_gtt_reserve(struct i915_address_space *vm,
 	if (err != -ENOSPC)
 		return err;
 
+	if (flags & PIN_NOEVICT)
+		return -ENOSPC;
+
 	err = i915_gem_evict_for_node(vm, node, flags);
 	if (err == 0)
 		err = drm_mm_reserve_node(&vm->mm, node);
@@ -3402,6 +3405,9 @@ int i915_gem_gtt_insert(struct i915_address_space *vm,
 	if (err != -ENOSPC)
 		return err;
 
+	if (flags & PIN_NOEVICT)
+		return -ENOSPC;
+
 	/* No free space, pick a slot at random.
 	 *
 	 * There is a pathological case here using a GTT shared between
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 948b2e68131a..6be7a5c32392 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -589,6 +589,7 @@ int i915_gem_gtt_insert(struct i915_address_space *vm,
 #define PIN_MAPPABLE		BIT(1)
 #define PIN_ZONE_4G		BIT(2)
 #define PIN_NONFAULT		BIT(3)
+#define PIN_NOEVICT		BIT(4)
 
 #define PIN_MBZ			BIT(5) /* I915_VMA_PIN_OVERFLOW */
 #define PIN_GLOBAL		BIT(6) /* I915_VMA_GLOBAL_BIND */
-- 
2.11.0


* [PATCH 11/15] drm/i915: Wait upon userptr get-user-pages within execbuffer
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (9 preceding siblings ...)
  2017-03-16 13:20 ` [PATCH 10/15] drm/i915: First try the previous execbuffer location Chris Wilson
@ 2017-03-16 13:20 ` Chris Wilson
  2017-03-16 13:20 ` [PATCH 12/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper Chris Wilson
                   ` (7 subsequent siblings)
  18 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:20 UTC (permalink / raw)
  To: intel-gfx

This simply hides the EAGAIN caused by userptr when userspace causes
resource contention. However, it is quite beneficial with highly
contended userptr users as we avoid repeating the setup costs and
kernel-user context switches.
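
In (simplified) sketch form, the two halves of the change are: userptr
queues its get_user_pages() work onto a dedicated workqueue, and the
execbuf slow path flushes that workqueue instead of bouncing EAGAIN back
to userspace (see the diff for the real call sites and pointers):

        /* userptr setup */
        queue_work(i915->mm.userptr_wq, &work->work);

        /* execbuf slow path, before retrying the relocations */
        flush_workqueue(eb->i915->mm.userptr_wq);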

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_drv.h            | 10 +++++++++-
 drivers/gpu/drm/i915/i915_gem.c            |  4 +++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
 drivers/gpu/drm/i915/i915_gem_userptr.c    | 18 +++++++++++++++---
 5 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 9d8c8b928aab..fc51094189c0 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -552,6 +552,7 @@ static void i915_gem_fini(struct drm_i915_private *dev_priv)
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	i915_gem_cleanup_engines(dev_priv);
 	i915_gem_context_fini(dev_priv);
+	i915_gem_cleanup_userptr(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	i915_gem_drain_freed_objects(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4bb397d84923..2ff8694cdaa5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1525,6 +1525,13 @@ struct i915_gem_mm {
 	struct list_head fence_list;
 
 	/**
+	 * Workqueue to fault in userptr pages, flushed by the execbuf
+	 * when required but otherwise left to userspace to try again
+	 * on EAGAIN.
+	 */
+	struct workqueue_struct *userptr_wq;
+
+	/**
 	 * Are we in a non-interruptible section of code like
 	 * modesetting?
 	 */
@@ -3191,7 +3198,8 @@ int i915_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
 			      struct drm_file *file_priv);
 int i915_gem_get_tiling_ioctl(struct drm_device *dev, void *data,
 			      struct drm_file *file_priv);
-void i915_gem_init_userptr(struct drm_i915_private *dev_priv);
+int i915_gem_init_userptr(struct drm_i915_private *dev_priv);
+void i915_gem_cleanup_userptr(struct drm_i915_private *dev_priv);
 int i915_gem_userptr_ioctl(struct drm_device *dev, void *data,
 			   struct drm_file *file);
 int i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1b6539a91625..c6077eba3d3c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4598,7 +4598,9 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 	 */
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
-	i915_gem_init_userptr(dev_priv);
+	ret = i915_gem_init_userptr(dev_priv);
+	if (ret)
+		goto out_unlock;
 
 	ret = i915_gem_init_ggtt(dev_priv);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0f141a5800f2..0c4fa7fc94aa 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1386,6 +1386,9 @@ static int eb_relocate_slow(struct i915_execbuffer *eb)
 		goto out;
 	}
 
+	/* A frequent cause for EAGAIN are currently unavailable client pages */
+	flush_workqueue(eb->i915->mm.userptr_wq);
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret) {
 		mutex_lock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 58ccf8b8ca1c..66b09163bfba 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -378,7 +378,7 @@ __i915_mm_struct_free(struct kref *kref)
 	mutex_unlock(&mm->i915->mm_lock);
 
 	INIT_WORK(&mm->work, __i915_mm_struct_free__worker);
-	schedule_work(&mm->work);
+	queue_work(mm->i915->mm.userptr_wq, &mm->work);
 }
 
 static void
@@ -598,7 +598,7 @@ __i915_gem_userptr_get_pages_schedule(struct drm_i915_gem_object *obj)
 	get_task_struct(work->task);
 
 	INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
-	schedule_work(&work->work);
+	queue_work(to_i915(obj->base.dev)->mm.userptr_wq, &work->work);
 
 	return ERR_PTR(-EAGAIN);
 }
@@ -828,8 +828,20 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
 	return 0;
 }
 
-void i915_gem_init_userptr(struct drm_i915_private *dev_priv)
+int i915_gem_init_userptr(struct drm_i915_private *dev_priv)
 {
 	mutex_init(&dev_priv->mm_lock);
 	hash_init(dev_priv->mm_structs);
+
+	dev_priv->mm.userptr_wq =
+		alloc_workqueue("i915-userptr-acquire", WQ_HIGHPRI, 0);
+	if (!dev_priv->mm.userptr_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void i915_gem_cleanup_userptr(struct drm_i915_private *dev_priv)
+{
+	destroy_workqueue(dev_priv->mm.userptr_wq);
 }
-- 
2.11.0


* [PATCH 12/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (10 preceding siblings ...)
  2017-03-16 13:20 ` [PATCH 11/15] drm/i915: Wait upon userptr get-user-pages within execbuffer Chris Wilson
@ 2017-03-16 13:20 ` Chris Wilson
  2017-03-17 11:17   ` Joonas Lahtinen
  2017-03-16 13:20 ` [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch Chris Wilson
                   ` (6 subsequent siblings)
  18 siblings, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:20 UTC (permalink / raw)
  To: intel-gfx

The only time we need to emit a flush inside request emission is after
an execbuffer, for which we can use the full __i915_add_request(). All
other instances want the simpler i915_add_request() without flushing, so
remove the useless helper.
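
For illustration, the only two call patterns left after this change are
the plain helper and the explicit execbuf form (variable name rq
assumed):

        i915_add_request(rq);                   /* common case, no flush */
        __i915_add_request(rq, true);           /* execbuf tail, flush caches */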

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gvt/scheduler.c    | 2 +-
 drivers/gpu/drm/i915/i915_gem_context.c | 2 +-
 drivers/gpu/drm/i915/i915_gem_request.h | 2 --
 drivers/gpu/drm/i915/intel_display.c    | 4 ++--
 drivers/gpu/drm/i915/intel_overlay.c    | 8 ++++----
 drivers/gpu/drm/i915/intel_pm.c         | 2 +-
 6 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c
index d3a56c949025..cfd9f7a38c27 100644
--- a/drivers/gpu/drm/i915/gvt/scheduler.c
+++ b/drivers/gpu/drm/i915/gvt/scheduler.c
@@ -226,7 +226,7 @@ static int dispatch_workload(struct intel_vgpu_workload *workload)
 		workload->status = ret;
 
 	if (!IS_ERR_OR_NULL(rq))
-		i915_add_request_no_flush(rq);
+		i915_add_request(rq);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 9a9a8afff1e9..184c077ee9bc 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1007,7 +1007,7 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *dev_priv)
 		}
 
 		ret = i915_switch_context(req);
-		i915_add_request_no_flush(req);
+		i915_add_request(req);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 4eb642960393..4ccab5affd3c 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -278,8 +278,6 @@ int i915_gem_request_await_dma_fence(struct drm_i915_gem_request *req,
 
 void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 #define i915_add_request(req) \
-	__i915_add_request(req, true)
-#define i915_add_request_no_flush(req) \
 	__i915_add_request(req, false)
 
 void __i915_gem_request_submit(struct drm_i915_gem_request *request);
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 4b73513b46c1..fcfeb92a77d0 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -10668,7 +10668,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 		intel_mark_page_flip_active(intel_crtc, work);
 
 		work->flip_queued_req = i915_gem_request_get(request);
-		i915_add_request_no_flush(request);
+		i915_add_request(request);
 	}
 
 	i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
@@ -10684,7 +10684,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	return 0;
 
 cleanup_request:
-	i915_add_request_no_flush(request);
+	i915_add_request(request);
 cleanup_unpin:
 	to_intel_plane_state(primary->state)->vma = work->old_vma;
 	intel_unpin_fb_vma(vma);
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index 5ef9f5bfb92c..2e0c56ed22bb 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -278,7 +278,7 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 
 	cs = intel_ring_begin(req, 4);
 	if (IS_ERR(cs)) {
-		i915_add_request_no_flush(req);
+		i915_add_request(req);
 		return PTR_ERR(cs);
 	}
 
@@ -343,7 +343,7 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 
 	cs = intel_ring_begin(req, 2);
 	if (IS_ERR(cs)) {
-		i915_add_request_no_flush(req);
+		i915_add_request(req);
 		return PTR_ERR(cs);
 	}
 
@@ -419,7 +419,7 @@ static int intel_overlay_off(struct intel_overlay *overlay)
 
 	cs = intel_ring_begin(req, 6);
 	if (IS_ERR(cs)) {
-		i915_add_request_no_flush(req);
+		i915_add_request(req);
 		return PTR_ERR(cs);
 	}
 
@@ -477,7 +477,7 @@ static int intel_overlay_release_old_vid(struct intel_overlay *overlay)
 
 		cs = intel_ring_begin(req, 2);
 		if (IS_ERR(cs)) {
-			i915_add_request_no_flush(req);
+			i915_add_request(req);
 			return PTR_ERR(cs);
 		}
 
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 934a8c00073a..d55bf882d1aa 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -7086,7 +7086,7 @@ static void __intel_autoenable_gt_powersave(struct work_struct *work)
 		rcs->init_context(req);
 
 	/* Mark the device busy, calling intel_enable_gt_powersave() */
-	i915_add_request_no_flush(req);
+	i915_add_request(req);
 
 unlock:
 	mutex_unlock(&dev_priv->drm.struct_mutex);
-- 
2.11.0


* [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (11 preceding siblings ...)
  2017-03-16 13:20 ` [PATCH 12/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper Chris Wilson
@ 2017-03-16 13:20 ` Chris Wilson
  2017-03-17 11:15   ` Joonas Lahtinen
  2017-03-16 13:20 ` [PATCH 14/15] drm/i915: Async GPU relocation processing Chris Wilson
                   ` (5 subsequent siblings)
  18 siblings, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:20 UTC (permalink / raw)
  To: intel-gfx

Currently, the last object in the execlist is always the batch.
However, when building the batch buffer we often know the batch object
first, and if we can use the first slot in the execlist we can emit
relocation instructions relative to it immediately, avoiding a separate
pass to adjust the relocations to point to the last execlist slot.
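
A hypothetical userspace sketch (the helper name is invented, "execbuf"
stands for an already populated struct drm_i915_gem_execbuffer2, and
error handling is omitted; none of this is part of the patch): query
the new parameter and, if available, pass the batch as execobject[0].

#include <xf86drm.h>
#include <drm/i915_drm.h>

static bool has_exec_batch_first(int fd)
{
        int value = 0;
        struct drm_i915_getparam gp = {
                .param = I915_PARAM_HAS_EXEC_BATCH_FIRST,
                .value = &value,
        };

        return drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0 && value;
}

        /* while building the submission */
        if (has_exec_batch_first(fd))
                execbuf.flags |= I915_EXEC_BATCH_FIRST; /* objects[0] is the batch */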

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c            | 1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 5 ++++-
 include/uapi/drm/i915_drm.h                | 8 +++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index fc51094189c0..e63fe3f704d6 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -351,6 +351,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_ASYNC:
 	case I915_PARAM_HAS_EXEC_FENCE:
 	case I915_PARAM_HAS_EXEC_CAPTURE:
+	case I915_PARAM_HAS_EXEC_BATCH_FIRST:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0c4fa7fc94aa..d39ea2ba20ff 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -559,7 +559,10 @@ ht_head(const struct i915_gem_context *ctx, u32 handle)
 
 static int eb_batch_index(const struct i915_execbuffer *eb)
 {
-	return eb->buffer_count - 1;
+	if (eb->args->flags & I915_EXEC_BATCH_FIRST)
+		return 0;
+	else
+		return eb->buffer_count - 1;
 }
 
 static int eb_select_context(struct i915_execbuffer *eb)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 176c5a70300b..a3ec2f355c09 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -418,6 +418,11 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_CAPTURE	 45
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports supplying the batch buffer
+ * as the first execobject as opposed to the last. See I915_EXEC_BATCH_FIRST.
+ */
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST	 46
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -902,7 +907,8 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_FENCE_OUT		(1<<17)
 
-#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
+#define I915_EXEC_BATCH_FIRST		(1<<18)
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_BATCH_FIRST<<1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
2.11.0


* [PATCH 14/15] drm/i915: Async GPU relocation processing
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (12 preceding siblings ...)
  2017-03-16 13:20 ` [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch Chris Wilson
@ 2017-03-16 13:20 ` Chris Wilson
  2017-03-17 11:11   ` [PATCH v2] " Chris Wilson
  2017-03-17 11:15   ` [PATCH 14/15] " Joonas Lahtinen
  2017-03-16 13:20 ` [PATCH 15/15] drm/i915/scheduler: Support user-defined priorities Chris Wilson
                   ` (4 subsequent siblings)
  18 siblings, 2 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:20 UTC (permalink / raw)
  To: intel-gfx

If the user requires patching of their batch or auxiliary buffers, we
currently make the alterations on the CPU. If they are active on the GPU
at the time, we wait under the struct_mutex for them to finish executing
before we rewrite the contents. This happens when shared relocation trees
are used between different contexts with separate address spaces (and the
buffers then have different addresses in each): the 3D state will need
to be adjusted between execution on each context. However, we don't need
to use the CPU to do the relocation patching, as we could queue commands
to the GPU to perform it and use fences to serialise the operation with
the current activity and future - so the operation on the GPU appears
just as atomic as performing it immediately. Performing the relocation
rewrites on the GPU is not free: in terms of pure throughput, the number
of relocations/s is about halved - but more importantly, so is the time
under the struct_mutex.
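
In (simplified) terms, the new decision amounts to the sketch below; the
helper is illustrative, and the real check in relocate_entry() below
additionally requires that the relocation cache does not already hold a
CPU mapping:

static bool want_gpu_reloc(const struct drm_i915_gem_object *obj)
{
        /* Only pay the GPU round trip if the buffer is still busy;
         * idle buffers keep the cheap CPU write path.
         */
        return DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
               !reservation_object_test_signaled_rcu(obj->resv, true);
}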

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c            |   1 -
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 217 +++++++++++++++++++++++++++--
 2 files changed, 208 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c6077eba3d3c..15ca0dea0a13 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4192,7 +4192,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
 		list_for_each_entry_safe(vma, vn,
 					 &obj->vma_list, obj_link) {
-			GEM_BUG_ON(!i915_vma_is_ggtt(vma));
 			GEM_BUG_ON(i915_vma_is_active(vma));
 			vma->flags &= ~I915_VMA_PIN_MASK;
 			i915_vma_close(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index d39ea2ba20ff..89d2218ae9ea 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -40,7 +40,12 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+	FORCE_CPU_RELOC = 1,
+	FORCE_GTT_RELOC,
+	FORCE_GPU_RELOC,
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
+};
 
 #define  __EXEC_OBJECT_HAS_PIN		BIT(31)
 #define  __EXEC_OBJECT_HAS_FENCE	BIT(30)
@@ -187,9 +192,14 @@ struct i915_execbuffer {
 		struct drm_mm_node node;
 		unsigned long vaddr;
 		unsigned int page;
-		bool use_64bit_reloc;
-		bool has_llc;
-		bool has_fence;
+		unsigned int gen;
+		bool use_64bit_reloc : 1;
+		bool has_llc : 1;
+		bool has_fence : 1;
+
+		struct drm_i915_gem_request *rq;
+		u32 *rq_cmd;
+		unsigned int rq_size;
 	} reloc_cache;
 	u64 invalid_flags;
 	u32 context_flags;
@@ -434,8 +444,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 	if (!i915_gem_object_has_struct_page(obj))
 		return false;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+		return true;
+
+	if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+		return false;
 
 	return (cache->has_llc ||
 		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
@@ -807,10 +820,13 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->gen = INTEL_GEN(i915);
 	cache->has_llc = HAS_LLC(i915);
-	cache->has_fence = INTEL_GEN(i915) < 4;
-	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+	cache->has_fence = cache->gen < 4;
+	cache->use_64bit_reloc = cache->gen >= 8;
 	cache->node.allocated = false;
+	cache->rq = NULL;
+	cache->rq_size = 0;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -832,10 +848,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
 	return &i915->ggtt;
 }
 
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+	GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+	cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+	i915_gem_object_unpin_map(cache->rq->batch->obj);
+	i915_gem_chipset_flush(cache->rq->i915);
+
+	__i915_add_request(cache->rq, true);
+	cache->rq = NULL;
+}
+
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
+	if (cache->rq)
+		reloc_gpu_flush(cache);
+
 	if (!cache->vaddr)
 		return;
 
@@ -1000,6 +1030,112 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 		*addr = value;
 }
 
+static u32 *reloc_gpu(struct i915_execbuffer *eb,
+		      struct i915_vma *vma,
+		      unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	u32 *cmd;
+
+	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+		reloc_gpu_flush(cache);
+
+	if (!cache->rq) {
+		struct drm_i915_gem_object *obj;
+		struct drm_i915_gem_request *rq;
+		struct i915_vma *batch;
+		int err;
+
+		GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);
+
+		obj = i915_gem_batch_pool_get(&eb->engine->batch_pool,
+					      PAGE_SIZE);
+		if (IS_ERR(obj))
+			return ERR_CAST(obj);
+
+		cmd = i915_gem_object_pin_map(obj,
+					      cache->has_llc ? I915_MAP_WB : I915_MAP_WC);
+		i915_gem_object_unpin_pages(obj);
+		if (IS_ERR(cmd))
+			return ERR_CAST(cmd);
+
+		err = i915_gem_object_set_to_gtt_domain(obj, false);
+		if (err) {
+err_unmap:
+			i915_gem_object_unpin_map(obj);
+			return ERR_PTR(err);
+		}
+
+		batch = i915_vma_instance(obj, vma->vm, NULL);
+		if (IS_ERR(batch)) {
+			err = PTR_ERR(batch);
+			goto err_unmap;
+		}
+
+		err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+		if (err)
+			goto err_unmap;
+
+		rq = i915_gem_request_alloc(eb->engine, eb->ctx);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+err_unpin:
+			i915_vma_unpin(batch);
+			goto err_unmap;
+		}
+
+		err = i915_gem_request_await_object(rq,
+						    vma->obj,
+						    EXEC_OBJECT_WRITE);
+		if (err) {
+err_request:
+			i915_add_request(rq);
+			goto err_unpin;
+		}
+
+		err = eb->engine->emit_flush(rq, EMIT_INVALIDATE);
+		if (err)
+			goto err_request;
+
+		err = i915_switch_context(rq);
+		if (err)
+			goto err_request;
+
+		err = eb->engine->emit_bb_start(rq,
+						batch->node.start, PAGE_SIZE,
+						cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+		if (err)
+			goto err_request;
+
+		GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv,
+								 true));
+		i915_vma_move_to_active(batch, rq, 0);
+		reservation_object_lock(obj->resv, NULL);
+		reservation_object_add_excl_fence(obj->resv, &rq->fence);
+		reservation_object_unlock(obj->resv);
+		i915_vma_unpin(batch);
+
+		i915_vma_move_to_active(vma, rq, true);
+		reservation_object_lock(vma->obj->resv, NULL);
+		reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
+		reservation_object_unlock(vma->obj->resv);
+
+		vma->obj->base.write_domain = 0;
+		vma->obj->base.read_domains = I915_GEM_GPU_DOMAINS;
+
+		rq->batch = batch;
+
+		cache->rq = rq;
+		cache->rq_cmd = cmd;
+		cache->rq_size = 0;
+	}
+
+	cmd = cache->rq_cmd + cache->rq_size;
+	cache->rq_size += len;
+
+	return cmd;
+}
+
 static u64
 relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
@@ -1012,6 +1148,67 @@ relocate_entry(struct i915_vma *vma,
 	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
+	if (!eb->reloc_cache.vaddr &&
+	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
+	     !reservation_object_test_signaled_rcu(obj->resv, true))) {
+		const unsigned int gen = eb->reloc_cache.gen;
+		unsigned int len;
+		u32 *batch;
+		u64 addr;
+
+		if (wide)
+			len = offset & 7 ? 8 : 5;
+		else if (gen >= 4)
+			len = 4;
+		else if (gen >= 3)
+			len = 3;
+		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
+			goto repeat;
+
+		batch = reloc_gpu(eb, vma, len);
+		if (IS_ERR(batch))
+			goto repeat;
+
+		addr = gen8_canonical_addr(vma->node.start + offset);
+		if (wide) {
+			if (offset & 7) {
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+
+				addr = gen8_canonical_addr(addr + 4);
+
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = upper_32_bits(target_offset);
+			} else {
+				*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+				*batch++ = upper_32_bits(target_offset);
+			}
+		} else if (gen >= 6) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else if (gen >= 4) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else {
+			*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		}
+
+		goto out;
+	}
+
 repeat:
 	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
@@ -1028,6 +1225,7 @@ relocate_entry(struct i915_vma *vma,
 		goto repeat;
 	}
 
+out:
 	return gen8_canonical_addr(target->node.start) | 1;
 }
 
@@ -1088,7 +1286,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
+	if (!DBG_FORCE_RELOC &&
+	    gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
-- 
2.11.0


* [PATCH 15/15] drm/i915/scheduler: Support user-defined priorities
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (13 preceding siblings ...)
  2017-03-16 13:20 ` [PATCH 14/15] drm/i915: Async GPU relocation processing Chris Wilson
@ 2017-03-16 13:20 ` Chris Wilson
  2017-03-16 13:57 ` ✗ Fi.CI.BAT: warning for series starting with [01/15] drm/i915: Copy user requested buffers into the error state Patchwork
                   ` (3 subsequent siblings)
  18 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-16 13:20 UTC (permalink / raw)
  To: intel-gfx

Use a priority stored in the context as the initial value when
submitting a request. This allows us to change the default priority on a
per-context basis, so that different contexts can be favoured with GPU
time at the expense of lower-importance work. The user can adjust the
context's priority via I915_CONTEXT_PARAM_PRIORITY, with more positive
values being higher priority (they will be serviced earlier, after their
dependencies have been resolved). Any prerequisite work for an execbuf
will have its priority raised to match the new request as required.

Normal users can specify any value in the range of -1023 to 0 [default],
i.e. they can reduce the priority of their workloads (and temporarily
boost it back to normal if so desired).

Privileged users can specify any value in the range of -1023 to 1023
[default is 0], i.e. they can raise their priority above all others and
so potentially starve the system.

Note that the existing schedulers are neither fair nor load balancing;
execution is strictly by priority on a first-come, first-served basis,
and the driver may choose to boost some requests above the range
available to users.

This priority was originally based around nice(2), but evolved to allow
clients to adjust their priority within a small range, and allow for a
privileged high priority range.

For example, this can be used to implement EGL_IMG_context_priority
https://www.khronos.org/registry/egl/extensions/IMG/EGL_IMG_context_priority.txt

	EGL_CONTEXT_PRIORITY_LEVEL_IMG determines the priority level of
        the context to be created. This attribute is a hint, as an
        implementation may not support multiple contexts at some
        priority levels and system policy may limit access to high
        priority contexts to appropriate system privilege level. The
        default value for EGL_CONTEXT_PRIORITY_LEVEL_IMG is
        EGL_CONTEXT_PRIORITY_MEDIUM_IMG."

so we can map

	PRIORITY_HIGH -> 1023 [privileged, will failback to 0]
	PRIORITY_MED -> 0 [default]
	PRIORITY_LOW -> -1023

They also map onto the priorities used by VkQueue (and a VkQueue is
essentially a timeline, our i915_gem_context under full-ppgtt).
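
For example, a minimal userspace sketch for lowering a context's
priority (assumes libdrm's drmIoctl(), an already created context id in
ctx_id, and omits error handling; not part of this patch):

        struct drm_i915_gem_context_param p = {
                .ctx_id = ctx_id,
                .size = 0, /* must be zero for this param */
                .param = I915_CONTEXT_PARAM_PRIORITY,
                .value = -512, /* within the unprivileged -1023..0 range */
        };

        drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);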

Testcase: igt/gem_exec_schedule
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c | 20 ++++++++++++++++++++
 include/uapi/drm/i915_drm.h             |  3 +++
 2 files changed, 23 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 184c077ee9bc..64eeb03df061 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1135,6 +1135,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_BANNABLE:
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
+	case I915_CONTEXT_PARAM_PRIORITY:
+		args->value = ctx->priority;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1192,6 +1195,23 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 		else
 			i915_gem_context_clear_bannable(ctx);
 		break;
+
+	case I915_CONTEXT_PARAM_PRIORITY:
+		{
+			int priority = args->value;
+
+			if (args->size)
+				ret = -EINVAL;
+			else if (priority >= I915_PRIORITY_MAX ||
+				 priority <= I915_PRIORITY_MIN)
+				ret = -EINVAL;
+			else if (priority > 0 && !capable(CAP_SYS_ADMIN))
+				ret = -EPERM;
+			else
+				ctx->priority = priority;
+		}
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index a3ec2f355c09..bb9617233e28 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -395,6 +395,8 @@ typedef struct drm_i915_irq_wait {
 
 /* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
  * priorities and the driver will attempt to execute batches in priority order.
+ * The initial priority for each batch is supplied by the context and is
+ * controlled via I915_CONTEXT_PARAM_PRIORITY.
  */
 #define I915_PARAM_HAS_SCHEDULER	 41
 #define I915_PARAM_HUC_STATUS		 42
@@ -1308,6 +1310,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
 #define I915_CONTEXT_PARAM_BANNABLE	0x5
+#define I915_CONTEXT_PARAM_PRIORITY	0x6
 	__u64 value;
 };
 
-- 
2.11.0


* ✗ Fi.CI.BAT: warning for series starting with [01/15] drm/i915: Copy user requested buffers into the error state
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (14 preceding siblings ...)
  2017-03-16 13:20 ` [PATCH 15/15] drm/i915/scheduler: Support user-defined priorities Chris Wilson
@ 2017-03-16 13:57 ` Patchwork
  2017-03-17 11:28 ` ✓ Fi.CI.BAT: success for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev2) Patchwork
                   ` (2 subsequent siblings)
  18 siblings, 0 replies; 39+ messages in thread
From: Patchwork @ 2017-03-16 13:57 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/15] drm/i915: Copy user requested buffers into the error state
URL   : https://patchwork.freedesktop.org/series/21377/
State : warning

== Summary ==

Series 21377v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/21377/revisions/1/mbox/

Test gem_exec_fence:
        Subgroup await-hang-default:
                pass       -> INCOMPLETE (fi-ilk-650) fdo#100144
Test gem_exec_flush:
        Subgroup basic-batch-kernel-default-uc:
                fail       -> PASS       (fi-snb-2600) fdo#100007
Test gem_exec_suspend:
        Subgroup basic-s4-devices:
                pass       -> DMESG-WARN (fi-snb-2600)
Test kms_pipe_crc_basic:
        Subgroup nonblocking-crc-pipe-b-frame-sequence:
                fail       -> PASS       (fi-skl-6770hq)

fdo#100144 https://bugs.freedesktop.org/show_bug.cgi?id=100144
fdo#100007 https://bugs.freedesktop.org/show_bug.cgi?id=100007

fi-bdw-5557u     total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  time: 471s
fi-bsw-n3050     total:278  pass:239  dwarn:0   dfail:0   fail:0   skip:39  time: 577s
fi-bxt-j4205     total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time: 551s
fi-bxt-t5700     total:278  pass:258  dwarn:0   dfail:0   fail:0   skip:20  time: 567s
fi-byt-j1900     total:278  pass:251  dwarn:0   dfail:0   fail:0   skip:27  time: 498s
fi-byt-n2820     total:278  pass:247  dwarn:0   dfail:0   fail:0   skip:31  time: 498s
fi-hsw-4770      total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time: 431s
fi-hsw-4770r     total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time: 431s
fi-ilk-650       total:48   pass:27   dwarn:0   dfail:0   fail:0   skip:20  time: 0s
fi-ivb-3520m     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 520s
fi-ivb-3770      total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 503s
fi-kbl-7500u     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 499s
fi-skl-6260u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time: 508s
fi-skl-6700hq    total:278  pass:261  dwarn:0   dfail:0   fail:0   skip:17  time: 613s
fi-skl-6700k     total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  time: 502s
fi-skl-6770hq    total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time: 550s
fi-snb-2520m     total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  time: 545s
fi-snb-2600      total:278  pass:248  dwarn:1   dfail:0   fail:0   skip:29  time: 428s

b0e9b91c4e54bb54ff451fb2b221fb07e4eec8e0 drm-tip: 2017y-03m-16d-12h-52m-50s UTC integration manifest
fb70d29 drm/i915/scheduler: Support user-defined priorities
8a79a9b drm/i915: Async GPU relocation processing
00b8804 drm/i915: Allow execbuffer to use the first object as the batch
3964dc3 drm/i915: Remove superfluous i915_add_request_no_flush() helper
7ceb9ce drm/i915: Wait upon userptr get-user-pages within execbuffer
fa3a316 drm/i915: First try the previous execbuffer location
64afc81 drm/i915: Eliminate lots of iterations over the execobjects array
4e46078 drm/i915: Pass vma to relocate entry
69b0cb0 drm/i915: Store a direct lookup from object handle to vma
6ed53b2 drm/i915: Stop using obj->obj_exec_link outside of execbuf
a633c82 drm/i915: Split vma exec_link/evict_link
87bfd88 drm/i915: Use vma->exec_entry as our double-entry placeholder
c71fcc8 drm/i915: Amalgamate execbuffer parameter structures
e2f40c9 drm/i915: Retire an active batch pool object rather than allocate new
f4edbb8 drm/i915: Copy user requested buffers into the error state

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4201/

* Re: [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new
  2017-03-16 13:19 ` [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new Chris Wilson
@ 2017-03-17  8:52   ` Joonas Lahtinen
  2017-03-17  9:02     ` Chris Wilson
  0 siblings, 1 reply; 39+ messages in thread
From: Joonas Lahtinen @ 2017-03-17  8:52 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On to, 2017-03-16 at 13:19 +0000, Chris Wilson wrote:
> Since obj->active_count is only updated upon retirement, if we see an
> active object in the batch pool, double check that is still active
> before deciding to allocate a new object.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Any specific testcase for this?

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* Re: [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new
  2017-03-17  8:52   ` Joonas Lahtinen
@ 2017-03-17  9:02     ` Chris Wilson
  0 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-17  9:02 UTC (permalink / raw)
  To: Joonas Lahtinen; +Cc: intel-gfx

On Fri, Mar 17, 2017 at 10:52:16AM +0200, Joonas Lahtinen wrote:
> On to, 2017-03-16 at 13:19 +0000, Chris Wilson wrote:
> > Since obj->active_count is only updated upon retirement, if we see an
> > active object in the batch pool, double check that is still active
> > before deciding to allocate a new object.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Any specific testcase for this?

gem_exec_parse/allocations was written to show a performance cliff wrt
an older implementation of the batch_pool cache.
gem_exec_parallel + async patching puts this nicely to use.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 03/15] drm/i915: Amalgamate execbuffer parameter structures
  2017-03-16 13:19 ` [PATCH 03/15] drm/i915: Amalgamate execbuffer parameter structures Chris Wilson
@ 2017-03-17 10:04   ` Joonas Lahtinen
  0 siblings, 0 replies; 39+ messages in thread
From: Joonas Lahtinen @ 2017-03-17 10:04 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On to, 2017-03-16 at 13:19 +0000, Chris Wilson wrote:
> @@ -1133,22 +1110,22 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>  		}
>  
>  		ret = i915_gem_request_await_object
> -			(req, obj, obj->base.pending_write_domain);
> +			(eb->request, obj, obj->base.pending_write_domain);

Bump "(eb->request" to first line.

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* [PATCH v2] drm/i915: Async GPU relocation processing
  2017-03-16 13:20 ` [PATCH 14/15] drm/i915: Async GPU relocation processing Chris Wilson
@ 2017-03-17 11:11   ` Chris Wilson
  2017-03-17 11:15   ` [PATCH 14/15] " Joonas Lahtinen
  1 sibling, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-17 11:11 UTC (permalink / raw)
  To: intel-gfx

If the user requires patching of their batch or auxiliary buffers, we
currently make the alterations on the CPU. If they are active on the GPU
at the time, we wait under the struct_mutex for them to finish executing
before we rewrite the contents. This happens when shared relocation trees
are used between different contexts with separate address spaces (and the
buffers then have different addresses in each): the 3D state will need
to be adjusted between execution on each context. However, we don't need
to use the CPU to do the relocation patching, as we could queue commands
to the GPU to perform it and use fences to serialise the operation with
the current activity and future - so the operation on the GPU appears
just as atomic as performing it immediately. Performing the relocation
rewrites on the GPU is not free, in terms of pure throughput, the number
of relocations/s is about halved - but more importantly so is the time
under the struct_mutex.

v2: Break out the request/batch allocation for clearer error flow.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c            |   1 -
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 233 +++++++++++++++++++++++++++--
 2 files changed, 224 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 90b7c34ef550..ff4029bb6ad4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4248,7 +4248,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
 		list_for_each_entry_safe(vma, vn,
 					 &obj->vma_list, obj_link) {
-			GEM_BUG_ON(!i915_vma_is_ggtt(vma));
 			GEM_BUG_ON(i915_vma_is_active(vma));
 			vma->flags &= ~I915_VMA_PIN_MASK;
 			i915_vma_close(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index d39ea2ba20ff..a28d08f5b8b4 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -40,7 +40,12 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+	FORCE_CPU_RELOC = 1,
+	FORCE_GTT_RELOC,
+	FORCE_GPU_RELOC,
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
+};
 
 #define  __EXEC_OBJECT_HAS_PIN		BIT(31)
 #define  __EXEC_OBJECT_HAS_FENCE	BIT(30)
@@ -187,9 +192,14 @@ struct i915_execbuffer {
 		struct drm_mm_node node;
 		unsigned long vaddr;
 		unsigned int page;
-		bool use_64bit_reloc;
-		bool has_llc;
-		bool has_fence;
+		unsigned int gen;
+		bool use_64bit_reloc : 1;
+		bool has_llc : 1;
+		bool has_fence : 1;
+
+		struct drm_i915_gem_request *rq;
+		u32 *rq_cmd;
+		unsigned int rq_size;
 	} reloc_cache;
 	u64 invalid_flags;
 	u32 context_flags;
@@ -434,8 +444,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 	if (!i915_gem_object_has_struct_page(obj))
 		return false;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+		return true;
+
+	if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+		return false;
 
 	return (cache->has_llc ||
 		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
@@ -807,10 +820,13 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->gen = INTEL_GEN(i915);
 	cache->has_llc = HAS_LLC(i915);
-	cache->has_fence = INTEL_GEN(i915) < 4;
-	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+	cache->has_fence = cache->gen < 4;
+	cache->use_64bit_reloc = cache->gen >= 8;
 	cache->node.allocated = false;
+	cache->rq = NULL;
+	cache->rq_size = 0;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -832,10 +848,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
 	return &i915->ggtt;
 }
 
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+	GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+	cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+	i915_gem_object_unpin_map(cache->rq->batch->obj);
+	i915_gem_chipset_flush(cache->rq->i915);
+
+	__i915_add_request(cache->rq, true);
+	cache->rq = NULL;
+}
+
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
+	if (cache->rq)
+		reloc_gpu_flush(cache);
+
 	if (!cache->vaddr)
 		return;
 
@@ -1000,6 +1030,128 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 		*addr = value;
 }
 
+static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
+			     struct i915_vma *vma,
+			     unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+
+	struct drm_i915_gem_object *obj;
+	struct drm_i915_gem_request *rq;
+	struct i915_vma *batch;
+	u32 *cmd;
+	int err;
+
+	GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);
+
+	obj = i915_gem_batch_pool_get(&eb->engine->batch_pool,
+				      PAGE_SIZE);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	cmd = i915_gem_object_pin_map(obj,
+				      cache->has_llc ? I915_MAP_WB : I915_MAP_WC);
+	i915_gem_object_unpin_pages(obj);
+	if (IS_ERR(cmd))
+		return PTR_ERR(cmd);
+
+	err = i915_gem_object_set_to_gtt_domain(obj, false);
+	if (err)
+		goto err_unmap;
+
+	batch = i915_vma_instance(obj, vma->vm, NULL);
+	if (IS_ERR(batch)) {
+		err = PTR_ERR(batch);
+		goto err_unmap;
+	}
+
+	err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+	if (err)
+		goto err_unmap;
+
+	rq = i915_gem_request_alloc(eb->engine, eb->ctx);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_unpin;
+	}
+
+	err = i915_gem_request_await_object(rq,
+					    vma->obj,
+					    EXEC_OBJECT_WRITE);
+	if (err)
+		goto err_request;
+
+	err = eb->engine->emit_flush(rq, EMIT_INVALIDATE);
+	if (err)
+		goto err_request;
+
+	err = i915_switch_context(rq);
+	if (err)
+		goto err_request;
+
+	err = eb->engine->emit_bb_start(rq,
+					batch->node.start, PAGE_SIZE,
+					cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+	if (err)
+		goto err_request;
+
+	GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv,
+							 true));
+	i915_vma_move_to_active(batch, rq, 0);
+	reservation_object_lock(obj->resv, NULL);
+	reservation_object_add_excl_fence(obj->resv, &rq->fence);
+	reservation_object_unlock(obj->resv);
+	i915_vma_unpin(batch);
+
+	i915_vma_move_to_active(vma, rq, true);
+	reservation_object_lock(vma->obj->resv, NULL);
+	reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
+	reservation_object_unlock(vma->obj->resv);
+
+	vma->obj->base.write_domain = 0;
+	vma->obj->base.read_domains = I915_GEM_GPU_DOMAINS;
+
+	rq->batch = batch;
+
+	cache->rq = rq;
+	cache->rq_cmd = cmd;
+	cache->rq_size = 0;
+
+	return 0;
+
+err_request:
+	i915_add_request(rq);
+err_unpin:
+	i915_vma_unpin(batch);
+err_unmap:
+	i915_gem_object_unpin_map(obj);
+	return err;
+}
+
+static u32 *reloc_gpu(struct i915_execbuffer *eb,
+		      struct i915_vma *vma,
+		      unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	u32 *cmd;
+
+	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+		reloc_gpu_flush(cache);
+
+	if (unlikely(!cache->rq)) {
+		int err;
+
+		err = __reloc_gpu_alloc(eb, vma, len);
+		if (unlikely(err))
+			return ERR_PTR(err);
+	}
+
+	cmd = cache->rq_cmd + cache->rq_size;
+	cache->rq_size += len;
+
+	return cmd;
+}
+
 static u64
 relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
@@ -1012,6 +1164,67 @@ relocate_entry(struct i915_vma *vma,
 	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
+	if (!eb->reloc_cache.vaddr &&
+	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
+	     !reservation_object_test_signaled_rcu(obj->resv, true))) {
+		const unsigned int gen = eb->reloc_cache.gen;
+		unsigned int len;
+		u32 *batch;
+		u64 addr;
+
+		if (wide)
+			len = offset & 7 ? 8 : 5;
+		else if (gen >= 4)
+			len = 4;
+		else if (gen >= 3)
+			len = 3;
+		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
+			goto repeat;
+
+		batch = reloc_gpu(eb, vma, len);
+		if (IS_ERR(batch))
+			goto repeat;
+
+		addr = gen8_canonical_addr(vma->node.start + offset);
+		if (wide) {
+			if (offset & 7) {
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+
+				addr = gen8_canonical_addr(addr + 4);
+
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = upper_32_bits(target_offset);
+			} else {
+				*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+				*batch++ = upper_32_bits(target_offset);
+			}
+		} else if (gen >= 6) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else if (gen >= 4) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else {
+			*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		}
+
+		goto out;
+	}
+
 repeat:
 	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
@@ -1028,6 +1241,7 @@ relocate_entry(struct i915_vma *vma,
 		goto repeat;
 	}
 
+out:
 	return gen8_canonical_addr(target->node.start) | 1;
 }
 
@@ -1088,7 +1302,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
+	if (!DBG_FORCE_RELOC &&
+	    gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
-- 
2.11.0


* Re: [PATCH 14/15] drm/i915: Async GPU relocation processing
  2017-03-16 13:20 ` [PATCH 14/15] drm/i915: Async GPU relocation processing Chris Wilson
  2017-03-17 11:11   ` [PATCH v2] " Chris Wilson
@ 2017-03-17 11:15   ` Joonas Lahtinen
  1 sibling, 0 replies; 39+ messages in thread
From: Joonas Lahtinen @ 2017-03-17 11:15 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On Thu, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
> If the user requires patching of their batch or auxiliary buffers, we
> currently make the alterations on the cpu. If they are active on the GPU
> at the time, we wait under the struct_mutex for them to finish executing
> before we rewrite the contents. This happens if shared relocation trees
> are used between different contexts with separate address space (and the
> buffers then have different addresses in each), the 3D state will need
> to be adjusted between execution on each context. However, we don't need
> to use the CPU to do the relocation patching, as we could queue commands
> to the GPU to perform it and use fences to serialise the operation with
> the current activity and future - so the operation on the GPU appears
> just as atomic as performing it immediately. Performing the relocation
> rewrites on the GPU is not free, in terms of pure throughput, the number
> of relocations/s is about halved - but more importantly so is the time
> under the struct_mutex.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

<SNIP>

>  	/* Must be a variable in the struct to allow GCC to unroll. */
> +	cache->gen = INTEL_GEN(i915);
>  	cache->has_llc = HAS_LLC(i915);
> -	cache->has_fence = INTEL_GEN(i915) < 4;
> -	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
> +	cache->has_fence = cache->gen < 4;
> +	cache->use_64bit_reloc = cache->gen >= 8;

Keep using HAS_64BIT_RELOC(i915)...
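
i.e. keep the existing macro rather than open-coding the gen check,
something like (sketch only):

        cache->gen = INTEL_GEN(i915);
        cache->has_llc = HAS_LLC(i915);
        cache->has_fence = cache->gen < 4;
        cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);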

> +static u32 *reloc_gpu(struct i915_execbuffer *eb,
> +		      struct i915_vma *vma,
> +		      unsigned int len)
> +{
> +	struct reloc_cache *cache = &eb->reloc_cache;
> +	u32 *cmd;
> +
> +	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))

There's no overflow chance here, so I'd write it as rq_size + len + 1.
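
i.e. something like (untested):

        if (cache->rq_size + len + 1 > PAGE_SIZE / sizeof(u32))
                reloc_gpu_flush(cache);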

> +		reloc_gpu_flush(cache);
> +
> +	if (!cache->rq) {
> +		struct drm_i915_gem_object *obj;
> +		struct drm_i915_gem_request *rq;
> +		struct i915_vma *batch;
> +		int err;
> +
> +		GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);

Use ==.
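
i.e. something like:

        GEM_BUG_ON(vma->obj->base.write_domain == I915_GEM_DOMAIN_CPU);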

I just close my eyes for the remainder of this function and expect v2 to
have a proper teardown :P

Also the vma / obj pair naming here varies from what those names usually
refer to, so I'd consider renaming one of them to lessen confusion.

> @@ -1012,6 +1148,67 @@ relocate_entry(struct i915_vma *vma,
>  	bool wide = eb->reloc_cache.use_64bit_reloc;
>  	void *vaddr;
>  
> +	if (!eb->reloc_cache.vaddr &&
> +	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
> +	     !reservation_object_test_signaled_rcu(obj->resv, true))) {
> +		const unsigned int gen = eb->reloc_cache.gen;
> +		unsigned int len;
> +		u32 *batch;
> +		u64 addr;
> +
> +		if (wide)
> +			len = offset & 7 ? 8 : 5;
> +		else if (gen >= 4)
> +			len = 4;
> +		else if (gen >= 3)
> +			len = 3;
> +		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
> +			goto repeat;
> +
> +		batch = reloc_gpu(eb, vma, len);
> +		if (IS_ERR(batch))
> +			goto repeat;
> +
> +		addr = gen8_canonical_addr(vma->node.start + offset);
> +		if (wide) {
> +			if (offset & 7) {
> +				*batch++ = MI_STORE_DWORD_IMM_GEN4;

This indent level calls for a new function. __relocate_entry_gpu
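
Roughly what I'm picturing, just lifting the block above as-is
(untested sketch; returning an error for the caller to fall back to the
CPU/GTT path is my own convention here, not taken from the patch):

static int
__relocate_entry_gpu(struct i915_execbuffer *eb,
                     struct i915_vma *vma,
                     u64 offset,
                     u64 target_offset)
{
        const unsigned int gen = eb->reloc_cache.gen;
        const bool wide = eb->reloc_cache.use_64bit_reloc;
        unsigned int len;
        u32 *batch;
        u64 addr;

        if (wide)
                len = offset & 7 ? 8 : 5;
        else if (gen >= 4)
                len = 4;
        else if (gen >= 3)
                len = 3;
        else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
                return -ENODEV;

        batch = reloc_gpu(eb, vma, len);
        if (IS_ERR(batch))
                return PTR_ERR(batch);

        addr = gen8_canonical_addr(vma->node.start + offset);
        if (wide) {
                if (offset & 7) {
                        *batch++ = MI_STORE_DWORD_IMM_GEN4;
                        *batch++ = lower_32_bits(addr);
                        *batch++ = upper_32_bits(addr);
                        *batch++ = lower_32_bits(target_offset);

                        addr = gen8_canonical_addr(addr + 4);

                        *batch++ = MI_STORE_DWORD_IMM_GEN4;
                        *batch++ = lower_32_bits(addr);
                        *batch++ = upper_32_bits(addr);
                        *batch++ = upper_32_bits(target_offset);
                } else {
                        *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
                        *batch++ = lower_32_bits(addr);
                        *batch++ = upper_32_bits(addr);
                        *batch++ = lower_32_bits(target_offset);
                        *batch++ = upper_32_bits(target_offset);
                }
        } else if (gen >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = addr;
                *batch++ = target_offset;
        } else if (gen >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
                *batch++ = 0;
                *batch++ = addr;
                *batch++ = target_offset;
        } else {
                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
                *batch++ = addr;
                *batch++ = target_offset;
        }

        return 0;
}

relocate_entry() then only keeps the fallback to the CPU/GTT path when
this returns an error.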

Couldn't we share some of this STORE IMM code we have around? I don't
want to crawl the specs every time :(

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* Re: [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch
  2017-03-16 13:20 ` [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch Chris Wilson
@ 2017-03-17 11:15   ` Joonas Lahtinen
  2017-07-07 10:17     ` Daniel Vetter
  0 siblings, 1 reply; 39+ messages in thread
From: Joonas Lahtinen @ 2017-03-17 11:15 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On Thu, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
> Currently, the last object in the execlist is the always the batch.
> However, when building the batch buffer we often know the batch object
> first and if we can use the first slot in the execlist we can emit
> relocation instructions relative to it immediately and avoid a separate
> pass to adjust the relocations to point to the last execlist slot.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* Re: [PATCH 12/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper
  2017-03-16 13:20 ` [PATCH 12/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper Chris Wilson
@ 2017-03-17 11:17   ` Joonas Lahtinen
  2017-03-17 13:06     ` Chris Wilson
  0 siblings, 1 reply; 39+ messages in thread
From: Joonas Lahtinen @ 2017-03-17 11:17 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On Thu, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
> The only time we need to emit a flush inside request emission is after
> an execbuffer, for which we can use the full __i915_add_request(). All
> other instances want the simpler i915_add_request() without flushing, so
> remove the useless helper.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* ✓ Fi.CI.BAT: success for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev2)
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (15 preceding siblings ...)
  2017-03-16 13:57 ` ✗ Fi.CI.BAT: warning for series starting with [01/15] drm/i915: Copy user requested buffers into the error state Patchwork
@ 2017-03-17 11:28 ` Patchwork
  2017-03-22  9:43 ` ✗ Fi.CI.BAT: failure for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev3) Patchwork
  2017-03-23 13:51 ` ✗ Fi.CI.BAT: failure for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev4) Patchwork
  18 siblings, 0 replies; 39+ messages in thread
From: Patchwork @ 2017-03-17 11:28 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev2)
URL   : https://patchwork.freedesktop.org/series/21377/
State : success

== Summary ==

Series 21377v2 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/21377/revisions/2/mbox/

Test gem_exec_fence:
        Subgroup await-hang-default:
                pass       -> INCOMPLETE (fi-ilk-650) fdo#100144
Test gem_exec_flush:
        Subgroup basic-batch-kernel-default-uc:
                pass       -> FAIL       (fi-snb-2600) fdo#100007

fdo#100144 https://bugs.freedesktop.org/show_bug.cgi?id=100144
fdo#100007 https://bugs.freedesktop.org/show_bug.cgi?id=100007

fi-bdw-5557u     total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  time: 486s
fi-bsw-n3050     total:278  pass:239  dwarn:0   dfail:0   fail:0   skip:39  time: 580s
fi-bxt-j4205     total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time: 546s
fi-bxt-t5700     total:278  pass:258  dwarn:0   dfail:0   fail:0   skip:20  time: 568s
fi-byt-j1900     total:278  pass:251  dwarn:0   dfail:0   fail:0   skip:27  time: 501s
fi-byt-n2820     total:278  pass:247  dwarn:0   dfail:0   fail:0   skip:31  time: 496s
fi-hsw-4770      total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time: 435s
fi-hsw-4770r     total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time: 436s
fi-ilk-650       total:48   pass:27   dwarn:0   dfail:0   fail:0   skip:20  time: 0s
fi-ivb-3520m     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 506s
fi-ivb-3770      total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 490s
fi-kbl-7500u     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 498s
fi-skl-6260u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time: 492s
fi-skl-6700hq    total:278  pass:261  dwarn:0   dfail:0   fail:0   skip:17  time: 617s
fi-skl-6700k     total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  time: 506s
fi-skl-6770hq    total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time: 536s
fi-snb-2520m     total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  time: 548s
fi-snb-2600      total:278  pass:248  dwarn:0   dfail:0   fail:1   skip:29  time: 420s

03d01320f28dfbae10d844cfedef8e72f58b55f5 drm-tip: 2017y-03m-17d-09h-54m-24s UTC integration manifest
1a47c4b drm/i915/scheduler: Support user-defined priorities
c6d9bf0 drm/i915: Async GPU relocation processing
ffbde09 drm/i915: Allow execbuffer to use the first object as the batch
7bbe353 drm/i915: Remove superfluous i915_add_request_no_flush() helper
5f274c0 drm/i915: Wait upon userptr get-user-pages within execbuffer
325e1c9 drm/i915: First try the previous execbuffer location
87c839b drm/i915: Eliminate lots of iterations over the execobjects array
280004c drm/i915: Pass vma to relocate entry
0a98fab drm/i915: Store a direct lookup from object handle to vma
6d59ab9 drm/i915: Stop using obj->obj_exec_link outside of execbuf
2cadbd9 drm/i915: Split vma exec_link/evict_link
c079fc3 drm/i915: Use vma->exec_entry as our double-entry placeholder
75d739f drm/i915: Amalgamate execbuffer parameter structures
4284f1b drm/i915: Retire an active batch pool object rather than allocate new
a67a616 drm/i915: Copy user requested buffers into the error state

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4213/

* Re: [PATCH 12/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper
  2017-03-17 11:17   ` Joonas Lahtinen
@ 2017-03-17 13:06     ` Chris Wilson
  0 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-17 13:06 UTC (permalink / raw)
  To: Joonas Lahtinen; +Cc: intel-gfx

On Fri, Mar 17, 2017 at 01:17:17PM +0200, Joonas Lahtinen wrote:
> On Thu, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
> > The only time we need to emit a flush inside request emission is after
> > an execbuffer, for which we can use the full __i915_add_request(). All
> > other instances want the simpler i915_add_request() without flushing, so
> > remove the useless helper.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Extracted and pushed to reduce the noise, thanks.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 10/15] drm/i915: First try the previous execbuffer location
  2017-03-16 13:20 ` [PATCH 10/15] drm/i915: First try the previous execbuffer location Chris Wilson
@ 2017-03-21 13:54   ` Joonas Lahtinen
  0 siblings, 0 replies; 39+ messages in thread
From: Joonas Lahtinen @ 2017-03-21 13:54 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On Thu, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
> When choosing a slot for an execbuffer, we ideally want to use the same
> address as last time (so that we don't have to rebind it) and the same
> address as expected by the user (so that we don't have to fixup any
> relocations pointing to it). If we first try to bind the incoming
> execbuffer->offset from the user, or the currently bound offset, that
> should hopefully achieve the goal of avoiding the rebind cost and the
> relocation penalty. However, if the object is not currently bound there
> we don't want to arbitrarily unbind an object in our chosen position and
> so choose to rebind/relocate the incoming object instead. After we
> report the new position back to the user, on the next pass the
> relocations should have settled down.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* Re: [PATCH 08/15] drm/i915: Pass vma to relocate entry
  2017-03-16 13:19 ` [PATCH 08/15] drm/i915: Pass vma to relocate entry Chris Wilson
@ 2017-03-21 14:17   ` Joonas Lahtinen
  2017-03-23 13:02     ` [PATCH] " Chris Wilson
  0 siblings, 1 reply; 39+ messages in thread
From: Joonas Lahtinen @ 2017-03-21 14:17 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On Thu, 2017-03-16 at 13:19 +0000, Chris Wilson wrote:
> We can simplify our tracking of pending writes in an execbuf to the
> single bit in the vma->exec_entry->flags, but that requires the
> relocation function knowing the object's vma. Pass it along.
> 

Add a comment here noting that the flushing granularity is "any bit
set" level, so extra tracking granularity isn't helping.
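
Something along these lines is what I had in mind (free-hand wording;
where exactly the write bit gets set is only illustrative here):

        /*
         * All GPU caches are flushed/invalidated in one go before the
         * request's breadcrumb, i.e. the flush granularity is "did any
         * buffer get written at all", so a single write bit per vma
         * (EXEC_OBJECT_WRITE in exec_entry->flags) is all the tracking
         * we need.
         */
        if (reloc->write_domain)
                target->exec_entry->flags |= EXEC_OBJECT_WRITE;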

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

<SNIP>

After this, pending_read_domains / pending_write_domain are unused in
DRM scope, so could be removed.

> @@ -1669,7 +1652,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>  	}
>  
>  	/* Set the pending read domains for the batch buffer to COMMAND */

Nuke this comment too.

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* [PATCH v2] drm/i915: Store a direct lookup from object handle to vma
  2017-03-16 13:19 ` [PATCH 07/15] drm/i915: Store a direct lookup from object handle to vma Chris Wilson
@ 2017-03-22  9:33   ` Chris Wilson
  2017-03-22 14:22     ` Joonas Lahtinen
  0 siblings, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-03-22  9:33 UTC (permalink / raw)
  To: intel-gfx

The advent of full-ppgtt led to an extra indirection between the object
and its binding. That extra indirection has a noticeable impact on how
fast we can convert from the user handles to our internal vma for
execbuffer. In order to bypass the extra indirection, we use a
resizable hashtable to jump from the object to the per-ctx vma.
rhashtable was considered but we don't need the online resizing feature
and the extra complexity proved to undermine its usefulness. Instead, we
simply reallocate the hashtable on demand in a background task and
serialize it before iterating.

In non-full-ppgtt modes, multiple files and multiple contexts can share
the same vma. This leads to having multiple possible handle->vma links,
so we only use the first to establish the fast path. The majority of
buffers are not shared and so we should still be able to realise
speedups with multiple clients.

v2: Prettier names, more magic.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |   6 +
 drivers/gpu/drm/i915/i915_drv.h               |   2 +-
 drivers/gpu/drm/i915/i915_gem.c               |   5 +-
 drivers/gpu/drm/i915/i915_gem_context.c       |  75 ++++++++
 drivers/gpu/drm/i915/i915_gem_context.h       |  25 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c    | 254 +++++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_object.h        |   4 +-
 drivers/gpu/drm/i915/i915_vma.c               |  20 ++
 drivers/gpu/drm/i915/i915_vma.h               |   8 +-
 drivers/gpu/drm/i915/selftests/mock_context.c |  12 +-
 10 files changed, 300 insertions(+), 111 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fcac795d4396..b5d070acdaf2 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1988,6 +1988,12 @@ static int i915_context_status(struct seq_file *m, void *unused)
 			seq_putc(m, '\n');
 		}
 
+		seq_printf(m,
+			   "\tvma hashtable size=%u (actual %lu), count=%u\n",
+			   ctx->vma_lut.ht_size,
+			   BIT(ctx->vma_lut.ht_bits),
+			   ctx->vma_lut.ht_count);
+
 		seq_putc(m, '\n');
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a6ff2fd2ac8e..0b5bcecd24a1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -37,7 +37,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/backlight.h>
-#include <linux/hashtable.h>
+#include <linux/hash.h>
 #include <linux/intel-iommu.h>
 #include <linux/kref.h>
 #include <linux/pm_qos.h>
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index bb65072f688d..7c1a04d5b269 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3161,6 +3161,10 @@ void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
 		if (vma->vm->file == fpriv)
 			i915_vma_close(vma);
 
+	vma = obj->vma_hashed;
+	if (vma && vma->ctx->file_priv == fpriv)
+		i915_vma_unlink_ctx(vma);
+
 	if (i915_gem_object_is_active(obj) &&
 	    !i915_gem_object_has_active_reference(obj)) {
 		i915_gem_object_set_active_reference(obj);
@@ -4112,7 +4116,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
 	INIT_LIST_HEAD(&obj->global_link);
 	INIT_LIST_HEAD(&obj->userfault_link);
-	INIT_LIST_HEAD(&obj->obj_exec_link);
 	INIT_LIST_HEAD(&obj->vma_list);
 	INIT_LIST_HEAD(&obj->batch_pool_link);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 486051ed681d..153b344ec91a 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -85,6 +85,7 @@
  *
  */
 
+#include <linux/log2.h>
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
@@ -92,6 +93,9 @@
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
 
+/* Initial size (as log2) to preallocate the handle->object hashtable */
+#define VMA_HT_BITS 2u /* 4 x 2 pointers, 64 bytes minimum */
+
 static int get_context_size(struct drm_i915_private *dev_priv)
 {
 	int ret;
@@ -119,6 +123,65 @@ static int get_context_size(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
+static void resize_vma_ht(struct work_struct *work)
+{
+	struct i915_gem_context_vma_lut *lut =
+		container_of(work, typeof(*lut), resize);
+	unsigned int size, bits, new_bits, i;
+	struct hlist_head *new_ht;
+
+	bits = 1 + ilog2(4*lut->ht_count/3);
+	new_bits = min_t(unsigned int,
+			 max(bits, VMA_HT_BITS),
+			 sizeof(unsigned int)*8);
+	if (new_bits == lut->ht_bits)
+		goto out;
+
+	new_ht = kzalloc(sizeof(*new_ht)<<new_bits, GFP_KERNEL | __GFP_NOWARN);
+	if (!new_ht)
+		new_ht = vzalloc(sizeof(*new_ht)<<new_bits);
+	if (!new_ht)
+		/* Pretend resize succeeded and stop calling us for a bit! */
+		goto out;
+
+	size = BIT(lut->ht_bits);
+	for (i = 0; i < size; i++) {
+		struct i915_vma *vma;
+		struct hlist_node *tmp;
+
+		hlist_for_each_entry_safe(vma, tmp, &lut->ht[i], ctx_node)
+			hlist_add_head(&vma->ctx_node,
+				       &new_ht[hash_32(vma->ctx_handle,
+						       new_bits)]);
+	}
+	kvfree(lut->ht);
+	lut->ht = new_ht;
+	lut->ht_bits = new_bits;
+	smp_wmb();
+out:
+	lut->ht_size = BIT(bits);
+}
+
+static void vma_lut_free(struct i915_gem_context *ctx)
+{
+	struct i915_gem_context_vma_lut *lut = &ctx->vma_lut;
+	unsigned int i, size;
+
+	if (lut->ht_size & RESIZE_IN_PROGRESS)
+		cancel_work_sync(&lut->resize);
+
+	size = BIT(lut->ht_bits);
+	for (i = 0; i < size; i++) {
+		struct i915_vma *vma;
+
+		hlist_for_each_entry(vma, &lut->ht[i], ctx_node) {
+			vma->obj->vma_hashed = NULL;
+			vma->ctx = NULL;
+		}
+	}
+	kvfree(lut->ht);
+}
+
 void i915_gem_context_free(struct kref *ctx_ref)
 {
 	struct i915_gem_context *ctx = container_of(ctx_ref, typeof(*ctx), ref);
@@ -128,6 +191,7 @@ void i915_gem_context_free(struct kref *ctx_ref)
 	trace_i915_context_free(ctx);
 	GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
 
+	vma_lut_free(ctx);
 	i915_ppgtt_put(ctx->ppgtt);
 
 	for (i = 0; i < I915_NUM_ENGINES; i++) {
@@ -145,6 +209,7 @@ void i915_gem_context_free(struct kref *ctx_ref)
 
 	kfree(ctx->name);
 	put_pid(ctx->pid);
+
 	list_del(&ctx->link);
 
 	ida_simple_remove(&ctx->i915->context_hw_ida, ctx->hw_id);
@@ -266,6 +331,16 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	list_add_tail(&ctx->link, &dev_priv->context_list);
 	ctx->i915 = dev_priv;
 
+	ctx->vma_lut.ht_bits = VMA_HT_BITS;
+	ctx->vma_lut.ht_size = BIT(VMA_HT_BITS);
+	ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size,
+				  sizeof(*ctx->vma_lut.ht),
+				  GFP_KERNEL);
+	if (!ctx->vma_lut.ht)
+		goto err_out;
+
+	INIT_WORK(&ctx->vma_lut.resize, resize_vma_ht);
+
 	if (dev_priv->hw_context_size) {
 		struct drm_i915_gem_object *obj;
 		struct i915_vma *vma;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 4af2ab94558b..9a94e53bdf7c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -143,6 +143,31 @@ struct i915_gem_context {
 	/** ggtt_offset_bias: placement restriction for context objects */
 	u32 ggtt_offset_bias;
 
+	struct i915_gem_context_vma_lut {
+		/** ht_size: last request size to allocate the hashtable for. */
+		unsigned int ht_size;
+#define RESIZE_IN_PROGRESS BIT(0)
+		/** ht_bits: real log2(size) of hashtable. */
+		unsigned int ht_bits;
+		/** ht_count: current number of entries inside the hashtable */
+		unsigned int ht_count;
+
+		/** ht: the array of buckets comprising the simple hashtable */
+		struct hlist_head *ht;
+
+		/** resize: After an execbuf completes, we check the load factor
+		 * of the hashtable. If the hashtable is too full, or too empty,
+		 * we schedule a task to resize the hashtable. During the
+		 * resize, the entries are moved between different buckets and
+		 * so we cannot simultaneously read the hashtable as it is
+		 * being resized (unlike rhashtable). Therefore we treat the
+		 * active work as a strong barrier, pausing a subsequent
+		 * execbuf to wait for the resize worker to complete, if
+		 * required.
+		 */
+		struct work_struct resize;
+	} vma_lut;
+
 	/** engine: per-engine logical HW state */
 	struct intel_context {
 		struct i915_vma *state;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0c73642825d0..f4ef03787429 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -75,38 +75,33 @@ struct i915_execbuffer {
 		unsigned int page;
 		bool use_64bit_reloc;
 	} reloc_cache;
-	int and;
-	union {
-		struct i915_vma **lut;
-		struct hlist_head *buckets;
-	};
+	int lut_mask;
+	struct hlist_head *buckets;
 };
 
 static int
 eb_create(struct i915_execbuffer *eb)
 {
-	eb->lut = NULL;
-	if (eb->args->flags & I915_EXEC_HANDLE_LUT) {
-		unsigned int size = eb->args->buffer_count;
-		size *= sizeof(struct i915_vma *);
-		eb->lut = kmalloc(size,
-				  GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
-	}
-
-	if (!eb->lut) {
-		unsigned int size = eb->args->buffer_count;
-		unsigned int count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
-		BUILD_BUG_ON_NOT_POWER_OF_2(PAGE_SIZE / sizeof(struct hlist_head));
-		while (count > 2*size)
-			count >>= 1;
-		eb->lut = kzalloc(count*sizeof(struct hlist_head),
-				  GFP_TEMPORARY);
-		if (!eb->lut)
-			return -ENOMEM;
-
-		eb->and = count - 1;
+	if ((eb->args->flags & I915_EXEC_HANDLE_LUT) == 0) {
+		unsigned int size = 1 + ilog2(eb->args->buffer_count);
+
+		do {
+			eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
+					     GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+			if (eb->buckets)
+				break;
+		} while (--size);
+
+		if (unlikely(!eb->buckets)) {
+			eb->buckets = kzalloc(sizeof(struct hlist_head),
+					      GFP_TEMPORARY);
+			if (unlikely(!eb->buckets))
+				return -ENOMEM;
+		}
+
+		eb->lut_mask = size;
 	} else {
-		eb->and = -eb->args->buffer_count;
+		eb->lut_mask = -eb->args->buffer_count;
 	}
 
 	return 0;
@@ -143,73 +138,113 @@ eb_reset(struct i915_execbuffer *eb)
 		vma->exec_entry = NULL;
 	}
 
-	if (eb->and >= 0)
-		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
+	if (eb->lut_mask >= 0)
+		memset(eb->buckets, 0,
+		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
 }
 
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
+#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+
+static bool
+eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
 {
-	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_link);
+	if (unlikely(vma->exec_entry)) {
+		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+			  eb->exec[i].handle, i);
+		return false;
+	}
+	list_add_tail(&vma->exec_link, &eb->vmas);
 
-	/*
-	 * SNA is doing fancy tricks with compressing batch buffers, which leads
-	 * to negative relocation deltas. Usually that works out ok since the
-	 * relocate address is still positive, except when the batch is placed
-	 * very low in the GTT. Ensure this doesn't happen.
-	 *
-	 * Note that actual hangs have only been observed on gen7, but for
-	 * paranoia do it everywhere.
-	 */
-	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
-		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	vma->exec_entry = &eb->exec[i];
+	if (eb->lut_mask >= 0) {
+		vma->exec_handle = eb->exec[i].handle;
+		hlist_add_head(&vma->exec_node,
+			       &eb->buckets[hash_32(vma->exec_handle,
+						    eb->lut_mask)]);
+	}
 
-	return vma;
+	i915_vma_get(vma);
+	eb->exec[i].rsvd2 = (uintptr_t)vma;
+	return true;
+}
+
+static inline struct hlist_head *
+ht_head(const struct i915_gem_context *ctx, u32 handle)
+{
+	return &ctx->vma_lut.ht[hash_32(handle, ctx->vma_lut.ht_bits)];
+}
+
+static inline bool
+ht_needs_resize(const struct i915_gem_context *ctx)
+{
+	return (4*ctx->vma_lut.ht_count > 3*ctx->vma_lut.ht_size ||
+		4*ctx->vma_lut.ht_count < ctx->vma_lut.ht_size);
 }
 
 static int
 eb_lookup_vmas(struct i915_execbuffer *eb)
 {
-	struct drm_i915_gem_object *obj;
-	struct list_head objects;
-	int i, ret;
+	const int count = eb->args->buffer_count;
+	struct i915_vma *vma;
+	int slow_pass = -1;
+	int i;
 
 	INIT_LIST_HEAD(&eb->vmas);
 
-	INIT_LIST_HEAD(&objects);
+	if (unlikely(eb->ctx->vma_lut.ht_size & RESIZE_IN_PROGRESS))
+		flush_work(&eb->ctx->vma_lut.resize);
+	GEM_BUG_ON(eb->ctx->vma_lut.ht_size & RESIZE_IN_PROGRESS);
+
+	for (i = 0; i < count; i++) {
+		eb->exec[i].rsvd2 = 0;
+
+		hlist_for_each_entry(vma,
+				     ht_head(eb->ctx, eb->exec[i].handle),
+				     ctx_node) {
+			if (vma->ctx_handle != eb->exec[i].handle)
+				continue;
+
+			if (!eb_add_vma(eb, vma, i))
+				return -EINVAL;
+
+			goto next_vma;
+		}
+
+		if (slow_pass < 0)
+			slow_pass = i;
+next_vma: ;
+	}
+
+	if (slow_pass < 0)
+		return 0;
+
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
-	for (i = 0; i < eb->args->buffer_count; i++) {
-		obj = to_intel_bo(idr_find(&eb->file->object_idr, eb->exec[i].handle));
-		if (obj == NULL) {
-			spin_unlock(&eb->file->table_lock);
-			DRM_DEBUG("Invalid object handle %d at index %d\n",
-				   eb->exec[i].handle, i);
-			ret = -ENOENT;
-			goto err;
-		}
+	for (i = slow_pass; i < count; i++) {
+		struct drm_i915_gem_object *obj;
+
+		if (eb->exec[i].rsvd2)
+			continue;
 
-		if (!list_empty(&obj->obj_exec_link)) {
+		obj = to_intel_bo(idr_find(&eb->file->object_idr,
+					   eb->exec[i].handle));
+		if (unlikely(!obj)) {
 			spin_unlock(&eb->file->table_lock);
-			DRM_DEBUG("Object %p [handle %d, index %d] appears more than once in object list\n",
-				   obj, eb->exec[i].handle, i);
-			ret = -EINVAL;
-			goto err;
+			DRM_DEBUG("Invalid object handle %d at index %d\n",
+				  eb->exec[i].handle, i);
+			return -ENOENT;
 		}
 
-		i915_gem_object_get(obj);
-		list_add_tail(&obj->obj_exec_link, &objects);
+		eb->exec[i].rsvd2 = 1 | (uintptr_t)obj;
 	}
 	spin_unlock(&eb->file->table_lock);
 
-	i = 0;
-	while (!list_empty(&objects)) {
-		struct i915_vma *vma;
+	for (i = slow_pass; i < count; i++) {
+		struct drm_i915_gem_object *obj;
 
-		obj = list_first_entry(&objects,
-				       struct drm_i915_gem_object,
-				       obj_exec_link);
+		if ((eb->exec[i].rsvd2 & 1) == 0)
+			continue;
 
 		/*
 		 * NOTE: We can leak any vmas created here when something fails
@@ -219,61 +254,72 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
+		obj = to_ptr(struct drm_i915_gem_object, eb->exec[i].rsvd2 & ~1);
 		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
-			ret = PTR_ERR(vma);
-			goto err;
+			return PTR_ERR(vma);
 		}
 
-		/* Transfer ownership from the objects list to the vmas list. */
-		list_add_tail(&vma->exec_link, &eb->vmas);
-		list_del_init(&obj->obj_exec_link);
-
-		vma->exec_entry = &eb->exec[i];
-		if (eb->and < 0) {
-			eb->lut[i] = vma;
-		} else {
-			u32 handle =
-				eb->args->flags & I915_EXEC_HANDLE_LUT ?
-				i : eb->exec[i].handle;
-			vma->exec_handle = handle;
-			hlist_add_head(&vma->exec_node,
-				       &eb->buckets[handle & eb->and]);
+		/* First come, first served */
+		if (!vma->ctx) {
+			vma->ctx = eb->ctx;
+			vma->ctx_handle = eb->exec[i].handle;
+			hlist_add_head(&vma->ctx_node,
+				       ht_head(eb->ctx, eb->exec[i].handle));
+			eb->ctx->vma_lut.ht_count++;
+			if (i915_vma_is_ggtt(vma)) {
+				GEM_BUG_ON(obj->vma_hashed);
+				obj->vma_hashed = vma;
+			}
 		}
-		++i;
+
+		if (!eb_add_vma(eb, vma, i))
+			return -EINVAL;
+	}
+
+	if (ht_needs_resize(eb->ctx)) {
+		eb->ctx->vma_lut.ht_size |= RESIZE_IN_PROGRESS;
+		queue_work(system_highpri_wq, &eb->ctx->vma_lut.resize);
 	}
 
 	return 0;
+}
 
+static struct i915_vma *
+eb_get_batch(struct i915_execbuffer *eb)
+{
+	struct i915_vma *vma;
+
+	vma = to_ptr(struct i915_vma, eb->exec[eb->args->buffer_count-1].rsvd2);
 
-err:
-	while (!list_empty(&objects)) {
-		obj = list_first_entry(&objects,
-				       struct drm_i915_gem_object,
-				       obj_exec_link);
-		list_del_init(&obj->obj_exec_link);
-		i915_gem_object_put(obj);
-	}
 	/*
-	 * Objects already transfered to the vmas list will be unreferenced by
-	 * eb_destroy.
+	 * SNA is doing fancy tricks with compressing batch buffers, which leads
+	 * to negative relocation deltas. Usually that works out ok since the
+	 * relocate address is still positive, except when the batch is placed
+	 * very low in the GTT. Ensure this doesn't happen.
+	 *
+	 * Note that actual hangs have only been observed on gen7, but for
+	 * paranoia do it everywhere.
 	 */
+	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
+		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
 
-	return ret;
+	return vma;
 }
 
-static struct i915_vma *eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
+static struct i915_vma *
+eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 {
-	if (eb->and < 0) {
-		if (handle >= -eb->and)
+	if (eb->lut_mask < 0) {
+		if (handle >= -eb->lut_mask)
 			return NULL;
-		return eb->lut[handle];
+		return to_ptr(struct i915_vma, eb->exec[handle].rsvd2);
 	} else {
 		struct hlist_head *head;
 		struct i915_vma *vma;
 
-		head = &eb->buckets[handle & eb->and];
+		head = &eb->buckets[hash_32(handle, eb->lut_mask)];
 		hlist_for_each_entry(vma, head, exec_node) {
 			if (vma->exec_handle == handle)
 				return vma;
@@ -297,7 +343,7 @@ static void eb_destroy(struct i915_execbuffer *eb)
 
 	i915_gem_context_put(eb->ctx);
 
-	if (eb->buckets)
+	if (eb->lut_mask >= 0)
 		kfree(eb->buckets);
 }
 
@@ -915,7 +961,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
 			entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
 		need_fence =
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
-			i915_gem_object_is_tiled(obj);
+			i915_gem_object_is_tiled(vma->obj);
 		need_mappable = need_fence || need_reloc_mappable(vma);
 
 		if (entry->flags & EXEC_OBJECT_PINNED)
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 174cf923c236..5093e065b9a6 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -71,6 +71,7 @@ struct drm_i915_gem_object {
 	/** List of VMAs backed by this object */
 	struct list_head vma_list;
 	struct rb_root vma_tree;
+	struct i915_vma *vma_hashed;
 
 	/** Stolen memory for this object, instead of being backed by shmem. */
 	struct drm_mm_node *stolen;
@@ -85,9 +86,6 @@ struct drm_i915_gem_object {
 	 */
 	struct list_head userfault_link;
 
-	/** Used in execbuf to temporarily hold a ref */
-	struct list_head obj_exec_link;
-
 	struct list_head batch_pool_link;
 	I915_SELFTEST_DECLARE(struct list_head st_link);
 
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 6cf32da682ec..b286807c451c 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -590,11 +590,31 @@ void i915_vma_destroy(struct i915_vma *vma)
 	kmem_cache_free(to_i915(vma->obj->base.dev)->vmas, vma);
 }
 
+void i915_vma_unlink_ctx(struct i915_vma *vma)
+{
+	struct i915_gem_context *ctx = vma->ctx;
+
+	if (ctx->vma_lut.ht_size & RESIZE_IN_PROGRESS) {
+		cancel_work_sync(&ctx->vma_lut.resize);
+		ctx->vma_lut.ht_size &= ~RESIZE_IN_PROGRESS;
+	}
+
+	__hlist_del(&vma->ctx_node);
+	ctx->vma_lut.ht_count--;
+
+	if (i915_vma_is_ggtt(vma))
+		vma->obj->vma_hashed = NULL;
+	vma->ctx = NULL;
+}
+
 void i915_vma_close(struct i915_vma *vma)
 {
 	GEM_BUG_ON(i915_vma_is_closed(vma));
 	vma->flags |= I915_VMA_CLOSED;
 
+	if (vma->ctx)
+		i915_vma_unlink_ctx(vma);
+
 	list_del(&vma->obj_link);
 	rb_erase(&vma->obj_node, &vma->obj->vma_tree);
 
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 4d827300d1a8..88543fafcffc 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -99,6 +99,7 @@ struct i915_vma {
 
 	struct list_head obj_link; /* Link in the object's VMA list */
 	struct rb_node obj_node;
+	struct hlist_node obj_hash;
 
 	/** This vma's place in the execbuf reservation list */
 	struct list_head exec_link;
@@ -110,8 +111,12 @@ struct i915_vma {
 	 * Used for performing relocations during execbuffer insertion.
 	 */
 	struct hlist_node exec_node;
-	unsigned long exec_handle;
 	struct drm_i915_gem_exec_object2 *exec_entry;
+	u32 exec_handle;
+
+	struct i915_gem_context *ctx;
+	struct hlist_node ctx_node;
+	u32 ctx_handle;
 };
 
 struct i915_vma *
@@ -235,6 +240,7 @@ bool i915_vma_misplaced(const struct i915_vma *vma,
 			u64 size, u64 alignment, u64 flags);
 void __i915_vma_set_map_and_fenceable(struct i915_vma *vma);
 int __must_check i915_vma_unbind(struct i915_vma *vma);
+void i915_vma_unlink_ctx(struct i915_vma *vma);
 void i915_vma_close(struct i915_vma *vma);
 void i915_vma_destroy(struct i915_vma *vma);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index 8d3a90c3f8ac..f8b9cc212b02 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -40,10 +40,18 @@ mock_context(struct drm_i915_private *i915,
 	INIT_LIST_HEAD(&ctx->link);
 	ctx->i915 = i915;
 
+	ctx->vma_lut.ht_bits = VMA_HT_BITS;
+	ctx->vma_lut.ht_size = BIT(VMA_HT_BITS);
+	ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size,
+				  sizeof(*ctx->vma_lut.ht),
+				  GFP_KERNEL);
+	if (!ctx->vma_lut.ht)
+		goto err_free;
+
 	ret = ida_simple_get(&i915->context_hw_ida,
 			     0, MAX_CONTEXT_HW_ID, GFP_KERNEL);
 	if (ret < 0)
-		goto err_free;
+		goto err_vma_ht;
 	ctx->hw_id = ret;
 
 	if (name) {
@@ -58,6 +66,8 @@ mock_context(struct drm_i915_private *i915,
 
 	return ctx;
 
+err_vma_ht:
+	kvfree(ctx->vma_lut.ht);
 err_free:
 	kfree(ctx);
 	return NULL;
-- 
2.11.0


* ✗ Fi.CI.BAT: failure for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev3)
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (16 preceding siblings ...)
  2017-03-17 11:28 ` ✓ Fi.CI.BAT: success for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev2) Patchwork
@ 2017-03-22  9:43 ` Patchwork
  2017-03-23 13:51 ` ✗ Fi.CI.BAT: failure for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev4) Patchwork
  18 siblings, 0 replies; 39+ messages in thread
From: Patchwork @ 2017-03-22  9:43 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev3)
URL   : https://patchwork.freedesktop.org/series/21377/
State : failure

== Summary ==

                 from ./include/linux/mutex.h:13,
                 from ./include/linux/ww_mutex.h:17,
                 from ./include/linux/reservation.h:42,
                 from drivers/gpu/drm/i915/i915_gem_execbuffer.c:30:
./include/linux/kernel.h:852:48: error: initialization makes pointer from integer without a cast [-Werror=int-conversion]
  const typeof( ((type *)0)->member ) *__mptr = (ptr); \
                                                ^
./include/linux/list.h:733:40: note: in expansion of macro ‘container_of’
 #define hlist_entry(ptr, type, member) container_of(ptr,type,member)
                                        ^
./include/linux/list.h:744:15: note: in expansion of macro ‘hlist_entry’
     ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
               ^
./include/linux/list.h:754:13: note: in expansion of macro ‘hlist_entry_safe’
  for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:614:3: note: in expansion of macro ‘hlist_for_each_entry’
   hlist_for_each_entry(vma,
   ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:685:12: error: passing argument 2 of ‘hlist_add_head’ makes pointer from integer without a cast [-Werror=int-conversion]
            ht_head(eb->ctx, eb->exec[i].handle));
            ^
In file included from ./include/linux/mutex.h:14:0,
                 from ./include/linux/ww_mutex.h:17,
                 from ./include/linux/reservation.h:42,
                 from drivers/gpu/drm/i915/i915_gem_execbuffer.c:30:
./include/linux/list.h:668:20: note: expected ‘struct hlist_head *’ but argument is of type ‘int’
 static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
                    ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:723:9: error: implicit declaration of function ‘eb_reserve’ [-Werror=implicit-function-declaration]
  return eb_reserve(eb);
         ^
  LD      drivers/thermal/thermal_sys.o
  LD      drivers/usb/gadget/built-in.o
  LD      drivers/thermal/built-in.o
  LD [M]  drivers/net/ethernet/intel/igbvf/igbvf.o
drivers/gpu/drm/i915/i915_gem_execbuffer.c: At top level:
drivers/gpu/drm/i915/i915_gem_execbuffer.c:442:1: error: ‘eb_reserve_vma’ defined but not used [-Werror=unused-function]
 eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
 ^
  LD [M]  drivers/usb/serial/usbserial.o
  LD      drivers/tty/serial/8250/8250.o
  LD      sound/built-in.o
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 'drivers/gpu/drm/i915/i915_gem_execbuffer.o' failed
make[4]: *** [drivers/gpu/drm/i915/i915_gem_execbuffer.o] Error 1
make[4]: *** Waiting for unfinished jobs....
  LD      drivers/video/fbdev/core/fb.o
  LD      drivers/video/fbdev/core/built-in.o
  LD      drivers/video/fbdev/built-in.o
  LD      drivers/pci/built-in.o
  AR      lib/lib.a
  EXPORTS lib/lib-ksyms.o
  LD      lib/built-in.o
  LD      drivers/scsi/scsi_mod.o
  LD      drivers/usb/core/usbcore.o
  LD      drivers/usb/core/built-in.o
  LD      net/ipv6/ipv6.o
  LD      drivers/spi/built-in.o
  LD      net/ipv6/built-in.o
  LD      drivers/tty/serial/8250/8250_base.o
  LD      drivers/tty/serial/8250/built-in.o
  LD      fs/btrfs/btrfs.o
  LD      drivers/tty/serial/built-in.o
  LD      drivers/video/console/built-in.o
  LD      drivers/video/built-in.o
  LD      fs/btrfs/built-in.o
  LD      drivers/scsi/sd_mod.o
  LD      drivers/scsi/built-in.o
  LD [M]  drivers/net/ethernet/broadcom/genet/genet.o
  LD [M]  drivers/net/ethernet/intel/e1000/e1000.o
  CC      arch/x86/kernel/cpu/capflags.o
  LD      arch/x86/kernel/cpu/built-in.o
  LD      arch/x86/kernel/built-in.o
  LD      drivers/usb/host/xhci-hcd.o
  LD [M]  drivers/net/ethernet/intel/igb/igb.o
  LD      drivers/md/md-mod.o
  LD      drivers/md/built-in.o
  LD      arch/x86/built-in.o
  LD      drivers/tty/vt/built-in.o
  LD      drivers/tty/built-in.o
  LD      net/ipv4/built-in.o
  LD      fs/ext4/ext4.o
  LD      fs/ext4/built-in.o
  LD      fs/built-in.o
  LD      net/core/built-in.o
  LD      net/built-in.o
  LD      drivers/usb/host/built-in.o
  LD      drivers/usb/built-in.o
  LD [M]  drivers/net/ethernet/intel/e1000e/e1000e.o
  LD      drivers/net/ethernet/built-in.o
  LD      drivers/net/built-in.o
scripts/Makefile.build:553: recipe for target 'drivers/gpu/drm/i915' failed
make[3]: *** [drivers/gpu/drm/i915] Error 2
scripts/Makefile.build:553: recipe for target 'drivers/gpu/drm' failed
make[2]: *** [drivers/gpu/drm] Error 2
scripts/Makefile.build:553: recipe for target 'drivers/gpu' failed
make[1]: *** [drivers/gpu] Error 2
Makefile:1002: recipe for target 'drivers' failed
make: *** [drivers] Error 2


* Re: [PATCH v2] drm/i915: Store a direct lookup from object handle to vma
  2017-03-22  9:33   ` [PATCH v2] " Chris Wilson
@ 2017-03-22 14:22     ` Joonas Lahtinen
  2017-03-23 14:23       ` Daniel Vetter
  0 siblings, 1 reply; 39+ messages in thread
From: Joonas Lahtinen @ 2017-03-22 14:22 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, Daniel Vetter

+ Daniel for the rsvd2

On Wed, 2017-03-22 at 09:33 +0000, Chris Wilson wrote:
> The advent of full-ppgtt lead to an extra indirection between the object
> and its binding. That extra indirection has a noticeable impact on how
> fast we can convert from the user handles to our internal vma for
> execbuffer. In order to bypass the extra indirection, we use a
> resizable hashtable to jump from the object to the per-ctx vma.
> rhashtable was considered but we don't need the online resizing feature
> and the extra complexity proved to undermine its usefulness. Instead, we
> simply reallocate the hastable on demand in a background task and
> serialize it before iterating.
> 
> In non-full-ppgtt modes, multiple files and multiple contexts can share
> the same vma. This leads to having multiple possible handle->vma links,
> so we only use the first to establish the fast path. The majority of
> buffers are not shared and so we should still be able to realise
> speedups with multiple clients.
> 
> v2: Prettier names, more magic.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

<SNIP>
 
> +static void resize_vma_ht(struct work_struct *work)
> +{
> +	struct i915_gem_context_vma_lut *lut =
> +		container_of(work, typeof(*lut), resize);
> +	unsigned int size, bits, new_bits, i;
> +	struct hlist_head *new_ht;
> +
> +	bits = 1 + ilog2(4*lut->ht_count/3);
> +	new_bits = min_t(unsigned int,
> +			 max(bits, VMA_HT_BITS),
> +			 sizeof(unsigned int)*8);

* BITS_PER_BYTE for extra clarity.

> +	if (new_bits == lut->ht_bits)
> +		goto out;
> +
> +	new_ht = kzalloc(sizeof(*new_ht)<<new_bits, GFP_KERNEL | __GFP_NOWARN);
> +	if (!new_ht)
> +		new_ht = vzalloc(sizeof(*new_ht)<<new_bits);

No vcalloc :( Otherwise would've suggested

vzalloc(BIT(new_bits), sizeof(*new_ht), ...);

but

kzalloc(BIT(new_bits)*sizeof(*new_ht), ...)

might still be clearer.
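
Pulling that together with the BITS_PER_BYTE comment above, roughly
(untested):

        bits = 1 + ilog2(4*lut->ht_count/3);
        new_bits = min_t(unsigned int,
                         max(bits, VMA_HT_BITS),
                         BITS_PER_BYTE * sizeof(unsigned int));
        if (new_bits == lut->ht_bits)
                goto out;

        new_ht = kzalloc(BIT(new_bits) * sizeof(*new_ht),
                         GFP_KERNEL | __GFP_NOWARN);
        if (!new_ht)
                new_ht = vzalloc(BIT(new_bits) * sizeof(*new_ht));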

> @@ -266,6 +331,16 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>  	list_add_tail(&ctx->link, &dev_priv->context_list);
>  	ctx->i915 = dev_priv;
>  
> +	ctx->vma_lut.ht_bits = VMA_HT_BITS;
> +	ctx->vma_lut.ht_size = BIT(VMA_HT_BITS);
> +	ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size,
> +				  sizeof(*ctx->vma_lut.ht),
> +				  GFP_KERNEL);
> +	if (!ctx->vma_lut.ht)
> +		goto err_out;
> +

Errors after this point will leak lut. Need err_lut label and call it
from further error gotos.

> @@ -143,6 +143,31 @@ struct i915_gem_context {
>  	/** ggtt_offset_bias: placement restriction for context objects */
>  	u32 ggtt_offset_bias;
>  
> +	struct i915_gem_context_vma_lut {
> +		/** ht_size: last request size to allocate the hashtable for. */
> +		unsigned int ht_size;
> +#define RESIZE_IN_PROGRESS BIT(0)

Easily conflicting name? Forward declare the struct and hide it in the
.c file, perhaps by making ht a [0]-sized array at the end and pulling
the work_struct out. Or maybe just add a prefix :P

> +#define to_ptr(T, x) ((T *)(uintptr_t)(x))

Put this in i915_utils.h so we can some day push it to core.
from_uintptr might make more sense as a name, though.
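
i.e. something like:

        /* i915_utils.h */
        #define from_uintptr(T, x) ((T *)(uintptr_t)(x))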

The remainder is somewhat hard to review because the combined code
motion and changes become a mess, but I didn't find anything else. I
still dislike the rsvd2 usage, though, and the magic bit it carries.

We could at least #define proper_name rsvd2 in the .c file... Jani
would be glad, I guess. I think if somebody touches a reserved field by
name, they deserve to have their build broken.
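
e.g. (name picked out of thin air, local to i915_gem_execbuffer.c so
the uapi header stays untouched):

        #define exec_vma rsvd2

and then eb->exec[i].exec_vma reads a lot less magical than .rsvd2.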

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* [PATCH] drm/i915: Pass vma to relocate entry
  2017-03-21 14:17   ` Joonas Lahtinen
@ 2017-03-23 13:02     ` Chris Wilson
  0 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-23 13:02 UTC (permalink / raw)
  To: intel-gfx

We can simplify our tracking of pending writes in an execbuf to the
single bit in the vma->exec_entry->flags, but that requires the
relocation function knowing the object's vma. Pass it along.

Note we have only been using a single bit to track flushing since

commit cc889e0f6ce6a63c62db17d702ecfed86d58083f
Author: Daniel Vetter <daniel.vetter@ffwll.ch>
Date:   Wed Jun 13 20:45:19 2012 +0200

    drm/i915: disable flushing_list/gpu_write_list

unconditionally flushed all render caches before the breadcrumb and

commit 6ac42f4148bc27e5ffd18a9ab0eac57f58822af4
Author: Daniel Vetter <daniel.vetter@ffwll.ch>
Date:   Sat Jul 21 12:25:01 2012 +0200

    drm/i915: Replace the complex flushing logic with simple invalidate/flush all

did away with the explicit GPU domain tracking. This was then codified
into the ABI with NO_RELOC in

commit ed5982e6ce5f106abcbf071f80730db344a6da42
Author: Daniel Vetter <daniel.vetter@ffwll.ch> # Oi! Patch stealer!
Date:   Thu Jan 17 22:23:36 2013 +0100

    drm/i915: Allow userspace to hint that the relocations were known

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 101 ++++++++++++-----------------
 1 file changed, 41 insertions(+), 60 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 27868798eb85..a07375bad449 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -622,42 +622,25 @@ relocate_entry(struct drm_i915_gem_object *obj,
 }
 
 static int
-eb_relocate_entry(struct drm_i915_gem_object *obj,
+eb_relocate_entry(struct i915_vma *vma,
 		  struct i915_execbuffer *eb,
 		  struct drm_i915_gem_relocation_entry *reloc)
 {
-	struct drm_gem_object *target_obj;
-	struct drm_i915_gem_object *target_i915_obj;
-	struct i915_vma *target_vma;
-	uint64_t target_offset;
+	struct i915_vma *target;
+	u64 target_offset;
 	int ret;
 
 	/* we've already hold a reference to all valid objects */
-	target_vma = eb_get_vma(eb, reloc->target_handle);
-	if (unlikely(target_vma == NULL))
+	target = eb_get_vma(eb, reloc->target_handle);
+	if (unlikely(!target))
 		return -ENOENT;
-	target_i915_obj = target_vma->obj;
-	target_obj = &target_vma->obj->base;
-
-	target_offset = gen8_canonical_addr(target_vma->node.start);
-
-	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
-	 * pipe_control writes because the gpu doesn't properly redirect them
-	 * through the ppgtt for non_secure batchbuffers. */
-	if (unlikely(IS_GEN6(eb->i915) &&
-		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
-		ret = i915_vma_bind(target_vma, target_i915_obj->cache_level,
-				    PIN_GLOBAL);
-		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
-			return ret;
-	}
 
 	/* Validate that the target is in a valid r/w GPU domain */
 	if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) {
 		DRM_DEBUG("reloc with multiple write domains: "
-			  "obj %p target %d offset %d "
+			  "target %d offset %d "
 			  "read %08x write %08x",
-			  obj, reloc->target_handle,
+			  reloc->target_handle,
 			  (int) reloc->offset,
 			  reloc->read_domains,
 			  reloc->write_domain);
@@ -666,43 +649,56 @@ eb_relocate_entry(struct drm_i915_gem_object *obj,
 	if (unlikely((reloc->write_domain | reloc->read_domains)
 		     & ~I915_GEM_GPU_DOMAINS)) {
 		DRM_DEBUG("reloc with read/write non-GPU domains: "
-			  "obj %p target %d offset %d "
+			  "target %d offset %d "
 			  "read %08x write %08x",
-			  obj, reloc->target_handle,
+			  reloc->target_handle,
 			  (int) reloc->offset,
 			  reloc->read_domains,
 			  reloc->write_domain);
 		return -EINVAL;
 	}
 
-	target_obj->pending_read_domains |= reloc->read_domains;
-	target_obj->pending_write_domain |= reloc->write_domain;
+	if (reloc->write_domain)
+		target->exec_entry->flags |= EXEC_OBJECT_WRITE;
+
+	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
+	 * pipe_control writes because the gpu doesn't properly redirect them
+	 * through the ppgtt for non_secure batchbuffers.
+	 */
+	if (unlikely(IS_GEN6(eb->i915) &&
+		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
+		ret = i915_vma_bind(target, target->obj->cache_level,
+				    PIN_GLOBAL);
+		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
+			return ret;
+	}
 
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
+	target_offset = gen8_canonical_addr(target->node.start);
 	if (target_offset == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
 	if (unlikely(reloc->offset >
-		     obj->base.size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
+		     vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
 		DRM_DEBUG("Relocation beyond object bounds: "
-			  "obj %p target %d offset %d size %d.\n",
-			  obj, reloc->target_handle,
-			  (int) reloc->offset,
-			  (int) obj->base.size);
+			  "target %d offset %d size %d.\n",
+			  reloc->target_handle,
+			  (int)reloc->offset,
+			  (int)vma->size);
 		return -EINVAL;
 	}
 	if (unlikely(reloc->offset & 3)) {
 		DRM_DEBUG("Relocation not 4-byte aligned: "
-			  "obj %p target %d offset %d.\n",
-			  obj, reloc->target_handle,
-			  (int) reloc->offset);
+			  "target %d offset %d.\n",
+			  reloc->target_handle,
+			  (int)reloc->offset);
 		return -EINVAL;
 	}
 
-	ret = relocate_entry(obj, reloc, &eb->reloc_cache, target_offset);
+	ret = relocate_entry(vma->obj, reloc, &eb->reloc_cache, target_offset);
 	if (ret)
 		return ret;
 
@@ -748,7 +744,7 @@ static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
 		do {
 			u64 offset = r->presumed_offset;
 
-			ret = eb_relocate_entry(vma->obj, eb, r);
+			ret = eb_relocate_entry(vma, eb, r);
 			if (ret)
 				goto out;
 
@@ -794,7 +790,7 @@ eb_relocate_vma_slow(struct i915_vma *vma,
 	int i, ret = 0;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = eb_relocate_entry(vma->obj, eb, &relocs[i]);
+		ret = eb_relocate_entry(vma, eb, &relocs[i]);
 		if (ret)
 			break;
 	}
@@ -827,7 +823,6 @@ eb_reserve_vma(struct i915_vma *vma,
 	       struct intel_engine_cs *engine,
 	       bool *need_reloc)
 {
-	struct drm_i915_gem_object *obj = vma->obj;
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	uint64_t flags;
 	int ret;
@@ -881,11 +876,6 @@ eb_reserve_vma(struct i915_vma *vma,
 		*need_reloc = true;
 	}
 
-	if (entry->flags & EXEC_OBJECT_WRITE) {
-		obj->base.pending_read_domains = I915_GEM_DOMAIN_RENDER;
-		obj->base.pending_write_domain = I915_GEM_DOMAIN_RENDER;
-	}
-
 	return 0;
 }
 
@@ -947,7 +937,6 @@ eb_vma_misplaced(struct i915_vma *vma)
 static int eb_reserve(struct i915_execbuffer *eb)
 {
 	const bool has_fenced_gpu_access = INTEL_GEN(eb->i915) < 4;
-	struct drm_i915_gem_object *obj;
 	struct i915_vma *vma;
 	struct list_head ordered_vmas;
 	struct list_head pinned_vmas;
@@ -960,7 +949,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		bool need_fence, need_mappable;
 
 		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_link);
-		obj = vma->obj;
 		entry = vma->exec_entry;
 
 		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
@@ -980,9 +968,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 			list_move(&vma->exec_link, &ordered_vmas);
 		} else
 			list_move_tail(&vma->exec_link, &ordered_vmas);
-
-		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
-		obj->base.pending_write_domain = 0;
 	}
 	list_splice(&ordered_vmas, &eb->vmas);
 	list_splice(&pinned_vmas, &eb->vmas);
@@ -1170,7 +1155,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 		}
 
 		ret = i915_gem_request_await_object
-			(eb->request, obj, obj->base.pending_write_domain);
+			(eb->request, obj, vma->exec_entry->flags & EXEC_OBJECT_WRITE);
 		if (ret)
 			return ret;
 	}
@@ -1371,12 +1356,10 @@ eb_move_to_active(struct i915_execbuffer *eb)
 	list_for_each_entry(vma, &eb->vmas, exec_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
-		obj->base.write_domain = obj->base.pending_write_domain;
-		if (obj->base.write_domain)
-			vma->exec_entry->flags |= EXEC_OBJECT_WRITE;
-		else
-			obj->base.pending_read_domains |= obj->base.read_domains;
-		obj->base.read_domains = obj->base.pending_read_domains;
+		obj->base.write_domain = 0;
+		if (vma->exec_entry->flags & EXEC_OBJECT_WRITE)
+			obj->base.read_domains = 0;
+		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
 
 		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
 		eb_export_fence(obj, eb->request, vma->exec_entry->flags);
@@ -1686,8 +1669,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 			goto err;
 	}
 
-	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (eb.batch->obj->base.pending_write_domain) {
+	if (eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
 		goto err;
@@ -1724,7 +1706,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		}
 	}
 
-	eb.batch->obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
 	if (eb.batch_len == 0)
 		eb.batch_len = eb.batch->size - eb.batch_start_offset;
 
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 39+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev4)
  2017-03-16 13:19 Make execbuf fast and green Chris Wilson
                   ` (17 preceding siblings ...)
  2017-03-22  9:43 ` ✗ Fi.CI.BAT: failure for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev3) Patchwork
@ 2017-03-23 13:51 ` Patchwork
  18 siblings, 0 replies; 39+ messages in thread
From: Patchwork @ 2017-03-23 13:51 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev4)
URL   : https://patchwork.freedesktop.org/series/21377/
State : failure

== Summary ==

drivers/gpu/drm/i915/i915_gem_execbuffer.c:1891:1: error: expected expression before ‘<<’ token
 <<<<<<< b63fa3700fd66618174ee4d422c0867a0fc30e0f
 ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:2184:1: error: expected declaration or statement at end of input
 }
 ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1889:3: error: label ‘err_vma’ used but not defined
   goto err_vma;
   ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1873:3: error: label ‘err_unlock’ used but not defined
   goto err_unlock;
   ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1869:3: error: label ‘err_rpm’ used but not defined
   goto err_rpm;
   ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1853:4: error: label ‘err_in_fence’ used but not defined
    goto err_in_fence;
    ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1793:20: error: unused variable ‘out_fence’ [-Werror=unused-variable]
  struct sync_file *out_fence = NULL;
                    ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c: At top level:
drivers/gpu/drm/i915/i915_gem_execbuffer.c:442:1: error: ‘eb_reserve_vma’ defined but not used [-Werror=unused-function]
 eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
 ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:773:13: error: ‘eb_release_vma’ defined but not used [-Werror=unused-function]
 static void eb_release_vma(const struct i915_execbuffer *eb)
             ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:796:13: error: ‘eb_destroy’ defined but not used [-Werror=unused-function]
 static void eb_destroy(const struct i915_execbuffer *eb)
             ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1546:1: error: ‘i915_gem_check_execbuffer’ defined but not used [-Werror=unused-function]
 i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1630:25: error: ‘eb_parse’ defined but not used [-Werror=unused-function]
 static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
                         ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1672:1: error: ‘add_to_client’ defined but not used [-Werror=unused-function]
 add_to_client(struct drm_i915_gem_request *req,
 ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1680:1: error: ‘eb_submit’ defined but not used [-Werror=unused-function]
 eb_submit(struct i915_execbuffer *eb)
 ^
drivers/gpu/drm/i915/i915_gem_execbuffer.c:1786:1: error: ‘i915_gem_do_execbuffer’ defined but not used [-Werror=unused-function]
 i915_gem_do_execbuffer(struct drm_device *dev,
 ^
cc1: all warnings being treated as errors
scripts/Makefile.build:294: recipe for target 'drivers/gpu/drm/i915/i915_gem_execbuffer.o' failed
make[4]: *** [drivers/gpu/drm/i915/i915_gem_execbuffer.o] Error 1
make[4]: *** Waiting for unfinished jobs....
  LD      drivers/usb/gadget/udc/udc-core.o
  LD      drivers/video/console/built-in.o
  LD      drivers/usb/gadget/udc/built-in.o
  LD      drivers/video/built-in.o
  LD      drivers/usb/gadget/built-in.o
  LD [M]  drivers/net/ethernet/intel/e1000/e1000.o
  LD      net/ipv6/ipv6.o
  LD      net/ipv6/built-in.o
  LD      drivers/tty/serial/8250/8250_base.o
  LD      drivers/scsi/sd_mod.o
  LD      drivers/tty/serial/8250/built-in.o
  LD      drivers/scsi/built-in.o
  AR      lib/lib.a
  LD      drivers/tty/serial/built-in.o
  LD [M]  drivers/net/ethernet/intel/igb/igb.o
  EXPORTS lib/lib-ksyms.o
  LD      lib/built-in.o
  LD      net/xfrm/built-in.o
  LD      drivers/usb/core/usbcore.o
  LD      drivers/usb/core/built-in.o
  LD      drivers/md/md-mod.o
  LD      drivers/md/built-in.o
  CC      arch/x86/kernel/cpu/capflags.o
  LD      arch/x86/kernel/cpu/built-in.o
  LD      arch/x86/kernel/built-in.o
  LD      drivers/tty/vt/built-in.o
  LD      drivers/usb/host/xhci-hcd.o
  LD      drivers/tty/built-in.o
  LD      fs/btrfs/btrfs.o
  LD      fs/ext4/ext4.o
  LD      arch/x86/built-in.o
  LD      drivers/usb/host/built-in.o
  LD [M]  drivers/net/ethernet/intel/e1000e/e1000e.o
  LD      drivers/usb/built-in.o
  LD      fs/ext4/built-in.o
  LD      fs/btrfs/built-in.o
  LD      net/core/built-in.o
  LD      fs/built-in.o
  LD      net/ipv4/built-in.o
  LD      net/built-in.o
  LD      drivers/net/ethernet/built-in.o
  LD      drivers/net/built-in.o
scripts/Makefile.build:553: recipe for target 'drivers/gpu/drm/i915' failed
make[3]: *** [drivers/gpu/drm/i915] Error 2
scripts/Makefile.build:553: recipe for target 'drivers/gpu/drm' failed
make[2]: *** [drivers/gpu/drm] Error 2
scripts/Makefile.build:553: recipe for target 'drivers/gpu' failed
make[1]: *** [drivers/gpu] Error 2
Makefile:1002: recipe for target 'drivers' failed
make: *** [drivers] Error 2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] drm/i915: Store a direct lookup from object handle to vma
  2017-03-22 14:22     ` Joonas Lahtinen
@ 2017-03-23 14:23       ` Daniel Vetter
  2017-03-23 14:40         ` Chris Wilson
  0 siblings, 1 reply; 39+ messages in thread
From: Daniel Vetter @ 2017-03-23 14:23 UTC (permalink / raw)
  To: Joonas Lahtinen; +Cc: Daniel Vetter, intel-gfx

On Wed, Mar 22, 2017 at 04:22:38PM +0200, Joonas Lahtinen wrote:
> + Daniel for the rsvd2

I'd go with a shadow struct definition which matches the uapi one exactly,
except for having a proper name and type for this ... Or we do the
anonymous union thing again.
-Daniel
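
One way to read the shadow-struct suggestion, as a sketch (the field
list mirrors the uapi struct from memory and every name here is
invented, so treat it as an illustration rather than a proposal):

/* kernel-private twin of drm_i915_gem_exec_object2 */
struct i915_exec_object {
	__u32 handle;
	__u32 relocation_count;
	__u64 relocs_ptr;
	__u64 alignment;
	__u64 offset;
	__u64 flags;
	__u64 rsvd1;
	union {
		__u64 rsvd2;
		struct i915_vma *vma;	/* the "proper name and type" */
	};
};

/* pin the layout assumption somewhere inside an init function */
BUILD_BUG_ON(sizeof(struct i915_exec_object) !=
	     sizeof(struct drm_i915_gem_exec_object2));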

> 
> On ke, 2017-03-22 at 09:33 +0000, Chris Wilson wrote:
> > The advent of full-ppgtt led to an extra indirection between the object
> > and its binding. That extra indirection has a noticeable impact on how
> > fast we can convert from the user handles to our internal vma for
> > execbuffer. In order to bypass the extra indirection, we use a
> > resizable hashtable to jump from the object to the per-ctx vma.
> > rhashtable was considered but we don't need the online resizing feature
> > and the extra complexity proved to undermine its usefulness. Instead, we
> > simply reallocate the hashtable on demand in a background task and
> > serialize it before iterating.
> > 
> > In non-full-ppgtt modes, multiple files and multiple contexts can share
> > the same vma. This leads to having multiple possible handle->vma links,
> > so we only use the first to establish the fast path. The majority of
> > buffers are not shared and so we should still be able to realise
> > speedups with multiple clients.
> > 
> > v2: Prettier names, more magic.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> <SNIP>
>  
> > +static void resize_vma_ht(struct work_struct *work)
> > +{
> > +	struct i915_gem_context_vma_lut *lut =
> > +		container_of(work, typeof(*lut), resize);
> > +	unsigned int size, bits, new_bits, i;
> > +	struct hlist_head *new_ht;
> > +
> > +	bits = 1 + ilog2(4*lut->ht_count/3);
> > +	new_bits = min_t(unsigned int,
> > +			 max(bits, VMA_HT_BITS),
> > +			 sizeof(unsigned int)*8);
> 
> * BITS_PER_BYTE for extra clarity.
> 
> > +	if (new_bits == lut->ht_bits)
> > +		goto out;
> > +
> > +	new_ht = kzalloc(sizeof(*new_ht)<<new_bits, GFP_KERNEL | __GFP_NOWARN);
> > +	if (!new_ht)
> > +		new_ht = vzalloc(sizeof(*new_ht)<<new_bits);
> 
> No vcalloc :( Otherwise would've suggested
> 
> vcalloc(BIT(new_bits), sizeof(*new_ht), ...);
> 
> but
> 
> kzalloc(BIT(new_bits)*sizeof(*new_ht), ...)
> 
> might still be clearer.
> 
> > @@ -266,6 +331,16 @@ __create_hw_context(struct drm_i915_private *dev_priv,
> >  	list_add_tail(&ctx->link, &dev_priv->context_list);
> >  	ctx->i915 = dev_priv;
> >  
> > +	ctx->vma_lut.ht_bits = VMA_HT_BITS;
> > +	ctx->vma_lut.ht_size = BIT(VMA_HT_BITS);
> > +	ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size,
> > +				  sizeof(*ctx->vma_lut.ht),
> > +				  GFP_KERNEL);
> > +	if (!ctx->vma_lut.ht)
> > +		goto err_out;
> > +
> 
> Errors after this point will leak lut. Need err_lut label and call it
> from further error gotos.
> 
> > @@ -143,6 +143,31 @@ struct i915_gem_context {
> >  	/** ggtt_offset_bias: placement restriction for context objects */
> >  	u32 ggtt_offset_bias;
> >  
> > +	struct i915_gem_context_vma_lut {
> > +		/** ht_size: last request size to allocate the hashtable for. */
> > +		unsigned int ht_size;
> > +#define RESIZE_IN_PROGRESS BIT(0)
> 
> Easily conflicting name? Forward declare and hide it in the .c file, by
> making ht[0] the last member and maybe pulling the work struct out? Or
> maybe just prefix it :P
> 
> > +#define to_ptr(T, x) ((T *)(uintptr_t)(x))
> 
> Put it in i915_utils.h so we can some day push it to core. from_uintptr
> might make more sense as a name, though.
> 
> The remainder is somewhat hard to review because the combined code motion
> and changes become a mess, but I didn't find anything else. I still
> dislike the rsvd2 usage, and the magic bit it has.
> 
> We could at least #define proper_name rsvd2 in the .c file... Jani
> would be glad, I guess. I think if somebody touches a reserved field by
> name, they deserve to have their build broken.
> 
> Regards, Joonas
> -- 
> Joonas Lahtinen
> Open Source Technology Center
> Intel Corporation

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] drm/i915: Store a direct lookup from object handle to vma
  2017-03-23 14:23       ` Daniel Vetter
@ 2017-03-23 14:40         ` Chris Wilson
  0 siblings, 0 replies; 39+ messages in thread
From: Chris Wilson @ 2017-03-23 14:40 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Daniel Vetter, intel-gfx

On Thu, Mar 23, 2017 at 03:23:28PM +0100, Daniel Vetter wrote:
> On Wed, Mar 22, 2017 at 04:22:38PM +0200, Joonas Lahtinen wrote:
> > + Daniel for the rsvd2
> 
> I'd go with a shadow struct definition which matches the uapi one exactly,
> except for having a proper name and type for this ... Or we do the
> anonymous union thing again.

Definitely not an anon union since this is not part of the ABI, just
taking advantage of the overallocation in the kernel copy of the user
structs. I've plonked it behind a macro so that we have only one point
of failure, though realistically expanding the struct or using a lut[]
will take more massaging than just renaming the field. It's not a big
job so I don't feel like the code is hamstringing our future selves.
-Chris
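
A sketch of the "behind a macro" indirection mentioned above (the macro
name and the continued use of rsvd2 as backing storage are assumptions
about the respin, not quoted from it):

/* the single point of failure if the field ever gets a real name */
#define __exec_to_vma(ee) (ee)->rsvd2

static inline struct i915_vma *
exec_to_vma(const struct drm_i915_gem_exec_object2 *ee)
{
	return (struct i915_vma *)(uintptr_t)__exec_to_vma(ee);
}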

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch
  2017-03-17 11:15   ` Joonas Lahtinen
@ 2017-07-07 10:17     ` Daniel Vetter
  2017-07-07 11:58       ` Chris Wilson
  2017-07-08  3:54       ` Kenneth Graunke
  0 siblings, 2 replies; 39+ messages in thread
From: Daniel Vetter @ 2017-07-07 10:17 UTC (permalink / raw)
  To: Joonas Lahtinen, Kenneth W Graunke, Dave Airlie; +Cc: intel-gfx

On Fri, Mar 17, 2017 at 12:15 PM, Joonas Lahtinen
<joonas.lahtinen@linux.intel.com> wrote:
> On to, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
>> Currently, the last object in the execlist is the always the batch.
>> However, when building the batch buffer we often know the batch object
>> first and if we can use the first slot in the execlist we can emit
>> relocation instructions relative to it immediately and avoid a separate
>> pass to adjust the relocations to point to the last execlist slot.
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>
> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

This patch was reviewed/pushed a full month before the mesa patch was
fully reviewed and ready for merging. That's not how uapi is done.

I've fixed this up now by at least reviewing the mesa patch, but for
next time around: If you review uapi, and you don't make sure the
userspace side is in good shape too, then you've not reviewed the
patch properly.

Same goes for reviewing and not making sure there's tests, but that's
another rant.

Ken, pls make sure we don't end up with another case like resource
streamer (or tell me I should revert this).
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch
  2017-07-07 10:17     ` Daniel Vetter
@ 2017-07-07 11:58       ` Chris Wilson
  2017-07-10  5:55         ` Daniel Vetter
  2017-07-08  3:54       ` Kenneth Graunke
  1 sibling, 1 reply; 39+ messages in thread
From: Chris Wilson @ 2017-07-07 11:58 UTC (permalink / raw)
  To: Daniel Vetter, Joonas Lahtinen, Kenneth W Graunke, Dave Airlie; +Cc: intel-gfx

Quoting Daniel Vetter (2017-07-07 11:17:22)
> On Fri, Mar 17, 2017 at 12:15 PM, Joonas Lahtinen
> <joonas.lahtinen@linux.intel.com> wrote:
> > On to, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
> >> Currently, the last object in the execlist is always the batch.
> >> However, when building the batch buffer we often know the batch object
> >> first and if we can use the first slot in the execlist we can emit
> >> relocation instructions relative to it immediately and avoid a separate
> >> pass to adjust the relocations to point to the last execlist slot.
> >>
> >> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >
> > Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> 
> This patch was reviewed/pushed a full month before the mesa patch was
> fully reviewed and ready for merging. That's not how uapi is done.
> 
> I've fixed this up now by at least reviewing the mesa patch, but for
> next time around: If you review uapi, and you don't make sure the
> userspace side is in good shape too, then you've not reviewed the
> patch properly.

The userspace was in good shape and had been requested. Now explain the
complication in the API that merits a rant?

You've had well over 2 years to review the patches that first used it.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch
  2017-07-07 10:17     ` Daniel Vetter
  2017-07-07 11:58       ` Chris Wilson
@ 2017-07-08  3:54       ` Kenneth Graunke
  1 sibling, 0 replies; 39+ messages in thread
From: Kenneth Graunke @ 2017-07-08  3:54 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Dave Airlie, intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 1930 bytes --]

On Friday, July 7, 2017 3:17:22 AM PDT Daniel Vetter wrote:
> On Fri, Mar 17, 2017 at 12:15 PM, Joonas Lahtinen
> <joonas.lahtinen@linux.intel.com> wrote:
> > On to, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
> >> Currently, the last object in the execlist is always the batch.
> >> However, when building the batch buffer we often know the batch object
> >> first and if we can use the first slot in the execlist we can emit
> >> relocation instructions relative to it immediately and avoid a separate
> >> pass to adjust the relocations to point to the last execlist slot.
> >>
> >> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >
> > Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> 
> This patch was reviewed/pushed a full month before the mesa patch was
> fully reviewed and ready for merging. That's not how uapi is done.
> 
> I've fixed this up now by at least reviewing the mesa patch, but for
> next time around: If you review uapi, and you don't make sure the
> userspace side is in good shape too, then you've not reviewed the
> patch properly.
> 
> Same goes for reviewing and not making sure there's tests, but that's
> another rant.
> 
> Ken, pls make sure we don't end up with another case like resource
> streamer (or tell me I should revert this).
> -Daniel

Sorry, that's partly my bad - I had mentioned to Chris that I wanted this
feature, and planned to use it, but then got distracted with other work and
didn't get around to actually shipping a patch to do so.  Both Chris and I
wrote patches, and IIRC I was benchmarking things when I got distracted.

Basically, I915_EXEC_HANDLE_LUT appeared to be a performance loss in the
CPU bound benchmarks I looked at, because we had to walk over one of the
lists and patch up references to the batch (-1 => actual index).

BATCH_FIRST eliminates that overhead, making HANDLE_LUT actually useful
(although only a small amount).
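
For reference, the userspace side of what is described above looks
roughly like this (a sketch using the real uapi flag names via
<drm/i915_drm.h> and <xf86drm.h>; fd and the two handles are assumed to
exist already):

	struct drm_i915_gem_exec_object2 exec[2] = {
		{ .handle = batch_handle },	/* slot 0 is the batch */
		{ .handle = target_handle },	/* relocs can name index 0 directly */
	};
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = (uintptr_t)exec,
		.buffer_count = 2,
		.flags = I915_EXEC_RENDER | I915_EXEC_HANDLE_LUT |
			 I915_EXEC_BATCH_FIRST,
	};

	/* no pass over the relocation lists patching -1 into the batch index */
	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);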

--Ken

[-- Attachment #1.2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch
  2017-07-07 11:58       ` Chris Wilson
@ 2017-07-10  5:55         ` Daniel Vetter
  0 siblings, 0 replies; 39+ messages in thread
From: Daniel Vetter @ 2017-07-10  5:55 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Dave Airlie, Kenneth W Graunke, intel-gfx

On Fri, Jul 7, 2017 at 1:58 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> Quoting Daniel Vetter (2017-07-07 11:17:22)
>> On Fri, Mar 17, 2017 at 12:15 PM, Joonas Lahtinen
>> <joonas.lahtinen@linux.intel.com> wrote:
>> > On to, 2017-03-16 at 13:20 +0000, Chris Wilson wrote:
>> >> Currently, the last object in the execlist is always the batch.
>> >> However, when building the batch buffer we often know the batch object
>> >> first and if we can use the first slot in the execlist we can emit
>> >> relocation instructions relative to it immediately and avoid a separate
>> >> pass to adjust the relocations to point to the last execlist slot.
>> >>
>> >> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> >
>> > Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>
>> This patch was reviewed/pushed a full month before the mesa patch was
>> fully reviewed and ready for merging. That's not how uapi is done.
>>
>> I've fixed this up now by at least reviewing the mesa patch, but for
>> next time around: If you review uapi, and you don't make sure the
>> userspace side is in good shape too, then you've not reviewed the
>> patch properly.
>
> The userspace was in good shape and had been requested. Now explain the
> complication in the API that merits a rant?
>
> You've had well over 2 years to review the patches that first used it.

uapi merge rules:
1. get everything reviewed, tested and ready for merging
2. merge kernel side
3. merge userspace side

This here very much looked like the userspace side wasn't ready yet,
and the kernel side was pushed already. This is the reviewers' &
committers' job to ensure, not mine, so no idea why I've had over
2 years to review things ...

Anyway, should we add some checks for uapi additions to dim? Just
checking for a Testcase: or References: line for the userspace might
be useful, but then many uapi additions don't touch include/uapi ...
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 39+ messages in thread

end of thread, other threads:[~2017-07-10  5:55 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-16 13:19 Make execbuf fast and green Chris Wilson
2017-03-16 13:19 ` [PATCH 01/15] drm/i915: Copy user requested buffers into the error state Chris Wilson
2017-03-16 13:19 ` [PATCH 02/15] drm/i915: Retire an active batch pool object rather than allocate new Chris Wilson
2017-03-17  8:52   ` Joonas Lahtinen
2017-03-17  9:02     ` Chris Wilson
2017-03-16 13:19 ` [PATCH 03/15] drm/i915: Amalgamate execbuffer parameter structures Chris Wilson
2017-03-17 10:04   ` Joonas Lahtinen
2017-03-16 13:19 ` [PATCH 04/15] drm/i915: Use vma->exec_entry as our double-entry placeholder Chris Wilson
2017-03-16 13:19 ` [PATCH 05/15] drm/i915: Split vma exec_link/evict_link Chris Wilson
2017-03-16 13:19 ` [PATCH 06/15] drm/i915: Stop using obj->obj_exec_link outside of execbuf Chris Wilson
2017-03-16 13:19 ` [PATCH 07/15] drm/i915: Store a direct lookup from object handle to vma Chris Wilson
2017-03-22  9:33   ` [PATCH v2] " Chris Wilson
2017-03-22 14:22     ` Joonas Lahtinen
2017-03-23 14:23       ` Daniel Vetter
2017-03-23 14:40         ` Chris Wilson
2017-03-16 13:19 ` [PATCH 08/15] drm/i915: Pass vma to relocate entry Chris Wilson
2017-03-21 14:17   ` Joonas Lahtinen
2017-03-23 13:02     ` [PATCH] " Chris Wilson
2017-03-16 13:20 ` [PATCH 09/15] drm/i915: Eliminate lots of iterations over the execobjects array Chris Wilson
2017-03-16 13:20 ` [PATCH 10/15] drm/i915: First try the previous execbuffer location Chris Wilson
2017-03-21 13:54   ` Joonas Lahtinen
2017-03-16 13:20 ` [PATCH 11/15] drm/i915: Wait upon userptr get-user-pages within execbuffer Chris Wilson
2017-03-16 13:20 ` [PATCH 12/15] drm/i915: Remove superfluous i915_add_request_no_flush() helper Chris Wilson
2017-03-17 11:17   ` Joonas Lahtinen
2017-03-17 13:06     ` Chris Wilson
2017-03-16 13:20 ` [PATCH 13/15] drm/i915: Allow execbuffer to use the first object as the batch Chris Wilson
2017-03-17 11:15   ` Joonas Lahtinen
2017-07-07 10:17     ` Daniel Vetter
2017-07-07 11:58       ` Chris Wilson
2017-07-10  5:55         ` Daniel Vetter
2017-07-08  3:54       ` Kenneth Graunke
2017-03-16 13:20 ` [PATCH 14/15] drm/i915: Async GPU relocation processing Chris Wilson
2017-03-17 11:11   ` [PATCH v2] " Chris Wilson
2017-03-17 11:15   ` [PATCH 14/15] " Joonas Lahtinen
2017-03-16 13:20 ` [PATCH 15/15] drm/i915/scheduler: Support user-defined priorities Chris Wilson
2017-03-16 13:57 ` ✗ Fi.CI.BAT: warning for series starting with [01/15] drm/i915: Copy user requested buffers into the error state Patchwork
2017-03-17 11:28 ` ✓ Fi.CI.BAT: success for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev2) Patchwork
2017-03-22  9:43 ` ✗ Fi.CI.BAT: failure for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev3) Patchwork
2017-03-23 13:51 ` ✗ Fi.CI.BAT: failure for series starting with [01/15] drm/i915: Copy user requested buffers into the error state (rev4) Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.