* [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request
@ 2018-08-06  8:30 Chris Wilson
  2018-08-06  8:30 ` [PATCH 2/6] drm/i915: Reserve some priority bits for internal use Chris Wilson
                   ` (10 more replies)
  0 siblings, 11 replies; 24+ messages in thread
From: Chris Wilson @ 2018-08-06  8:30 UTC (permalink / raw)
  To: intel-gfx; +Cc: Eero Tamminen

If we are waiting for the currently executing request, we have a good
idea that it will be completed in the very near future and so want to
cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
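
For reference, the same CPU_DMA_LATENCY cap can be held from userspace
via /dev/cpu_dma_latency; the constraint lasts for as long as the file
descriptor stays open, which is handy for reproducing the effect outside
the driver. A minimal, illustrative sketch using the same 50us bound as
the request below:

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	/* Hold the system-wide CPU_DMA_LATENCY cap; released on close(fd). */
	static int hold_cpu_dma_latency(int32_t usec)
	{
		int fd = open("/dev/cpu_dma_latency", O_RDWR);

		if (fd < 0)
			return -1;

		if (write(fd, &usec, sizeof(usec)) != sizeof(usec)) {
			close(fd);
			return -1;
		}

		return fd; /* keep open while the latency-sensitive wait runs */
	}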

v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE.
v4: Beautification?
v5: And ignore the preemptibility of queue_work before schedule.
v6: Use highpri wq to keep our pm_qos window as small as possible.

Testcase: igt/gem_sync/store-default
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Francisco Jerez <currojerez@riseup.net>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
---
 drivers/gpu/drm/i915/i915_request.c | 59 +++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5c2c93cbab12..67fd2ec75d78 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1258,6 +1258,58 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
 	return true;
 }
 
+struct wait_dma_qos {
+	struct pm_qos_request req;
+	struct work_struct add, del;
+};
+
+static void __wait_dma_qos_add(struct work_struct *work)
+{
+	struct wait_dma_qos *qos = container_of(work, typeof(*qos), add);
+
+	pm_qos_add_request(&qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
+}
+
+static void __wait_dma_qos_del(struct work_struct *work)
+{
+	struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);
+
+	if (!cancel_work_sync(&qos->add))
+		pm_qos_remove_request(&qos->req);
+
+	kfree(qos);
+}
+
+static struct wait_dma_qos *wait_dma_qos_add(void)
+{
+	struct wait_dma_qos *qos;
+
+	/* Called under TASK_INTERRUPTIBLE, so not allowed to sleep/block. */
+	qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);
+	if (!qos)
+		return NULL;
+
+	INIT_WORK(&qos->add, __wait_dma_qos_add);
+	INIT_WORK(&qos->del, __wait_dma_qos_del);
+
+	/*
+	 * Schedule the enabling work on the local cpu so that it should only
+	 * take effect if we actually sleep. If schedule() short circuits due to
+	 * our request already being completed, we should then be able to cancel
+	 * the work before it is even run.
+	 */
+	queue_work_on(raw_smp_processor_id(), system_highpri_wq, &qos->add);
+
+	return qos;
+}
+
+static void wait_dma_qos_del(struct wait_dma_qos *qos)
+{
+	/* Defer to a worker to avoid adding latency for our woken client. */
+	if (qos)
+		queue_work(system_highpri_wq, &qos->del);
+}
+
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
@@ -1286,6 +1338,7 @@ long i915_request_wait(struct i915_request *rq,
 	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
 	DEFINE_WAIT_FUNC(reset, default_wake_function);
 	DEFINE_WAIT_FUNC(exec, default_wake_function);
+	struct wait_dma_qos *qos = NULL;
 	struct intel_wait wait;
 
 	might_sleep();
@@ -1363,6 +1416,11 @@ long i915_request_wait(struct i915_request *rq,
 			break;
 		}
 
+		if (!qos &&
+		    i915_seqno_passed(intel_engine_get_seqno(rq->engine),
+				      wait.seqno - 1))
+			qos = wait_dma_qos_add();
+
 		timeout = io_schedule_timeout(timeout);
 
 		if (intel_wait_complete(&wait) &&
@@ -1412,6 +1470,7 @@ long i915_request_wait(struct i915_request *rq,
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(errq, &reset);
 	remove_wait_queue(&rq->execute, &exec);
+	wait_dma_qos_del(qos);
 	trace_i915_request_wait_end(rq);
 
 	return timeout;
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/6] drm/i915: Reserve some priority bits for internal use
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
@ 2018-08-06  8:30 ` Chris Wilson
  2018-08-06  8:30 ` [PATCH 3/6] drm/i915: Combine multiple internal plists into the same i915_priolist bucket Chris Wilson
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 24+ messages in thread
From: Chris Wilson @ 2018-08-06  8:30 UTC (permalink / raw)
  To: intel-gfx

In the next few patches, we will want to give a small priority boost to
some requests/queues, but not so much that we perturb the user-controlled
order. As such, we shift the user priority bits higher, leaving ourselves
a few low priority bits for our bumping.
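
The packing is easiest to see with concrete numbers. A standalone
illustration (the macro names are local stand-ins and the shift of 1 is
hypothetical here; this patch starts the shift at 0 and later patches in
the series raise it):

	#include <stdio.h>

	#define USER_PRIORITY_SHIFT	1	/* illustrative value */
	#define USER_PRIORITY(x)	((x) << USER_PRIORITY_SHIFT)
	#define PRIORITY_COUNT		(1 << USER_PRIORITY_SHIFT)
	#define PRIORITY_MASK		(-PRIORITY_COUNT)

	int main(void)
	{
		/* user priority 3, plus an internal one-bit bump */
		int prio = USER_PRIORITY(3) | 1;

		/* the user level and the internal bits unpack cleanly */
		printf("user=%d internal=%d\n",
		       prio >> USER_PRIORITY_SHIFT, prio & ~PRIORITY_MASK);
		return 0; /* prints: user=3 internal=1 */
	}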

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h            | 2 +-
 drivers/gpu/drm/i915/i915_gem_context.c    | 9 +++++----
 drivers/gpu/drm/i915/i915_scheduler.h      | 6 ++++++
 drivers/gpu/drm/i915/selftests/intel_lrc.c | 8 +++++---
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4aca5344863d..3f5bf6f768b9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3165,7 +3165,7 @@ int i915_gem_object_wait(struct drm_i915_gem_object *obj,
 int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 				  unsigned int flags,
 				  const struct i915_sched_attr *attr);
-#define I915_PRIORITY_DISPLAY I915_PRIORITY_MAX
+#define I915_PRIORITY_DISPLAY I915_USER_PRIORITY(I915_PRIORITY_MAX)
 
 int __must_check
 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index f15a039772db..cc953c2f93c6 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -285,7 +285,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	kref_init(&ctx->ref);
 	list_add_tail(&ctx->link, &dev_priv->contexts.list);
 	ctx->i915 = dev_priv;
-	ctx->sched.priority = I915_PRIORITY_NORMAL;
+	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
 
 	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
 		struct intel_context *ce = &ctx->__engine[n];
@@ -431,7 +431,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
 		return ctx;
 
 	i915_gem_context_clear_bannable(ctx);
-	ctx->sched.priority = prio;
+	ctx->sched.priority = I915_USER_PRIORITY(prio);
 	ctx->ring_size = PAGE_SIZE;
 
 	GEM_BUG_ON(!i915_gem_context_is_kernel(ctx));
@@ -806,7 +806,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
 	case I915_CONTEXT_PARAM_PRIORITY:
-		args->value = ctx->sched.priority;
+		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
 		break;
 	default:
 		ret = -EINVAL;
@@ -879,7 +879,8 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				 !capable(CAP_SYS_NICE))
 				ret = -EPERM;
 			else
-				ctx->sched.priority = priority;
+				ctx->sched.priority =
+					I915_USER_PRIORITY(priority);
 		}
 		break;
 
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 70a42220358d..7edfad0abfd7 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -19,6 +19,12 @@ enum {
 	I915_PRIORITY_INVALID = INT_MIN
 };
 
+#define I915_USER_PRIORITY_SHIFT 0
+#define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
+
+#define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
+#define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
+
 struct i915_sched_attr {
 	/**
 	 * @priority: execution and service priority
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 582566faef09..36befb19c52c 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -288,12 +288,14 @@ static int live_preempt(void *arg)
 	ctx_hi = kernel_context(i915);
 	if (!ctx_hi)
 		goto err_spin_lo;
-	ctx_hi->sched.priority = I915_CONTEXT_MAX_USER_PRIORITY;
+	ctx_hi->sched.priority =
+		I915_USER_PRIORITY(I915_CONTEXT_MAX_USER_PRIORITY);
 
 	ctx_lo = kernel_context(i915);
 	if (!ctx_lo)
 		goto err_ctx_hi;
-	ctx_lo->sched.priority = I915_CONTEXT_MIN_USER_PRIORITY;
+	ctx_lo->sched.priority =
+		I915_USER_PRIORITY(I915_CONTEXT_MIN_USER_PRIORITY);
 
 	for_each_engine(engine, i915, id) {
 		struct i915_request *rq;
@@ -412,7 +414,7 @@ static int live_late_preempt(void *arg)
 			goto err_wedged;
 		}
 
-		attr.priority = I915_PRIORITY_MAX;
+		attr.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX);
 		engine->schedule(rq, &attr);
 
 		if (!wait_for_spinner(&spin_hi, rq)) {
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/6] drm/i915: Combine multiple internal plists into the same i915_priolist bucket
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
  2018-08-06  8:30 ` [PATCH 2/6] drm/i915: Reserve some priority bits for internal use Chris Wilson
@ 2018-08-06  8:30 ` Chris Wilson
  2018-08-06  8:30 ` [PATCH 4/6] drm/i915: Priority boost for new clients Chris Wilson
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 24+ messages in thread
From: Chris Wilson @ 2018-08-06  8:30 UTC (permalink / raw)
  To: intel-gfx

As we are about to allow ourselves to slightly bump the user priority
into a few different sublevels, pack those internal priority lists
into the same i915_priolist to keep the rbtree compact and avoid having
to allocate the default user priority even after the internal bumping.
The downside to having a requests[] array rather than a node per active
list is that we then have to walk over the empty higher priority lists.
To compensate, we track the active buckets and use a small bitmap to
skip over any inactive ones.
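
The bitmap walk is the interesting part; a small userspace sketch of the
same idea (the bucket occupancy is hypothetical, while the ffs() loop
mirrors the priolist_for_each_request_consume() macro added below):

	#include <stdio.h>
	#include <strings.h> /* ffs() */

	int main(void)
	{
		/* hypothetical occupancy: only sublevels 0 and 2 in use */
		unsigned int used = (1u << 0) | (1u << 2);
		int i;

		/*
		 * Slot 0 holds the highest priority, so ffs() (lowest set
		 * bit first) drains the buckets in priority order while
		 * skipping the empty ones entirely.
		 */
		while ((i = ffs(used))) {
			printf("draining bucket %d\n", i - 1);
			used &= ~(1u << (i - 1));
		}

		return 0; /* prints bucket 0, then bucket 2 */
	}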

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_engine_cs.c      |  6 +-
 drivers/gpu/drm/i915/intel_guc_submission.c | 12 ++-
 drivers/gpu/drm/i915/intel_lrc.c            | 87 ++++++++++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.h     | 13 ++-
 4 files changed, 80 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 67c4fc5d737c..9fb479068002 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1530,10 +1530,10 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	count = 0;
 	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
-		struct i915_priolist *p =
-			rb_entry(rb, typeof(*p), node);
+		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+		int i;
 
-		list_for_each_entry(rq, &p->requests, sched.link) {
+		priolist_for_each_request(rq, p, i) {
 			if (count++ < MAX_REQUESTS_TO_SHOW - 1)
 				print_request(m, rq, "\t\tQ ");
 			else
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 195adbd0ebf7..3c8ebc1c5828 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -719,30 +719,28 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
 	while ((rb = rb_first_cached(&execlists->queue))) {
 		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
+		int i;
 
-		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
+		priolist_for_each_request_consume(rq, rn, p, i) {
 			if (last && rq->hw_context != last->hw_context) {
-				if (port == last_port) {
-					__list_del_many(&p->requests,
-							&rq->sched.link);
+				if (port == last_port)
 					goto done;
-				}
 
 				if (submit)
 					port_assign(port, last);
 				port++;
 			}
 
-			INIT_LIST_HEAD(&rq->sched.link);
+			list_del_init(&rq->sched.link);
 
 			__i915_request_submit(rq);
 			trace_i915_request_in(rq, port_index(port, execlists));
+
 			last = rq;
 			submit = true;
 		}
 
 		rb_erase_cached(&p->node, &execlists->queue);
-		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
 	}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b0be180c6294..a36b12f8277d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -259,14 +259,49 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
 	ce->lrc_desc = desc;
 }
 
-static struct i915_priolist *
+static void assert_priolists(struct intel_engine_execlists * const execlists,
+			     int queue_priority)
+{
+	struct rb_node *rb;
+	int last_prio, i;
+
+	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+		return;
+
+	GEM_BUG_ON(rb_first_cached(&execlists->queue) !=
+		   rb_first(&execlists->queue.rb_root));
+
+	last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1;
+	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
+		struct i915_priolist *p = to_priolist(rb);
+
+		GEM_BUG_ON(p->priority >= last_prio);
+		last_prio = p->priority;
+
+		GEM_BUG_ON(!p->used);
+		for (i = 0; i < ARRAY_SIZE(p->requests); i++) {
+			if (list_empty(&p->requests[i]))
+				continue;
+
+			GEM_BUG_ON(!(p->used & BIT(i)));
+		}
+	}
+}
+
+static struct list_head *
 lookup_priolist(struct intel_engine_cs *engine, int prio)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct i915_priolist *p;
 	struct rb_node **parent, *rb;
 	bool first = true;
+	int idx, i;
+
+	assert_priolists(execlists, INT_MAX);
 
+	/* buckets sorted from highest [in slot 0] to lowest priority */
+	idx = I915_PRIORITY_COUNT - (prio & ~I915_PRIORITY_MASK) - 1;
+	prio >>= I915_USER_PRIORITY_SHIFT;
 	if (unlikely(execlists->no_priolist))
 		prio = I915_PRIORITY_NORMAL;
 
@@ -283,7 +318,7 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
 			parent = &rb->rb_right;
 			first = false;
 		} else {
-			return p;
+			goto out;
 		}
 	}
 
@@ -309,11 +344,15 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
 	}
 
 	p->priority = prio;
-	INIT_LIST_HEAD(&p->requests);
+	for (i = 0; i < ARRAY_SIZE(p->requests); i++)
+		INIT_LIST_HEAD(&p->requests[i]);
 	rb_link_node(&p->node, rb, parent);
 	rb_insert_color_cached(&p->node, &execlists->queue, first);
+	p->used = 0;
 
-	return p;
+out:
+	p->used |= BIT(idx);
+	return &p->requests[idx];
 }
 
 static void unwind_wa_tail(struct i915_request *rq)
@@ -325,7 +364,7 @@ static void unwind_wa_tail(struct i915_request *rq)
 static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
 	struct i915_request *rq, *rn;
-	struct i915_priolist *uninitialized_var(p);
+	struct list_head *uninitialized_var(pl);
 	int last_prio = I915_PRIORITY_INVALID;
 
 	lockdep_assert_held(&engine->timeline.lock);
@@ -342,11 +381,10 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
 		if (rq_prio(rq) != last_prio) {
 			last_prio = rq_prio(rq);
-			p = lookup_priolist(engine, last_prio);
+			pl = lookup_priolist(engine, last_prio);
 		}
 
-		GEM_BUG_ON(p->priority != rq_prio(rq));
-		list_add(&rq->sched.link, &p->requests);
+		list_add(&rq->sched.link, pl);
 	}
 }
 
@@ -674,8 +712,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	while ((rb = rb_first_cached(&execlists->queue))) {
 		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
+		int i;
 
-		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
+		priolist_for_each_request_consume(rq, rn, p, i) {
 			/*
 			 * Can we combine this request with the current port?
 			 * It has to be the same context/ringbuffer and not
@@ -694,11 +733,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 				 * combine this request with the last, then we
 				 * are done.
 				 */
-				if (port == last_port) {
-					__list_del_many(&p->requests,
-							&rq->sched.link);
+				if (port == last_port)
 					goto done;
-				}
 
 				/*
 				 * If GVT overrides us we only ever submit
@@ -708,11 +744,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 				 * request) to the second port.
 				 */
 				if (ctx_single_port_submission(last->hw_context) ||
-				    ctx_single_port_submission(rq->hw_context)) {
-					__list_del_many(&p->requests,
-							&rq->sched.link);
+				    ctx_single_port_submission(rq->hw_context))
 					goto done;
-				}
 
 				GEM_BUG_ON(last->hw_context == rq->hw_context);
 
@@ -723,15 +756,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 				GEM_BUG_ON(port_isset(port));
 			}
 
-			INIT_LIST_HEAD(&rq->sched.link);
+			list_del_init(&rq->sched.link);
+
 			__i915_request_submit(rq);
 			trace_i915_request_in(rq, port_index(port, execlists));
+
 			last = rq;
 			submit = true;
 		}
 
 		rb_erase_cached(&p->node, &execlists->queue);
-		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
 	}
@@ -755,6 +789,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 	execlists->queue_priority =
 		port != execlists->port ? rq_prio(last) : INT_MIN;
+	assert_priolists(execlists, execlists->queue_priority);
 
 	if (submit) {
 		port_assign(port, last);
@@ -866,16 +901,16 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	/* Flush the queued requests to the timeline list (for retiring). */
 	while ((rb = rb_first_cached(&execlists->queue))) {
 		struct i915_priolist *p = to_priolist(rb);
+		int i;
 
-		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
-			INIT_LIST_HEAD(&rq->sched.link);
+		priolist_for_each_request_consume(rq, rn, p, i) {
+			list_del_init(&rq->sched.link);
 
 			dma_fence_set_error(&rq->fence, -EIO);
 			__i915_request_submit(rq);
 		}
 
 		rb_erase_cached(&p->node, &execlists->queue);
-		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
 	}
@@ -1081,8 +1116,7 @@ static void queue_request(struct intel_engine_cs *engine,
 			  struct i915_sched_node *node,
 			  int prio)
 {
-	list_add_tail(&node->link,
-		      &lookup_priolist(engine, prio)->requests);
+	list_add_tail(&node->link, lookup_priolist(engine, prio));
 }
 
 static void __update_queue(struct intel_engine_cs *engine, int prio)
@@ -1152,7 +1186,7 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
 static void execlists_schedule(struct i915_request *request,
 			       const struct i915_sched_attr *attr)
 {
-	struct i915_priolist *uninitialized_var(pl);
+	struct list_head *uninitialized_var(pl);
 	struct intel_engine_cs *engine, *last;
 	struct i915_dependency *dep, *p;
 	struct i915_dependency stack;
@@ -1247,8 +1281,7 @@ static void execlists_schedule(struct i915_request *request,
 				pl = lookup_priolist(engine, prio);
 				last = engine;
 			}
-			GEM_BUG_ON(pl->priority != prio);
-			list_move_tail(&node->link, &pl->requests);
+			list_move_tail(&node->link, pl);
 		}
 
 		if (prio > engine->execlists.queue_priority &&
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 57f3787ed6ec..bfcbf4317745 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -188,11 +188,22 @@ enum intel_engine_id {
 };
 
 struct i915_priolist {
+	struct list_head requests[I915_PRIORITY_COUNT];
 	struct rb_node node;
-	struct list_head requests;
+	unsigned long used;
 	int priority;
 };
 
+#define priolist_for_each_request(it, plist, idx) \
+	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
+		list_for_each_entry(it, &(plist)->requests[idx], sched.link)
+
+#define priolist_for_each_request_consume(it, n, plist, idx) \
+	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
+		list_for_each_entry_safe(it, n, \
+					 &(plist)->requests[idx - 1], \
+					 sched.link)
+
 struct st_preempt_hang {
 	struct completion completion;
 	bool inject_hang;
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 4/6] drm/i915: Priority boost for new clients
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
  2018-08-06  8:30 ` [PATCH 2/6] drm/i915: Reserve some priority bits for internal use Chris Wilson
  2018-08-06  8:30 ` [PATCH 3/6] drm/i915: Combine multiple internal plists into the same i915_priolist bucket Chris Wilson
@ 2018-08-06  8:30 ` Chris Wilson
  2018-08-06  9:47   ` Tvrtko Ursulin
  2018-08-07  7:29   ` [PATCH v2] " Chris Wilson
  2018-08-06  8:30 ` [PATCH 5/6] drm/i915: Pull scheduling under standalone lock Chris Wilson
                   ` (7 subsequent siblings)
  10 siblings, 2 replies; 24+ messages in thread
From: Chris Wilson @ 2018-08-06  8:30 UTC (permalink / raw)
  To: intel-gfx

Taken from an idea used for FQ_CODEL, we give new request flows a
priority boost. These flows are likely to correspond with interactive
tasks and so be more latency-sensitive than the long queues. As soon as
the client has more than one request in the queue, further requests are
not boosted and it settles down into ordinary steady-state behaviour.
Such small kicks dramatically help combat the starvation issue by
allowing each client the opportunity to run even when the system is
under heavy throughput load (within the constraints of the
user-selected priority).
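
Because the bump only ever sets a bit beneath I915_USER_PRIORITY_SHIFT,
it can reorder requests within one user priority level but never across
levels. A worked example, assuming the shift of 1 that this patch
introduces:

	/* new client at user priority 0 (default): (0 << 1) | NEWCLIENT == 1
	 * established client at user priority 1:   (1 << 1)            == 2
	 *
	 * 1 < 2: the boosted newcomer still sorts behind any client with a
	 * strictly higher user priority, preserving the external ordering.
	 */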

Testcase: igt/benchmarks/rrul
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_request.c   | 16 ++++++++++++++--
 drivers/gpu/drm/i915/i915_scheduler.h |  4 +++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 67fd2ec75d78..63785a5c0418 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1126,8 +1126,20 @@ void i915_request_add(struct i915_request *request)
 	 */
 	local_bh_disable();
 	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
-	if (engine->schedule)
-		engine->schedule(request, &request->gem_context->sched);
+	if (engine->schedule) {
+		struct i915_sched_attr attr = request->gem_context->sched;
+
+		/*
+		 * Boost priorities to new clients (new request flows).
+		 *
+		 * Allow interactive/synchronous clients to jump ahead of
+		 * the bulk clients. (FQ_CODEL)
+		 */
+		if (!prev || i915_request_completed(prev))
+			attr.priority |= I915_PRIORITY_NEWCLIENT;
+
+		engine->schedule(request, &attr);
+	}
 	rcu_read_unlock();
 	i915_sw_fence_commit(&request->submit);
 	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 7edfad0abfd7..e9fb6c1d5e42 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -19,12 +19,14 @@ enum {
 	I915_PRIORITY_INVALID = INT_MIN
 };
 
-#define I915_USER_PRIORITY_SHIFT 0
+#define I915_USER_PRIORITY_SHIFT 1
 #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
 
 #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
 #define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
 
+#define I915_PRIORITY_NEWCLIENT BIT(0)
+
 struct i915_sched_attr {
 	/**
 	 * @priority: execution and service priority
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 5/6] drm/i915: Pull scheduling under standalone lock
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
                   ` (2 preceding siblings ...)
  2018-08-06  8:30 ` [PATCH 4/6] drm/i915: Priority boost for new clients Chris Wilson
@ 2018-08-06  8:30 ` Chris Wilson
  2018-08-06  8:30 ` [PATCH 6/6] drm/i915: Priority boost for waiting clients Chris Wilson
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 24+ messages in thread
From: Chris Wilson @ 2018-08-06  8:30 UTC (permalink / raw)
  To: intel-gfx

Currently, the backend scheduling code abuses struct_mutex in order to
have a global lock with which to manipulate a temporary list (without
widespread allocation) and to protect against list modifications. This
is an extraneous coupling to struct_mutex and, furthermore, cannot
extend beyond the local device.

Pull all the code that needs to be under the one true lock into
i915_scheduler.c, and make it so.
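
Condensed, the shape this gives i915_scheduler.c is one file-scope lock
guarding the temporary dfs_link lists, replacing the old
lockdep_assert_held() on struct_mutex; a sketch of the skeleton (the
full dependency walk is in the diff below):

	static DEFINE_SPINLOCK(schedule_lock);

	void i915_schedule(struct i915_request *rq,
			   const struct i915_sched_attr *attr)
	{
		/*
		 * The one true lock for the dependency DFS: global, but no
		 * longer tied to any single device's struct_mutex.
		 */
		spin_lock(&schedule_lock);
		/* ... build the flat dependency list and bump priorities ... */
		spin_unlock(&schedule_lock);
	}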

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile           |   1 +
 drivers/gpu/drm/i915/i915_request.c     |  85 ------
 drivers/gpu/drm/i915/i915_request.h     |   8 -
 drivers/gpu/drm/i915/i915_scheduler.c   | 356 ++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_scheduler.h   |  25 ++
 drivers/gpu/drm/i915/intel_display.c    |   3 +-
 drivers/gpu/drm/i915/intel_lrc.c        | 242 +---------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   5 +-
 8 files changed, 389 insertions(+), 336 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_scheduler.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 5794f102f9b8..ef1480c14e4e 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -75,6 +75,7 @@ i915-y += i915_cmd_parser.o \
 	  i915_gemfs.o \
 	  i915_query.o \
 	  i915_request.o \
+	  i915_scheduler.o \
 	  i915_timeline.o \
 	  i915_trace_points.o \
 	  i915_vma.o \
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 63785a5c0418..e33cdd95bdc3 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -111,91 +111,6 @@ i915_request_remove_from_client(struct i915_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
-static struct i915_dependency *
-i915_dependency_alloc(struct drm_i915_private *i915)
-{
-	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
-}
-
-static void
-i915_dependency_free(struct drm_i915_private *i915,
-		     struct i915_dependency *dep)
-{
-	kmem_cache_free(i915->dependencies, dep);
-}
-
-static void
-__i915_sched_node_add_dependency(struct i915_sched_node *node,
-				 struct i915_sched_node *signal,
-				 struct i915_dependency *dep,
-				 unsigned long flags)
-{
-	INIT_LIST_HEAD(&dep->dfs_link);
-	list_add(&dep->wait_link, &signal->waiters_list);
-	list_add(&dep->signal_link, &node->signalers_list);
-	dep->signaler = signal;
-	dep->flags = flags;
-}
-
-static int
-i915_sched_node_add_dependency(struct drm_i915_private *i915,
-			       struct i915_sched_node *node,
-			       struct i915_sched_node *signal)
-{
-	struct i915_dependency *dep;
-
-	dep = i915_dependency_alloc(i915);
-	if (!dep)
-		return -ENOMEM;
-
-	__i915_sched_node_add_dependency(node, signal, dep,
-					 I915_DEPENDENCY_ALLOC);
-	return 0;
-}
-
-static void
-i915_sched_node_fini(struct drm_i915_private *i915,
-		     struct i915_sched_node *node)
-{
-	struct i915_dependency *dep, *tmp;
-
-	GEM_BUG_ON(!list_empty(&node->link));
-
-	/*
-	 * Everyone we depended upon (the fences we wait to be signaled)
-	 * should retire before us and remove themselves from our list.
-	 * However, retirement is run independently on each timeline and
-	 * so we may be called out-of-order.
-	 */
-	list_for_each_entry_safe(dep, tmp, &node->signalers_list, signal_link) {
-		GEM_BUG_ON(!i915_sched_node_signaled(dep->signaler));
-		GEM_BUG_ON(!list_empty(&dep->dfs_link));
-
-		list_del(&dep->wait_link);
-		if (dep->flags & I915_DEPENDENCY_ALLOC)
-			i915_dependency_free(i915, dep);
-	}
-
-	/* Remove ourselves from everyone who depends upon us */
-	list_for_each_entry_safe(dep, tmp, &node->waiters_list, wait_link) {
-		GEM_BUG_ON(dep->signaler != node);
-		GEM_BUG_ON(!list_empty(&dep->dfs_link));
-
-		list_del(&dep->signal_link);
-		if (dep->flags & I915_DEPENDENCY_ALLOC)
-			i915_dependency_free(i915, dep);
-	}
-}
-
-static void
-i915_sched_node_init(struct i915_sched_node *node)
-{
-	INIT_LIST_HEAD(&node->signalers_list);
-	INIT_LIST_HEAD(&node->waiters_list);
-	INIT_LIST_HEAD(&node->link);
-	node->attr.priority = I915_PRIORITY_INVALID;
-}
-
 static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
 {
 	struct intel_engine_cs *engine;
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index e1c9365dfefb..549d56d0ba1c 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -313,14 +313,6 @@ static inline bool i915_request_started(const struct i915_request *rq)
 				 seqno - 1);
 }
 
-static inline bool i915_sched_node_signaled(const struct i915_sched_node *node)
-{
-	const struct i915_request *rq =
-		container_of(node, const struct i915_request, sched);
-
-	return i915_request_completed(rq);
-}
-
 void i915_retire_requests(struct drm_i915_private *i915);
 
 /*
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
new file mode 100644
index 000000000000..749a192b4628
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -0,0 +1,356 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include <linux/mutex.h>
+
+#include "i915_drv.h"
+#include "i915_request.h"
+#include "i915_scheduler.h"
+
+static DEFINE_SPINLOCK(schedule_lock);
+
+static const struct i915_request *
+node_to_request(const struct i915_sched_node *node)
+{
+	return container_of(node, const struct i915_request, sched);
+}
+
+static inline bool node_signaled(const struct i915_sched_node *node)
+{
+	return i915_request_completed(node_to_request(node));
+}
+
+void i915_sched_node_init(struct i915_sched_node *node)
+{
+	INIT_LIST_HEAD(&node->signalers_list);
+	INIT_LIST_HEAD(&node->waiters_list);
+	INIT_LIST_HEAD(&node->link);
+	node->attr.priority = I915_PRIORITY_INVALID;
+}
+
+struct i915_dependency *
+i915_dependency_alloc(struct drm_i915_private *i915)
+{
+	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
+}
+
+static void
+i915_dependency_free(struct drm_i915_private *i915,
+		     struct i915_dependency *dep)
+{
+	kmem_cache_free(i915->dependencies, dep);
+}
+
+bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
+				      struct i915_sched_node *signal,
+				      struct i915_dependency *dep,
+				      unsigned long flags)
+{
+	bool ret = false;
+
+	spin_lock(&schedule_lock);
+
+	if (!node_signaled(signal)) {
+		INIT_LIST_HEAD(&dep->dfs_link);
+		list_add(&dep->wait_link, &signal->waiters_list);
+		list_add(&dep->signal_link, &node->signalers_list);
+		dep->signaler = signal;
+		dep->flags = flags;
+
+		ret = true;
+	}
+
+	spin_unlock(&schedule_lock);
+
+	return ret;
+}
+
+int i915_sched_node_add_dependency(struct drm_i915_private *i915,
+				   struct i915_sched_node *node,
+				   struct i915_sched_node *signal)
+{
+	struct i915_dependency *dep;
+
+	dep = i915_dependency_alloc(i915);
+	if (!dep)
+		return -ENOMEM;
+
+	if (!__i915_sched_node_add_dependency(node, signal, dep,
+					      I915_DEPENDENCY_ALLOC))
+		i915_dependency_free(i915, dep);
+
+	return 0;
+}
+
+void i915_sched_node_fini(struct drm_i915_private *i915,
+			  struct i915_sched_node *node)
+{
+	struct i915_dependency *dep, *tmp;
+
+	GEM_BUG_ON(!list_empty(&node->link));
+
+	spin_lock(&schedule_lock);
+
+	/*
+	 * Everyone we depended upon (the fences we wait to be signaled)
+	 * should retire before us and remove themselves from our list.
+	 * However, retirement is run independently on each timeline and
+	 * so we may be called out-of-order.
+	 */
+	list_for_each_entry_safe(dep, tmp, &node->signalers_list, signal_link) {
+		GEM_BUG_ON(!node_signaled(dep->signaler));
+		GEM_BUG_ON(!list_empty(&dep->dfs_link));
+
+		list_del(&dep->wait_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+
+	/* Remove ourselves from everyone who depends upon us */
+	list_for_each_entry_safe(dep, tmp, &node->waiters_list, wait_link) {
+		GEM_BUG_ON(dep->signaler != node);
+		GEM_BUG_ON(!list_empty(&dep->dfs_link));
+
+		list_del(&dep->signal_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+
+	spin_unlock(&schedule_lock);
+}
+
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
+static void assert_priolists(struct intel_engine_execlists * const execlists,
+			     int queue_priority)
+{
+	struct rb_node *rb;
+	int last_prio, i;
+
+	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+		return;
+
+	GEM_BUG_ON(rb_first_cached(&execlists->queue) !=
+		   rb_first(&execlists->queue.rb_root));
+
+	last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1;
+	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
+		const struct i915_priolist *p = to_priolist(rb);
+
+		GEM_BUG_ON(p->priority >= last_prio);
+		last_prio = p->priority;
+
+		GEM_BUG_ON(!p->used);
+		for (i = 0; i < ARRAY_SIZE(p->requests); i++) {
+			if (list_empty(&p->requests[i]))
+				continue;
+
+			GEM_BUG_ON(!(p->used & BIT(i)));
+		}
+	}
+}
+
+struct list_head *
+i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_priolist *p;
+	struct rb_node **parent, *rb;
+	bool first = true;
+	int idx, i;
+
+	lockdep_assert_held(&engine->timeline.lock);
+	assert_priolists(execlists, INT_MAX);
+
+	/* buckets sorted from highest [in slot 0] to lowest priority */
+	idx = I915_PRIORITY_COUNT - (prio & ~I915_PRIORITY_MASK) - 1;
+	prio >>= I915_USER_PRIORITY_SHIFT;
+	if (unlikely(execlists->no_priolist))
+		prio = I915_PRIORITY_NORMAL;
+
+find_priolist:
+	/* most positive priority is scheduled first, equal priorities fifo */
+	rb = NULL;
+	parent = &execlists->queue.rb_root.rb_node;
+	while (*parent) {
+		rb = *parent;
+		p = to_priolist(rb);
+		if (prio > p->priority) {
+			parent = &rb->rb_left;
+		} else if (prio < p->priority) {
+			parent = &rb->rb_right;
+			first = false;
+		} else {
+			goto out;
+		}
+	}
+
+	if (prio == I915_PRIORITY_NORMAL) {
+		p = &execlists->default_priolist;
+	} else {
+		p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
+		/* Convert an allocation failure to a priority bump */
+		if (unlikely(!p)) {
+			prio = I915_PRIORITY_NORMAL; /* recurses just once */
+
+			/* To maintain ordering with all rendering, after an
+			 * allocation failure we have to disable all scheduling.
+			 * Requests will then be executed in fifo, and schedule
+			 * will ensure that dependencies are emitted in fifo.
+			 * There will be still some reordering with existing
+			 * requests, so if userspace lied about their
+			 * dependencies that reordering may be visible.
+			 */
+			execlists->no_priolist = true;
+			goto find_priolist;
+		}
+	}
+
+	p->priority = prio;
+	for (i = 0; i < ARRAY_SIZE(p->requests); i++)
+		INIT_LIST_HEAD(&p->requests[i]);
+	rb_link_node(&p->node, rb, parent);
+	rb_insert_color_cached(&p->node, &execlists->queue, first);
+	p->used = 0;
+
+out:
+	p->used |= BIT(idx);
+	return &p->requests[idx];
+}
+
+static struct intel_engine_cs *
+sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
+{
+	struct intel_engine_cs *engine = node_to_request(node)->engine;
+
+	GEM_BUG_ON(!locked);
+
+	if (engine != locked) {
+		spin_unlock(&locked->timeline.lock);
+		spin_lock(&engine->timeline.lock);
+	}
+
+	return engine;
+}
+
+void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
+{
+	struct list_head *uninitialized_var(pl);
+	struct intel_engine_cs *engine, *last;
+	struct i915_dependency *dep, *p;
+	struct i915_dependency stack;
+	const int prio = attr->priority;
+	LIST_HEAD(dfs);
+
+	GEM_BUG_ON(prio == I915_PRIORITY_INVALID);
+
+	if (i915_request_completed(rq))
+		return;
+
+	if (prio <= READ_ONCE(rq->sched.attr.priority))
+		return;
+
+	/* Needed in order to use the temporary link inside i915_dependency */
+	spin_lock(&schedule_lock);
+
+	stack.signaler = &rq->sched;
+	list_add(&stack.dfs_link, &dfs);
+
+	/*
+	 * Recursively bump all dependent priorities to match the new request.
+	 *
+	 * A naive approach would be to use recursion:
+	 * static void update_priorities(struct i915_sched_node *node, prio) {
+	 *	list_for_each_entry(dep, &node->signalers_list, signal_link)
+	 *		update_priorities(dep->signal, prio)
+	 *	queue_request(node);
+	 * }
+	 * but that may have unlimited recursion depth and so runs a very
+	 * real risk of overrunning the kernel stack. Instead, we build
+	 * a flat list of all dependencies starting with the current request.
+	 * As we walk the list of dependencies, we add all of its dependencies
+	 * to the end of the list (this may include an already visited
+	 * request) and continue to walk onwards onto the new dependencies. The
+	 * end result is a topological list of requests in reverse order, the
+	 * last element in the list is the request we must execute first.
+	 */
+	list_for_each_entry(dep, &dfs, dfs_link) {
+		struct i915_sched_node *node = dep->signaler;
+
+		/*
+		 * Within an engine, there can be no cycle, but we may
+		 * refer to the same dependency chain multiple times
+		 * (redundant dependencies are not eliminated) and across
+		 * engines.
+		 */
+		list_for_each_entry(p, &node->signalers_list, signal_link) {
+			GEM_BUG_ON(p == dep); /* no cycles! */
+
+			if (node_signaled(p->signaler))
+				continue;
+
+			GEM_BUG_ON(p->signaler->attr.priority < node->attr.priority);
+			if (prio > READ_ONCE(p->signaler->attr.priority))
+				list_move_tail(&p->dfs_link, &dfs);
+		}
+	}
+
+	/*
+	 * If we didn't need to bump any existing priorities, and we haven't
+	 * yet submitted this request (i.e. there is no potential race with
+	 * execlists_submit_request()), we can set our own priority and skip
+	 * acquiring the engine locks.
+	 */
+	if (rq->sched.attr.priority == I915_PRIORITY_INVALID) {
+		GEM_BUG_ON(!list_empty(&rq->sched.link));
+		rq->sched.attr = *attr;
+
+		if (stack.dfs_link.next == stack.dfs_link.prev)
+			goto out_unlock;
+
+		__list_del_entry(&stack.dfs_link);
+	}
+
+	last = NULL;
+	engine = rq->engine;
+	spin_lock_irq(&engine->timeline.lock);
+
+	/* Fifo and depth-first replacement ensure our deps execute before us */
+	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
+		struct i915_sched_node *node = dep->signaler;
+
+		INIT_LIST_HEAD(&dep->dfs_link);
+
+		engine = sched_lock_engine(node, engine);
+
+		if (prio <= node->attr.priority)
+			continue;
+
+		node->attr.priority = prio;
+		if (!list_empty(&node->link)) {
+			if (last != engine) {
+				pl = i915_sched_lookup_priolist(engine, prio);
+				last = engine;
+			}
+			list_move_tail(&node->link, pl);
+		}
+
+		if (prio > engine->execlists.queue_priority &&
+		    i915_sw_fence_done(&node_to_request(node)->submit)) {
+			/* defer submission until after all of our updates */
+			engine->execlists.queue_priority = prio;
+			tasklet_hi_schedule(&engine->execlists.tasklet);
+		}
+	}
+
+	spin_unlock_irq(&engine->timeline.lock);
+
+out_unlock:
+	spin_unlock(&schedule_lock);
+}
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index e9fb6c1d5e42..9b21ff72f619 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -8,9 +8,14 @@
 #define _I915_SCHEDULER_H_
 
 #include <linux/bitops.h>
+#include <linux/kernel.h>
 
 #include <uapi/drm/i915_drm.h>
 
+struct drm_i915_private;
+struct i915_request;
+struct intel_engine_cs;
+
 enum {
 	I915_PRIORITY_MIN = I915_CONTEXT_MIN_USER_PRIORITY - 1,
 	I915_PRIORITY_NORMAL = I915_CONTEXT_DEFAULT_PRIORITY,
@@ -77,4 +82,24 @@ struct i915_dependency {
 #define I915_DEPENDENCY_ALLOC BIT(0)
 };
 
+void i915_sched_node_init(struct i915_sched_node *node);
+
+bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
+				      struct i915_sched_node *signal,
+				      struct i915_dependency *dep,
+				      unsigned long flags);
+
+int i915_sched_node_add_dependency(struct drm_i915_private *i915,
+				   struct i915_sched_node *node,
+				   struct i915_sched_node *signal);
+
+void i915_sched_node_fini(struct drm_i915_private *i915,
+			  struct i915_sched_node *node);
+
+void i915_schedule(struct i915_request *request,
+		   const struct i915_sched_attr *attr);
+
+struct list_head *
+i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio);
+
 #endif /* _I915_SCHEDULER_H_ */
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 73c6d56ba3ec..3c962740073c 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -13076,13 +13076,12 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 
 	ret = intel_plane_pin_fb(to_intel_plane_state(new_state));
 
-	fb_obj_bump_render_priority(obj);
-
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 	i915_gem_object_unpin_pages(obj);
 	if (ret)
 		return ret;
 
+	fb_obj_bump_render_priority(obj);
 	intel_fb_obj_flush(obj, ORIGIN_DIRTYFB);
 
 	if (!new_state->fence) { /* implicit fencing */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index a36b12f8277d..f0a0045bae30 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -259,102 +259,6 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
 	ce->lrc_desc = desc;
 }
 
-static void assert_priolists(struct intel_engine_execlists * const execlists,
-			     int queue_priority)
-{
-	struct rb_node *rb;
-	int last_prio, i;
-
-	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
-		return;
-
-	GEM_BUG_ON(rb_first_cached(&execlists->queue) !=
-		   rb_first(&execlists->queue.rb_root));
-
-	last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1;
-	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
-		struct i915_priolist *p = to_priolist(rb);
-
-		GEM_BUG_ON(p->priority >= last_prio);
-		last_prio = p->priority;
-
-		GEM_BUG_ON(!p->used);
-		for (i = 0; i < ARRAY_SIZE(p->requests); i++) {
-			if (list_empty(&p->requests[i]))
-				continue;
-
-			GEM_BUG_ON(!(p->used & BIT(i)));
-		}
-	}
-}
-
-static struct list_head *
-lookup_priolist(struct intel_engine_cs *engine, int prio)
-{
-	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct i915_priolist *p;
-	struct rb_node **parent, *rb;
-	bool first = true;
-	int idx, i;
-
-	assert_priolists(execlists, INT_MAX);
-
-	/* buckets sorted from highest [in slot 0] to lowest priority */
-	idx = I915_PRIORITY_COUNT - (prio & ~I915_PRIORITY_MASK) - 1;
-	prio >>= I915_USER_PRIORITY_SHIFT;
-	if (unlikely(execlists->no_priolist))
-		prio = I915_PRIORITY_NORMAL;
-
-find_priolist:
-	/* most positive priority is scheduled first, equal priorities fifo */
-	rb = NULL;
-	parent = &execlists->queue.rb_root.rb_node;
-	while (*parent) {
-		rb = *parent;
-		p = to_priolist(rb);
-		if (prio > p->priority) {
-			parent = &rb->rb_left;
-		} else if (prio < p->priority) {
-			parent = &rb->rb_right;
-			first = false;
-		} else {
-			goto out;
-		}
-	}
-
-	if (prio == I915_PRIORITY_NORMAL) {
-		p = &execlists->default_priolist;
-	} else {
-		p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
-		/* Convert an allocation failure to a priority bump */
-		if (unlikely(!p)) {
-			prio = I915_PRIORITY_NORMAL; /* recurses just once */
-
-			/* To maintain ordering with all rendering, after an
-			 * allocation failure we have to disable all scheduling.
-			 * Requests will then be executed in fifo, and schedule
-			 * will ensure that dependencies are emitted in fifo.
-			 * There will be still some reordering with existing
-			 * requests, so if userspace lied about their
-			 * dependencies that reordering may be visible.
-			 */
-			execlists->no_priolist = true;
-			goto find_priolist;
-		}
-	}
-
-	p->priority = prio;
-	for (i = 0; i < ARRAY_SIZE(p->requests); i++)
-		INIT_LIST_HEAD(&p->requests[i]);
-	rb_link_node(&p->node, rb, parent);
-	rb_insert_color_cached(&p->node, &execlists->queue, first);
-	p->used = 0;
-
-out:
-	p->used |= BIT(idx);
-	return &p->requests[idx];
-}
-
 static void unwind_wa_tail(struct i915_request *rq)
 {
 	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
@@ -381,7 +285,7 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
 		if (rq_prio(rq) != last_prio) {
 			last_prio = rq_prio(rq);
-			pl = lookup_priolist(engine, last_prio);
+			pl = i915_sched_lookup_priolist(engine, last_prio);
 		}
 
 		list_add(&rq->sched.link, pl);
@@ -789,7 +693,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 	execlists->queue_priority =
 		port != execlists->port ? rq_prio(last) : INT_MIN;
-	assert_priolists(execlists, execlists->queue_priority);
 
 	if (submit) {
 		port_assign(port, last);
@@ -1116,12 +1019,7 @@ static void queue_request(struct intel_engine_cs *engine,
 			  struct i915_sched_node *node,
 			  int prio)
 {
-	list_add_tail(&node->link, lookup_priolist(engine, prio));
-}
-
-static void __update_queue(struct intel_engine_cs *engine, int prio)
-{
-	engine->execlists.queue_priority = prio;
+	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
 }
 
 static void __submit_queue_imm(struct intel_engine_cs *engine)
@@ -1140,7 +1038,7 @@ static void __submit_queue_imm(struct intel_engine_cs *engine)
 static void submit_queue(struct intel_engine_cs *engine, int prio)
 {
 	if (prio > engine->execlists.queue_priority) {
-		__update_queue(engine, prio);
+		engine->execlists.queue_priority = prio;
 		__submit_queue_imm(engine);
 	}
 }
@@ -1163,138 +1061,6 @@ static void execlists_submit_request(struct i915_request *request)
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
-static struct i915_request *sched_to_request(struct i915_sched_node *node)
-{
-	return container_of(node, struct i915_request, sched);
-}
-
-static struct intel_engine_cs *
-sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
-{
-	struct intel_engine_cs *engine = sched_to_request(node)->engine;
-
-	GEM_BUG_ON(!locked);
-
-	if (engine != locked) {
-		spin_unlock(&locked->timeline.lock);
-		spin_lock(&engine->timeline.lock);
-	}
-
-	return engine;
-}
-
-static void execlists_schedule(struct i915_request *request,
-			       const struct i915_sched_attr *attr)
-{
-	struct list_head *uninitialized_var(pl);
-	struct intel_engine_cs *engine, *last;
-	struct i915_dependency *dep, *p;
-	struct i915_dependency stack;
-	const int prio = attr->priority;
-	LIST_HEAD(dfs);
-
-	GEM_BUG_ON(prio == I915_PRIORITY_INVALID);
-
-	if (i915_request_completed(request))
-		return;
-
-	if (prio <= READ_ONCE(request->sched.attr.priority))
-		return;
-
-	/* Need BKL in order to use the temporary link inside i915_dependency */
-	lockdep_assert_held(&request->i915->drm.struct_mutex);
-
-	stack.signaler = &request->sched;
-	list_add(&stack.dfs_link, &dfs);
-
-	/*
-	 * Recursively bump all dependent priorities to match the new request.
-	 *
-	 * A naive approach would be to use recursion:
-	 * static void update_priorities(struct i915_sched_node *node, prio) {
-	 *	list_for_each_entry(dep, &node->signalers_list, signal_link)
-	 *		update_priorities(dep->signal, prio)
-	 *	queue_request(node);
-	 * }
-	 * but that may have unlimited recursion depth and so runs a very
-	 * real risk of overunning the kernel stack. Instead, we build
-	 * a flat list of all dependencies starting with the current request.
-	 * As we walk the list of dependencies, we add all of its dependencies
-	 * to the end of the list (this may include an already visited
-	 * request) and continue to walk onwards onto the new dependencies. The
-	 * end result is a topological list of requests in reverse order, the
-	 * last element in the list is the request we must execute first.
-	 */
-	list_for_each_entry(dep, &dfs, dfs_link) {
-		struct i915_sched_node *node = dep->signaler;
-
-		/*
-		 * Within an engine, there can be no cycle, but we may
-		 * refer to the same dependency chain multiple times
-		 * (redundant dependencies are not eliminated) and across
-		 * engines.
-		 */
-		list_for_each_entry(p, &node->signalers_list, signal_link) {
-			GEM_BUG_ON(p == dep); /* no cycles! */
-
-			if (i915_sched_node_signaled(p->signaler))
-				continue;
-
-			GEM_BUG_ON(p->signaler->attr.priority < node->attr.priority);
-			if (prio > READ_ONCE(p->signaler->attr.priority))
-				list_move_tail(&p->dfs_link, &dfs);
-		}
-	}
-
-	/*
-	 * If we didn't need to bump any existing priorities, and we haven't
-	 * yet submitted this request (i.e. there is no potential race with
-	 * execlists_submit_request()), we can set our own priority and skip
-	 * acquiring the engine locks.
-	 */
-	if (request->sched.attr.priority == I915_PRIORITY_INVALID) {
-		GEM_BUG_ON(!list_empty(&request->sched.link));
-		request->sched.attr = *attr;
-		if (stack.dfs_link.next == stack.dfs_link.prev)
-			return;
-		__list_del_entry(&stack.dfs_link);
-	}
-
-	last = NULL;
-	engine = request->engine;
-	spin_lock_irq(&engine->timeline.lock);
-
-	/* Fifo and depth-first replacement ensure our deps execute before us */
-	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
-		struct i915_sched_node *node = dep->signaler;
-
-		INIT_LIST_HEAD(&dep->dfs_link);
-
-		engine = sched_lock_engine(node, engine);
-
-		if (prio <= node->attr.priority)
-			continue;
-
-		node->attr.priority = prio;
-		if (!list_empty(&node->link)) {
-			if (last != engine) {
-				pl = lookup_priolist(engine, prio);
-				last = engine;
-			}
-			list_move_tail(&node->link, pl);
-		}
-
-		if (prio > engine->execlists.queue_priority &&
-		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
-			/* defer submission until after all of our updates */
-			__update_queue(engine, prio);
-			tasklet_hi_schedule(&engine->execlists.tasklet);
-		}
-	}
-
-	spin_unlock_irq(&engine->timeline.lock);
-}
-
 static void execlists_context_destroy(struct intel_context *ce)
 {
 	GEM_BUG_ON(ce->pin_count);
@@ -2328,7 +2094,7 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 {
 	engine->submit_request = execlists_submit_request;
 	engine->cancel_requests = execlists_cancel_requests;
-	engine->schedule = execlists_schedule;
+	engine->schedule = i915_schedule;
 	engine->execlists.tasklet.func = execlists_submission_tasklet;
 
 	engine->reset.prepare = execlists_reset_prepare;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index bfcbf4317745..ae7498bfbf1d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -497,11 +497,10 @@ struct intel_engine_cs {
 	 */
 	void		(*submit_request)(struct i915_request *rq);
 
-	/* Call when the priority on a request has changed and it and its
+	/*
+	 * Call when the priority on a request has changed and it and its
 	 * dependencies may need rescheduling. Note the request itself may
 	 * not be ready to run!
-	 *
-	 * Called under the struct_mutex.
 	 */
 	void		(*schedule)(struct i915_request *request,
 				    const struct i915_sched_attr *attr);
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 6/6] drm/i915: Priority boost for waiting clients
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
                   ` (3 preceding siblings ...)
  2018-08-06  8:30 ` [PATCH 5/6] drm/i915: Pull scheduling under standalone lock Chris Wilson
@ 2018-08-06  8:30 ` Chris Wilson
  2018-08-06  9:53   ` Tvrtko Ursulin
  2018-08-06  8:41 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request Patchwork
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 24+ messages in thread
From: Chris Wilson @ 2018-08-06  8:30 UTC (permalink / raw)
  To: intel-gfx

Latency is in the eye of the beholder. In the case where a client stops
and waits for the GPU, give that request chain a small priority boost
(not so much that it overtakes higher-priority clients, preserving the
external ordering) so that ideally the wait completes earlier.
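
With the shift raised to 2, each user priority level now carries four
internal sublevels; within a single user level they sort as follows (a
worked enumeration of the two internal bits):

	/* WAIT | NEWCLIENT == 3   waited-upon and freshly queued
	 * WAIT             == 2   a client is blocked on this chain
	 * NEWCLIENT        == 1   first request of an idle client
	 * (none)           == 0   steady-state bulk work
	 *
	 * A waited-upon chain thus overtakes a merely-new client, but
	 * neither ever crosses into a higher user priority level.
	 */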

Testcase: igt/gem_sync/switch-default
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c       |  5 +++-
 drivers/gpu/drm/i915/i915_request.c   |  2 ++
 drivers/gpu/drm/i915/i915_request.h   |  5 ++--
 drivers/gpu/drm/i915/i915_scheduler.c | 34 ++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_scheduler.h |  5 +++-
 5 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 460f256114f7..033d8057dd83 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1748,6 +1748,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 	 */
 	err = i915_gem_object_wait(obj,
 				   I915_WAIT_INTERRUPTIBLE |
+				   I915_WAIT_PRIORITY |
 				   (write_domain ? I915_WAIT_ALL : 0),
 				   MAX_SCHEDULE_TIMEOUT,
 				   to_rps_client(file));
@@ -3733,7 +3734,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	start = ktime_get();
 
 	ret = i915_gem_object_wait(obj,
-				   I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
+				   I915_WAIT_INTERRUPTIBLE |
+				   I915_WAIT_PRIORITY |
+				   I915_WAIT_ALL,
 				   to_wait_timeout(args->timeout_ns),
 				   to_rps_client(file));
 
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index e33cdd95bdc3..105a9e45277b 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1289,6 +1289,8 @@ long i915_request_wait(struct i915_request *rq,
 		add_wait_queue(errq, &reset);
 
 	intel_wait_init(&wait);
+	if (flags & I915_WAIT_PRIORITY)
+		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
 
 restart:
 	do {
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 549d56d0ba1c..2898ca74fa27 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -269,8 +269,9 @@ long i915_request_wait(struct i915_request *rq,
 	__attribute__((nonnull(1)));
 #define I915_WAIT_INTERRUPTIBLE	BIT(0)
 #define I915_WAIT_LOCKED	BIT(1) /* struct_mutex held, handle GPU reset */
-#define I915_WAIT_ALL		BIT(2) /* used by i915_gem_object_wait() */
-#define I915_WAIT_FOR_IDLE_BOOST BIT(3)
+#define I915_WAIT_PRIORITY	BIT(2) /* small priority bump for the request */
+#define I915_WAIT_ALL		BIT(3) /* used by i915_gem_object_wait() */
+#define I915_WAIT_FOR_IDLE_BOOST BIT(4)
 
 static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine);
 
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 749a192b4628..2da651db5029 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -239,7 +239,8 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
 	return engine;
 }
 
-void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
+static void __i915_schedule(struct i915_request *rq,
+			    const struct i915_sched_attr *attr)
 {
 	struct list_head *uninitialized_var(pl);
 	struct intel_engine_cs *engine, *last;
@@ -248,6 +249,8 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
 	const int prio = attr->priority;
 	LIST_HEAD(dfs);
 
+	/* Needed in order to use the temporary link inside i915_dependency */
+	lockdep_assert_held(&schedule_lock);
 	GEM_BUG_ON(prio == I915_PRIORITY_INVALID);
 
 	if (i915_request_completed(rq))
@@ -256,9 +259,6 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
 	if (prio <= READ_ONCE(rq->sched.attr.priority))
 		return;
 
-	/* Needed in order to use the temporary link inside i915_dependency */
-	spin_lock(&schedule_lock);
-
 	stack.signaler = &rq->sched;
 	list_add(&stack.dfs_link, &dfs);
 
@@ -312,7 +312,7 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
 		rq->sched.attr = *attr;
 
 		if (stack.dfs_link.next == stack.dfs_link.prev)
-			goto out_unlock;
+			return;
 
 		__list_del_entry(&stack.dfs_link);
 	}
@@ -350,7 +350,29 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
 	}
 
 	spin_unlock_irq(&engine->timeline.lock);
+}
 
-out_unlock:
+void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
+{
+	spin_lock(&schedule_lock);
+	__i915_schedule(rq, attr);
 	spin_unlock(&schedule_lock);
 }
+
+void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump)
+{
+	struct i915_sched_attr attr;
+
+	GEM_BUG_ON(bump & I915_PRIORITY_MASK);
+
+	if (READ_ONCE(rq->sched.attr.priority) == I915_PRIORITY_INVALID)
+		return;
+
+	spin_lock_bh(&schedule_lock);
+
+	attr = rq->sched.attr;
+	attr.priority |= bump;
+	__i915_schedule(rq, &attr);
+
+	spin_unlock_bh(&schedule_lock);
+}
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 9b21ff72f619..e5120ef974d4 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -24,12 +24,13 @@ enum {
 	I915_PRIORITY_INVALID = INT_MIN
 };
 
-#define I915_USER_PRIORITY_SHIFT 1
+#define I915_USER_PRIORITY_SHIFT 2
 #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
 
 #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
 #define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
 
+#define I915_PRIORITY_WAIT BIT(1)
 #define I915_PRIORITY_NEWCLIENT BIT(0)
 
 struct i915_sched_attr {
@@ -99,6 +100,8 @@ void i915_sched_node_fini(struct drm_i915_private *i915,
 void i915_schedule(struct i915_request *request,
 		   const struct i915_sched_attr *attr);
 
+void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump);
+
 struct list_head *
 i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio);
 
-- 
2.18.0

* ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
                   ` (4 preceding siblings ...)
  2018-08-06  8:30 ` [PATCH 6/6] drm/i915: Priority boost for waiting clients Chris Wilson
@ 2018-08-06  8:41 ` Patchwork
  2018-08-06  8:44 ` ✗ Fi.CI.SPARSE: " Patchwork
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 24+ messages in thread
From: Patchwork @ 2018-08-06  8:41 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] drm/i915: Limit C-states when waiting for the active request
URL   : https://patchwork.freedesktop.org/series/47739/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
ffbc30b340a9 drm/i915: Limit C-states when waiting for the active request
50c5633b2117 drm/i915: Reserve some priority bits for internal use
5d7875b544e9 drm/i915: Combine multiple internal plists into the same i915_priolist bucket
-:163: WARNING:FUNCTION_ARGUMENTS: function definition argument 'pl' should also have an identifier name
#163: FILE: drivers/gpu/drm/i915/intel_lrc.c:367:
+	struct list_head *uninitialized_var(pl);

-:280: WARNING:FUNCTION_ARGUMENTS: function definition argument 'pl' should also have an identifier name
#280: FILE: drivers/gpu/drm/i915/intel_lrc.c:1189:
+	struct list_head *uninitialized_var(pl);

-:309: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'plist' - possible side-effects?
#309: FILE: drivers/gpu/drm/i915/intel_ringbuffer.h:197:
+#define priolist_for_each_request(it, plist, idx) \
+	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
+		list_for_each_entry(it, &(plist)->requests[idx], sched.link)

-:309: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'idx' - possible side-effects?
#309: FILE: drivers/gpu/drm/i915/intel_ringbuffer.h:197:
+#define priolist_for_each_request(it, plist, idx) \
+	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
+		list_for_each_entry(it, &(plist)->requests[idx], sched.link)

-:313: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'plist' - possible side-effects?
#313: FILE: drivers/gpu/drm/i915/intel_ringbuffer.h:201:
+#define priolist_for_each_request_consume(it, n, plist, idx) \
+	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
+		list_for_each_entry_safe(it, n, \
+					 &(plist)->requests[idx - 1], \
+					 sched.link)

-:313: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'idx' - possible side-effects?
#313: FILE: drivers/gpu/drm/i915/intel_ringbuffer.h:201:
+#define priolist_for_each_request_consume(it, n, plist, idx) \
+	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
+		list_for_each_entry_safe(it, n, \
+					 &(plist)->requests[idx - 1], \
+					 sched.link)

total: 0 errors, 2 warnings, 4 checks, 271 lines checked
71162d4b270e drm/i915: Priority boost for new clients
db922abd8717 drm/i915: Pull scheduling under standalone lock
-:145: WARNING:FILE_PATH_CHANGES: added, moved or deleted file(s), does MAINTAINERS need updating?
#145: 
new file mode 100644

-:150: WARNING:SPDX_LICENSE_TAG: Missing or malformed SPDX-License-Identifier tag in line 1
#150: FILE: drivers/gpu/drm/i915/i915_scheduler.c:1:
+/*

-:393: WARNING:FUNCTION_ARGUMENTS: function definition argument 'pl' should also have an identifier name
#393: FILE: drivers/gpu/drm/i915/i915_scheduler.c:244:
+	struct list_head *uninitialized_var(pl);

total: 0 errors, 3 warnings, 0 checks, 817 lines checked
17f7463076d9 drm/i915: Priority boost for waiting clients

* ✗ Fi.CI.SPARSE: warning for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
                   ` (5 preceding siblings ...)
  2018-08-06  8:41 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request Patchwork
@ 2018-08-06  8:44 ` Patchwork
  2018-08-06  8:56 ` ✓ Fi.CI.BAT: success " Patchwork
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 24+ messages in thread
From: Patchwork @ 2018-08-06  8:44 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] drm/i915: Limit C-states when waiting for the active request
URL   : https://patchwork.freedesktop.org/series/47739/
State : warning

== Summary ==

$ dim sparse origin/drm-tip
Commit: drm/i915: Limit C-states when waiting for the active request
Okay!

Commit: drm/i915: Reserve some priority bits for internal use
Okay!

Commit: drm/i915: Combine multiple internal plists into the same i915_priolist bucket
Okay!

Commit: drm/i915: Priority boost for new clients
Okay!

Commit: drm/i915: Pull scheduling under standalone lock
+drivers/gpu/drm/i915/i915_scheduler.c:34:24: warning: symbol 'i915_dependency_alloc' was not declared. Should it be static?

Commit: drm/i915: Priority boost for waiting clients
Okay!

* ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
                   ` (6 preceding siblings ...)
  2018-08-06  8:44 ` ✗ Fi.CI.SPARSE: " Patchwork
@ 2018-08-06  8:56 ` Patchwork
  2018-08-06  9:34 ` [PATCH 1/6] " Tvrtko Ursulin
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 24+ messages in thread
From: Patchwork @ 2018-08-06  8:56 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] drm/i915: Limit C-states when waiting for the active request
URL   : https://patchwork.freedesktop.org/series/47739/
State : success

== Summary ==

= CI Bug Log - changes from CI_DRM_4618 -> Patchwork_9855 =

== Summary - SUCCESS ==

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/47739/revisions/1/mbox/

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_9855:

  === IGT changes ===

    ==== Possible regressions ====

    igt@gem_ctx_switch@basic-default:
      {fi-icl-u}:         NOTRUN -> INCOMPLETE

    
== Known issues ==

  Here are the changes found in Patchwork_9855 that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@drv_selftest@live_hangcheck:
      fi-skl-6260u:       PASS -> DMESG-FAIL (fdo#106560, fdo#107174)

    igt@drv_selftest@live_workarounds:
      fi-bsw-n3050:       PASS -> DMESG-FAIL (fdo#107292)

    igt@kms_pipe_crc_basic@read-crc-pipe-a-frame-sequence:
      fi-skl-6700k2:      PASS -> FAIL (fdo#103191)

    
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  fdo#103191 https://bugs.freedesktop.org/show_bug.cgi?id=103191
  fdo#106560 https://bugs.freedesktop.org/show_bug.cgi?id=106560
  fdo#107174 https://bugs.freedesktop.org/show_bug.cgi?id=107174
  fdo#107292 https://bugs.freedesktop.org/show_bug.cgi?id=107292


== Participating hosts (51 -> 48) ==

  Additional (1): fi-icl-u 
  Missing    (4): fi-ilk-m540 fi-byt-squawks fi-bsw-cyan fi-hsw-4200u 


== Build changes ==

    * Linux: CI_DRM_4618 -> Patchwork_9855

  CI_DRM_4618: a59e87511e8f63b8d28973891d0ab407a19638c6 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4587: 5d78c73d871525ec9caecd88ad7d9abe36637314 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_9855: 17f7463076d9d8f510a35de0efc4c61a12eb3383 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

17f7463076d9 drm/i915: Priority boost for waiting clients
db922abd8717 drm/i915: Pull scheduling under standalone lock
71162d4b270e drm/i915: Priority boost for new clients
5d7875b544e9 drm/i915: Combine multiple internal plists into the same i915_priolist bucket
50c5633b2117 drm/i915: Reserve some priority bits for internal use
ffbc30b340a9 drm/i915: Limit C-states when waiting for the active request

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_9855/issues.html

* Re: [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
                   ` (7 preceding siblings ...)
  2018-08-06  8:56 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2018-08-06  9:34 ` Tvrtko Ursulin
  2018-08-06  9:59   ` Chris Wilson
  2018-08-06  9:43 ` ✗ Fi.CI.IGT: failure for series starting with [1/6] " Patchwork
  2018-08-07  7:33 ` ✗ Fi.CI.BAT: failure for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request (rev2) Patchwork
  10 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2018-08-06  9:34 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Eero Tamminen


On 06/08/2018 09:30, Chris Wilson wrote:
> If we are waiting for the currently executing request, we have a good
> idea that it will be completed in the very near future and so want to
> cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.

I cannot shake the opinion that we shouldn't be doing this. For instance 
what if the client has been re-niced (down), or it has re-niced itself? 
Obviously wrong to apply this for those.

Or when you say we have a good idea something will be completed in the 
very near future. Say there is a 60fps workload which is sending 5ms 
batches and waits on them. That is 5ms of every ~16.7ms frame, so ~30% 
of time spent outside of low C-states for a workload which doesn't need it.

Also, having read what the OpenCL driver does, where they want to apply 
different wait optimisations for different call-sites, the idea that we 
should instead be introducing a low-latency flag to the wait ioctl 
sounds more appropriate.

> 
> v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
> v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
> v4: Beautification?
> v5: And ignore the preemptibility of queue_work before schedule.
> v6: Use highpri wq to keep our pm_qos window as small as possible.
> 
> Testcase: igt/gem_sync/store-default
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Francisco Jerez <currojerez@riseup.net>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_request.c | 59 +++++++++++++++++++++++++++++
>   1 file changed, 59 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5c2c93cbab12..67fd2ec75d78 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1258,6 +1258,58 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
>   	return true;
>   }
>   
> +struct wait_dma_qos {
> +	struct pm_qos_request req;
> +	struct work_struct add, del;
> +};
> +
> +static void __wait_dma_qos_add(struct work_struct *work)
> +{
> +	struct wait_dma_qos *qos = container_of(work, typeof(*qos), add);
> +
> +	pm_qos_add_request(&qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
> +}
> +
> +static void __wait_dma_qos_del(struct work_struct *work)
> +{
> +	struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);
> +
> +	if (!cancel_work_sync(&qos->add))
> +		pm_qos_remove_request(&qos->req);
> +
> +	kfree(qos);
> +}
> +
> +static struct wait_dma_qos *wait_dma_qos_add(void)
> +{
> +	struct wait_dma_qos *qos;
> +
> +	/* Called under TASK_INTERRUPTIBLE, so not allowed to sleep/block. */
> +	qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);
> +	if (!qos)
> +		return NULL;
> +
> +	INIT_WORK(&qos->add, __wait_dma_qos_add);
> +	INIT_WORK(&qos->del, __wait_dma_qos_del);
> +
> +	/*
> +	 * Schedule the enabling work on the local cpu so that it should only
> +	 * take effect if we actually sleep. If schedule() short circuits due to
> +	 * our request already being completed, we should then be able to cancel
> +	 * the work before it is even run.
> +	 */
> +	queue_work_on(raw_smp_processor_id(), system_highpri_wq, &qos->add);
> +
> +	return qos;
> +}
> +
> +static void wait_dma_qos_del(struct wait_dma_qos *qos)
> +{
> +	/* Defer to worker so not incur extra latency for our woken client. */
> +	if (qos)
> +		queue_work(system_highpri_wq, &qos->del);
> +}
> +
>   /**
>    * i915_request_wait - wait until execution of request has finished
>    * @rq: the request to wait upon
> @@ -1286,6 +1338,7 @@ long i915_request_wait(struct i915_request *rq,
>   	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
>   	DEFINE_WAIT_FUNC(reset, default_wake_function);
>   	DEFINE_WAIT_FUNC(exec, default_wake_function);
> +	struct wait_dma_qos *qos = NULL;
>   	struct intel_wait wait;
>   
>   	might_sleep();
> @@ -1363,6 +1416,11 @@ long i915_request_wait(struct i915_request *rq,
>   			break;
>   		}
>   
> +		if (!qos &&
> +		    i915_seqno_passed(intel_engine_get_seqno(rq->engine),
> +				      wait.seqno - 1))

I also realized that this will get incorrectly applied when there is 
preemption. If a low-priority request gets preempted after we applied 
the PM QoS it will persist for much longer than intended. (Until the 
high-prio request completes and then the low-prio one.) And the explicit 
low-latency wait flag would have the same problem. We could perhaps go 
with removing the PM QoS request if preempted. It should not be frequent 
enough to cause an issue with too much traffic on the API.

Another side note - a quick grep shows there are a few other "seqno - 1" 
callsites, so perhaps we should add a helper for this with a more 
self-explanatory name like __i915_seqno_is_executing(engine, seqno) or something?
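
Something like this, perhaps (an untested sketch of the suggested
helper, just mirroring the open-coded check above):

	static inline bool
	__i915_seqno_is_executing(struct intel_engine_cs *engine, u32 seqno)
	{
		/* The request has started once the engine reports the
		 * preceding seqno as completed. */
		return i915_seqno_passed(intel_engine_get_seqno(engine),
					 seqno - 1);
	}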

> +			qos = wait_dma_qos_add();
> +
>   		timeout = io_schedule_timeout(timeout);
>   
>   		if (intel_wait_complete(&wait) &&
> @@ -1412,6 +1470,7 @@ long i915_request_wait(struct i915_request *rq,
>   	if (flags & I915_WAIT_LOCKED)
>   		remove_wait_queue(errq, &reset);
>   	remove_wait_queue(&rq->execute, &exec);
> +	wait_dma_qos_del(qos);
>   	trace_i915_request_wait_end(rq);
>   
>   	return timeout;
>
Regards,

Tvrtko

* ✗ Fi.CI.IGT: failure for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
                   ` (8 preceding siblings ...)
  2018-08-06  9:34 ` [PATCH 1/6] " Tvrtko Ursulin
@ 2018-08-06  9:43 ` Patchwork
  2018-08-07  7:33 ` ✗ Fi.CI.BAT: failure for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request (rev2) Patchwork
  10 siblings, 0 replies; 24+ messages in thread
From: Patchwork @ 2018-08-06  9:43 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] drm/i915: Limit C-states when waiting for the active request
URL   : https://patchwork.freedesktop.org/series/47739/
State : failure

== Summary ==

= CI Bug Log - changes from CI_DRM_4618_full -> Patchwork_9855_full =

== Summary - FAILURE ==

  Serious unknown changes coming with Patchwork_9855_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_9855_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_9855_full:

  === IGT changes ===

    ==== Possible regressions ====

    igt@prime_busy@hang-bsd:
      shard-apl:          PASS -> FAIL

    
    ==== Warnings ====

    igt@kms_cursor_legacy@short-flip-after-cursor-atomic-transitions:
      shard-snb:          SKIP -> PASS

    igt@pm_rc6_residency@rc6-accuracy:
      shard-kbl:          PASS -> SKIP
      shard-snb:          PASS -> SKIP

    
== Known issues ==

  Here are the changes found in Patchwork_9855_full that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@gem_mmap_wc@write-gtt:
      shard-snb:          NOTRUN -> INCOMPLETE (fdo#105411)

    igt@kms_flip@2x-flip-vs-dpms:
      shard-hsw:          PASS -> DMESG-WARN (fdo#102614)

    igt@kms_flip@flip-vs-expired-vblank-interruptible:
      shard-glk:          PASS -> FAIL (fdo#102887, fdo#105363)

    igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-draw-pwrite:
      shard-snb:          NOTRUN -> DMESG-FAIL (fdo#103167)

    igt@kms_setmode@basic:
      shard-apl:          PASS -> FAIL (fdo#99912)

    
    ==== Possible fixes ====

    igt@gem_eio@reset-stress:
      shard-hsw:          FAIL -> PASS

    igt@gem_softpin@noreloc-s3:
      shard-snb:          DMESG-WARN (fdo#102365) -> PASS

    igt@kms_cursor_crc@cursor-128x128-suspend:
      shard-glk:          FAIL (fdo#103375) -> PASS

    igt@kms_flip@2x-flip-vs-expired-vblank-interruptible:
      shard-glk:          FAIL (fdo#105363) -> PASS

    igt@perf_pmu@busy-accuracy-50-bcs0:
      shard-snb:          INCOMPLETE (fdo#105411) -> SKIP

    
  fdo#102365 https://bugs.freedesktop.org/show_bug.cgi?id=102365
  fdo#102614 https://bugs.freedesktop.org/show_bug.cgi?id=102614
  fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
  fdo#103167 https://bugs.freedesktop.org/show_bug.cgi?id=103167
  fdo#103375 https://bugs.freedesktop.org/show_bug.cgi?id=103375
  fdo#105363 https://bugs.freedesktop.org/show_bug.cgi?id=105363
  fdo#105411 https://bugs.freedesktop.org/show_bug.cgi?id=105411
  fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912


== Participating hosts (5 -> 5) ==

  No changes in participating hosts


== Build changes ==

    * Linux: CI_DRM_4618 -> Patchwork_9855

  CI_DRM_4618: a59e87511e8f63b8d28973891d0ab407a19638c6 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4587: 5d78c73d871525ec9caecd88ad7d9abe36637314 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_9855: 17f7463076d9d8f510a35de0efc4c61a12eb3383 @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_9855/shards.html

* Re: [PATCH 4/6] drm/i915: Priority boost for new clients
  2018-08-06  8:30 ` [PATCH 4/6] drm/i915: Priority boost for new clients Chris Wilson
@ 2018-08-06  9:47   ` Tvrtko Ursulin
  2018-08-07  7:29   ` [PATCH v2] " Chris Wilson
  1 sibling, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2018-08-06  9:47 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 06/08/2018 09:30, Chris Wilson wrote:
> Taken from an idea used for FQ_CODEL, we give new request flows a
> priority boost. These flows are likely to correspond with interactive
> tasks and so be more latency sensitive than the long queues. As soon as
> the client has more than one request in the queue, further requests are
> not boosted and it settles down into ordinary steady state behaviour.
> Such small kicks dramatically help combat the starvation issue, by
> allowing each client the opportunity to run even when the system is
> under heavy throughput load (within the constraints of the user
> selected priority).
> 
> Testcase: igt/benchmarks/rrul
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_request.c   | 16 ++++++++++++++--
>   drivers/gpu/drm/i915/i915_scheduler.h |  4 +++-
>   2 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 67fd2ec75d78..63785a5c0418 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1126,8 +1126,20 @@ void i915_request_add(struct i915_request *request)
>   	 */
>   	local_bh_disable();
>   	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
> -	if (engine->schedule)
> -		engine->schedule(request, &request->gem_context->sched);
> +	if (engine->schedule) {
> +		struct i915_sched_attr attr = request->gem_context->sched;
> +
> +		/*
> +		 * Boost priorities to new clients (new request flows).
> +		 *
> +		 * Allow interactive/synchronous clients to jump ahead of
> +		 * the bulk clients. (FQ_CODEL)
> +		 */
> +		if (!prev || i915_request_completed(prev))
> +			attr.priority |= I915_PRIORITY_NEWCLIENT;
> +
> +		engine->schedule(request, &attr);

This AFAIR ended up in an unresolved thread approximately one month ago.

My concern was two unprivileged clients - one (A) sending 1ms batches 
back to back, gem_wait after each, and another (B) sending say 10ms 
batches. In this scenario A would always get a priority bump and would 
keep pre-empting B whose 10ms batch would now effectively never 
complete. So a serious unfairness/starvation issue.

Or a cross-engine "exploit", where a client submits a heavy render 
workload and is then able to jump ahead of the other render clients by 
submitting a no-op batch to an idle engine, if it sets up null/fake 
dependencies with it. That effectively defeats the need to be privileged 
to elevate priority.

Sorry I can only think of the problems at the moment!

Regards,

Tvrtko

> +	}
>   	rcu_read_unlock();
>   	i915_sw_fence_commit(&request->submit);
>   	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 7edfad0abfd7..e9fb6c1d5e42 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -19,12 +19,14 @@ enum {
>   	I915_PRIORITY_INVALID = INT_MIN
>   };
>   
> -#define I915_USER_PRIORITY_SHIFT 0
> +#define I915_USER_PRIORITY_SHIFT 1
>   #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
>   
>   #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
>   #define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
>   
> +#define I915_PRIORITY_NEWCLIENT BIT(0)
> +
>   struct i915_sched_attr {
>   	/**
>   	 * @priority: execution and service priority
> 

* Re: [PATCH 6/6] drm/i915: Priority boost for waiting clients
  2018-08-06  8:30 ` [PATCH 6/6] drm/i915: Priority boost for waiting clients Chris Wilson
@ 2018-08-06  9:53   ` Tvrtko Ursulin
  2018-08-06 10:03     ` Chris Wilson
  0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2018-08-06  9:53 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 06/08/2018 09:30, Chris Wilson wrote:
> Latency is in the eye of the beholder. In the case where a client stops
> and waits for the gpu, give that request chain a small priority boost
> (not so that it overtakes higher priority clients, to preserve the
> external ordering) so that ideally the wait completes earlier.

Going back to my comments made against the new client boost patch - 
perhaps if you make it so that preemption is not triggered for deltas 
within the internal priority bits, it is all acceptable. Not sure how 
much of the positive effect you observed remains though.
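
Roughly this, for illustration (a hypothetical sketch, reusing
I915_PRIORITY_MASK from this series to strip the internal bits before
comparing):

	static bool need_preempt(int active_prio, int new_prio)
	{
		/* Only a higher user-visible priority preempts; the
		 * internal boost bits reorder the queue but never
		 * trigger preemption on their own. */
		return (new_prio & I915_PRIORITY_MASK) >
		       (active_prio & I915_PRIORITY_MASK);
	}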

Regards,

Tvrtko

> 
> Testcase: igt/gem_sync/switch-default
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c       |  5 +++-
>   drivers/gpu/drm/i915/i915_request.c   |  2 ++
>   drivers/gpu/drm/i915/i915_request.h   |  5 ++--
>   drivers/gpu/drm/i915/i915_scheduler.c | 34 ++++++++++++++++++++++-----
>   drivers/gpu/drm/i915/i915_scheduler.h |  5 +++-
>   5 files changed, 41 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 460f256114f7..033d8057dd83 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1748,6 +1748,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>   	 */
>   	err = i915_gem_object_wait(obj,
>   				   I915_WAIT_INTERRUPTIBLE |
> +				   I915_WAIT_PRIORITY |
>   				   (write_domain ? I915_WAIT_ALL : 0),
>   				   MAX_SCHEDULE_TIMEOUT,
>   				   to_rps_client(file));
> @@ -3733,7 +3734,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	start = ktime_get();
>   
>   	ret = i915_gem_object_wait(obj,
> -				   I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
> +				   I915_WAIT_INTERRUPTIBLE |
> +				   I915_WAIT_PRIORITY |
> +				   I915_WAIT_ALL,
>   				   to_wait_timeout(args->timeout_ns),
>   				   to_rps_client(file));
>   
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index e33cdd95bdc3..105a9e45277b 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1289,6 +1289,8 @@ long i915_request_wait(struct i915_request *rq,
>   		add_wait_queue(errq, &reset);
>   
>   	intel_wait_init(&wait);
> +	if (flags & I915_WAIT_PRIORITY)
> +		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
>   
>   restart:
>   	do {
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 549d56d0ba1c..2898ca74fa27 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -269,8 +269,9 @@ long i915_request_wait(struct i915_request *rq,
>   	__attribute__((nonnull(1)));
>   #define I915_WAIT_INTERRUPTIBLE	BIT(0)
>   #define I915_WAIT_LOCKED	BIT(1) /* struct_mutex held, handle GPU reset */
> -#define I915_WAIT_ALL		BIT(2) /* used by i915_gem_object_wait() */
> -#define I915_WAIT_FOR_IDLE_BOOST BIT(3)
> +#define I915_WAIT_PRIORITY	BIT(2) /* small priority bump for the request */
> +#define I915_WAIT_ALL		BIT(3) /* used by i915_gem_object_wait() */
> +#define I915_WAIT_FOR_IDLE_BOOST BIT(4)
>   
>   static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine);
>   
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 749a192b4628..2da651db5029 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -239,7 +239,8 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
>   	return engine;
>   }
>   
> -void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
> +static void __i915_schedule(struct i915_request *rq,
> +			    const struct i915_sched_attr *attr)
>   {
>   	struct list_head *uninitialized_var(pl);
>   	struct intel_engine_cs *engine, *last;
> @@ -248,6 +249,8 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
>   	const int prio = attr->priority;
>   	LIST_HEAD(dfs);
>   
> +	/* Needed in order to use the temporary link inside i915_dependency */
> +	lockdep_assert_held(&schedule_lock);
>   	GEM_BUG_ON(prio == I915_PRIORITY_INVALID);
>   
>   	if (i915_request_completed(rq))
> @@ -256,9 +259,6 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
>   	if (prio <= READ_ONCE(rq->sched.attr.priority))
>   		return;
>   
> -	/* Needed in order to use the temporary link inside i915_dependency */
> -	spin_lock(&schedule_lock);
> -
>   	stack.signaler = &rq->sched;
>   	list_add(&stack.dfs_link, &dfs);
>   
> @@ -312,7 +312,7 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
>   		rq->sched.attr = *attr;
>   
>   		if (stack.dfs_link.next == stack.dfs_link.prev)
> -			goto out_unlock;
> +			return;
>   
>   		__list_del_entry(&stack.dfs_link);
>   	}
> @@ -350,7 +350,29 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
>   	}
>   
>   	spin_unlock_irq(&engine->timeline.lock);
> +}
>   
> -out_unlock:
> +void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
> +{
> +	spin_lock(&schedule_lock);
> +	__i915_schedule(rq, attr);
>   	spin_unlock(&schedule_lock);
>   }
> +
> +void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump)
> +{
> +	struct i915_sched_attr attr;
> +
> +	GEM_BUG_ON(bump & I915_PRIORITY_MASK);
> +
> +	if (READ_ONCE(rq->sched.attr.priority) == I915_PRIORITY_INVALID)
> +		return;
> +
> +	spin_lock_bh(&schedule_lock);
> +
> +	attr = rq->sched.attr;
> +	attr.priority |= bump;
> +	__i915_schedule(rq, &attr);
> +
> +	spin_unlock_bh(&schedule_lock);
> +}
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 9b21ff72f619..e5120ef974d4 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -24,12 +24,13 @@ enum {
>   	I915_PRIORITY_INVALID = INT_MIN
>   };
>   
> -#define I915_USER_PRIORITY_SHIFT 1
> +#define I915_USER_PRIORITY_SHIFT 2
>   #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
>   
>   #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
>   #define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
>   
> +#define I915_PRIORITY_WAIT BIT(1)
>   #define I915_PRIORITY_NEWCLIENT BIT(0)
>   
>   struct i915_sched_attr {
> @@ -99,6 +100,8 @@ void i915_sched_node_fini(struct drm_i915_private *i915,
>   void i915_schedule(struct i915_request *request,
>   		   const struct i915_sched_attr *attr);
>   
> +void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump);
> +
>   struct list_head *
>   i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio);
>   
> 

* Re: [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request
  2018-08-06  9:34 ` [PATCH 1/6] " Tvrtko Ursulin
@ 2018-08-06  9:59   ` Chris Wilson
  2018-08-06 10:28     ` Tvrtko Ursulin
  0 siblings, 1 reply; 24+ messages in thread
From: Chris Wilson @ 2018-08-06  9:59 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: Eero Tamminen

Quoting Tvrtko Ursulin (2018-08-06 10:34:54)
> 
> On 06/08/2018 09:30, Chris Wilson wrote:
> > If we are waiting for the currently executing request, we have a good
> > idea that it will be completed in the very near future and so want to
> > cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
> 
> I cannot shake the opinion that we shouldn't be doing this. For instance 
> what if the client has been re-niced (down), or it has re-niced itself? 
> Obviously wrong to apply this for those.

Niceness only affects the task's position on the scheduler runqueue; it
doesn't actually have any cpufreq implications (give or take RT
heuristics). So I don't think we need a tsk->prio restriction.

> Or when you say we have a good idea something will be completed in the 
> very near future. Say there is a 60fps workload which is sending 5ms 
> batches and waits on them. That would be 30% of time spent outside of 
> low C states for a workload which doesn't need it.

Quite frankly, they shouldn't be using wait on the current frame. For
example, in mesa you wait for the end of the previous frame which should
be roughly complete, and since it is a stall before computing the next,
latency is still important.
 
> Also having read what the OpenCL does, where they want to apply 
> different wait optimisations for different call-sites, the idea that we 
> should instead be introducing a low-latency flag to wait ioctl sounds 
> more appropriate.

I'm not impressed by what I've heard there yet. There's also the
dilemma with what to do with dma-fence poll().

> > +             if (!qos &&
> > +                 i915_seqno_passed(intel_engine_get_seqno(rq->engine),
> > +                                   wait.seqno - 1))
> 
> I also realized that this will get incorrectly applied when there is 
> preemption. If a low-priority request gets preempted after we applied 
> the PM QoS it will persist for much longer than intended. (Until the 
> high-prio request completes and then low-prio one.) And the explicit 
> low-latency wait flag would have the same problem. We could perhaps go 
> with removing the PM QoS request if preempted. It should not be frequent 
> enough to cause issue with too much traffic on the API. But

Sure, I didn't think it was worth worrying about. We could cancel it and
reset it on next execution.
 
> Another side note - quick grep shows there are a few other "seqno - 1" 
> callsites so perhaps we should add a helper for this with a more 
> self-explanatory like __i915_seqno_is_executing(engine, seqno) or something?

I briefly considered something along those lines,
intel_engine_has_signaled(), intel_engine_has_started. I also noticed
that I didn't kill i915_request_started even though I thought we had.
-Chris

* Re: [PATCH 6/6] drm/i915: Priority boost for waiting clients
  2018-08-06  9:53   ` Tvrtko Ursulin
@ 2018-08-06 10:03     ` Chris Wilson
  2018-08-06 10:30       ` Tvrtko Ursulin
  0 siblings, 1 reply; 24+ messages in thread
From: Chris Wilson @ 2018-08-06 10:03 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2018-08-06 10:53:52)
> 
> On 06/08/2018 09:30, Chris Wilson wrote:
> > Latency is in the eye of the beholder. In the case where a client stops
> > and waits for the gpu, give that request chain a small priority boost
> > (not so that it overtakes higher priority clients, to preserve the
> > external ordering) so that ideally the wait completes earlier.
> 
> Going back to my comments made against the new client boost patch - 
> perhaps if you make it so that preemption is not triggered for deltas 
> within the internal priority bits, it is all acceptable. Not sure how 
> much of the positive effect you observed remains though.

That entirely defeats the point of being able to preempt within the same
user priority; so no more "fairness" wrt FQ_CODEL and no more cheating
here.
-Chris

* Re: [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request
  2018-08-06  9:59   ` Chris Wilson
@ 2018-08-06 10:28     ` Tvrtko Ursulin
  0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2018-08-06 10:28 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Eero Tamminen


On 06/08/2018 10:59, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-08-06 10:34:54)
>>
>> On 06/08/2018 09:30, Chris Wilson wrote:
>>> If we are waiting for the currently executing request, we have a good
>>> idea that it will be completed in the very near future and so want to
>>> cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
>>
>> I cannot shake the opinion that we shouldn't be doing this. For instance
>> what if the client has been re-niced (down), or it has re-niced itself?
>> Obviously wrong to apply this for those.
> 
> Niceness only affects the task's position on the scheduler runqueue; it
> doesn't actually have any cpufreq implications (give or take RT
> heuristics). So I don't think we need a tsk->prio restriction.

I was thinking the client obviously doesn't care about latency or 
anything (more or less) so we would be incorrectly applying the PM QoS 
request.

>> Or when you say we have a good idea something will be completed in the
>> very near future. Say there is a 60fps workload which is sending 5ms
>> batches and waits on them. That is 5ms of every ~16.7ms frame, so ~30%
>> of time spent outside of low C-states for a workload which doesn't need it.
> 
> Quite frankly, they shouldn't be using wait on the current frame. For
> example, in mesa you wait for the end of the previous frame which should
> be roughly complete, and since it is a stall before computing the next,
> latency is still important.

Maybe, but I think we shouldn't go into details on how a client might 
use it, and create limitations / hidden gotchas on this level if they do 
not behave as we expect / prescribe.

But I have noticed that so far you have been avoiding commenting on the 
idea of an explicit flag. :)

Regards,

Tvrtko

>   
>> Also, having read what the OpenCL driver does, where they want to apply
>> different wait optimisations for different call-sites, the idea that we
>> should instead be introducing a low-latency flag to the wait ioctl
>> sounds more appropriate.
> 
> I'm not impressed by what I've heard there yet. There's also the
> dilemma with what to do with dma-fence poll().
> 
>>> +             if (!qos &&
>>> +                 i915_seqno_passed(intel_engine_get_seqno(rq->engine),
>>> +                                   wait.seqno - 1))
>>
>> I also realized that this will get incorrectly applied when there is
>> preemption. If a low-priority request gets preempted after we applied
>> the PM QoS it will persist for much longer than intended. (Until the
>> high-prio request completes and then the low-prio one.) And the explicit
>> low-latency wait flag would have the same problem. We could perhaps go
>> with removing the PM QoS request if preempted. It should not be frequent
>> enough to cause an issue with too much traffic on the API.
> 
> Sure, I didn't think it was worth worrying about. We could cancel it and
> reset it on next execution.
>   
>> Another side note - a quick grep shows there are a few other "seqno - 1"
>> callsites, so perhaps we should add a helper for this with a more
>> self-explanatory name like __i915_seqno_is_executing(engine, seqno) or something?
> 
> I briefly considered something along those lines,
> intel_engine_has_signaled(), intel_engine_has_started. I also noticed
> that I didn't kill i915_request_started even though I thought we had.
> -Chris
> 

* Re: [PATCH 6/6] drm/i915: Priority boost for waiting clients
  2018-08-06 10:03     ` Chris Wilson
@ 2018-08-06 10:30       ` Tvrtko Ursulin
  0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2018-08-06 10:30 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 06/08/2018 11:03, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-08-06 10:53:52)
>>
>> On 06/08/2018 09:30, Chris Wilson wrote:
>>> Latency is in the eye of the beholder. In the case where a client stops
>>> and waits for the gpu, give that request chain a small priority boost
>>> (not so that it overtakes higher priority clients, to preserve the
>>> external ordering) so that ideally the wait completes earlier.
>>
>> Going back to my comments made against the new client boost patch -
>> perhaps if you make it so that preemption is not triggered for deltas
>> within the internal priority bits, it is all acceptable. Not sure how
>> much of the positive effect you observed remains though.
> 
> That entirely defeats the point of being able to preempt within the same
> user priority; so no more "fairness" wrt FQ_CODEL and no more cheating
> here.

In this case postpone these tweaks until after we implement time-slicing? :)

Regards,

Tvrtko

* [PATCH v2] drm/i915: Priority boost for new clients
  2018-08-06  8:30 ` [PATCH 4/6] drm/i915: Priority boost for new clients Chris Wilson
  2018-08-06  9:47   ` Tvrtko Ursulin
@ 2018-08-07  7:29   ` Chris Wilson
  2018-08-07  9:08     ` Tvrtko Ursulin
  1 sibling, 1 reply; 24+ messages in thread
From: Chris Wilson @ 2018-08-07  7:29 UTC (permalink / raw)
  To: intel-gfx

Taken from an idea used for FQ_CODEL, we give the first request of a
new request flow a small priority boost. These flows are likely to
correspond with short, interactive tasks and so be more latency sensitive
than the longer free running queues. As soon as the client has more than
one request in the queue, further requests are not boosted and it settles
down into ordinary steady state behaviour.  Such small kicks dramatically
help combat the starvation issue, by allowing each client the opportunity
to run even when the system is under heavy throughput load (within the
constraints of the user selected priority).
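
The gating test itself is tiny (a sketch of the rule, mirroring the
i915_request_add() hunk below): a request only counts as the start of a
new flow if the client has nothing else still in flight.

	/* Sketch: boost only the first request of an otherwise idle client. */
	static bool starts_new_flow(const struct i915_request *prev)
	{
		return !prev || i915_request_completed(prev);
	}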

v2: Mark the preempted request as the start of a new flow, to prevent a
single client being continually gazumped by its peers.

Testcase: igt/benchmarks/rrul
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_request.c   | 16 ++++++++++++++--
 drivers/gpu/drm/i915/i915_scheduler.h |  4 +++-
 drivers/gpu/drm/i915/intel_lrc.c      | 23 ++++++++++++++++++-----
 3 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index fdcd48651973..334a56daefad 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1125,8 +1125,20 @@ void i915_request_add(struct i915_request *request)
 	 */
 	local_bh_disable();
 	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
-	if (engine->schedule)
-		engine->schedule(request, &request->gem_context->sched);
+	if (engine->schedule) {
+		struct i915_sched_attr attr = request->gem_context->sched;
+
+		/*
+		 * Boost priorities to new clients (new request flows).
+		 *
+		 * Allow interactive/synchronous clients to jump ahead of
+		 * the bulk clients. (FQ_CODEL)
+		 */
+		if (!prev || i915_request_completed(prev))
+			attr.priority |= I915_PRIORITY_NEWCLIENT;
+
+		engine->schedule(request, &attr);
+	}
 	rcu_read_unlock();
 	i915_sw_fence_commit(&request->submit);
 	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 7edfad0abfd7..e9fb6c1d5e42 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -19,12 +19,14 @@ enum {
 	I915_PRIORITY_INVALID = INT_MIN
 };
 
-#define I915_USER_PRIORITY_SHIFT 0
+#define I915_USER_PRIORITY_SHIFT 1
 #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
 
 #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
 #define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
 
+#define I915_PRIORITY_NEWCLIENT BIT(0)
+
 struct i915_sched_attr {
 	/**
 	 * @priority: execution and service priority
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1558214eb502..6dec59872282 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -363,9 +363,9 @@ static void unwind_wa_tail(struct i915_request *rq)
 
 static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
-	struct i915_request *rq, *rn;
+	struct i915_request *rq, *rn, *active = NULL;
 	struct list_head *uninitialized_var(pl);
-	int last_prio = I915_PRIORITY_INVALID;
+	int prio = I915_PRIORITY_INVALID;
 
 	lockdep_assert_held(&engine->timeline.lock);
 
@@ -379,12 +379,25 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		unwind_wa_tail(rq);
 
 		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
-		if (rq_prio(rq) != last_prio) {
-			last_prio = rq_prio(rq);
-			pl = lookup_priolist(engine, last_prio);
+		if (rq_prio(rq) != prio) {
+			prio = rq_prio(rq);
+			pl = lookup_priolist(engine, prio);
 		}
 
 		list_add(&rq->sched.link, pl);
+
+		active = rq;
+	}
+
+	/*
+	 * The active request is now effectively the start of a new client
+	 * stream, so give it the equivalent small priority bump to prevent
+	 * it being gazumped a second time by another peer.
+	 */
+	if (!(prio & I915_PRIORITY_NEWCLIENT)) {
+		list_move_tail(&active->sched.link,
+			       lookup_priolist(engine,
+					       prio | I915_PRIORITY_NEWCLIENT));
 	}
 }
 
-- 
2.18.0

* ✗ Fi.CI.BAT: failure for series starting with [1/6] drm/i915: Limit C-states when waiting for the active request (rev2)
  2018-08-06  8:30 [PATCH 1/6] drm/i915: Limit C-states when waiting for the active request Chris Wilson
                   ` (9 preceding siblings ...)
  2018-08-06  9:43 ` ✗ Fi.CI.IGT: failure for series starting with [1/6] " Patchwork
@ 2018-08-07  7:33 ` Patchwork
  10 siblings, 0 replies; 24+ messages in thread
From: Patchwork @ 2018-08-07  7:33 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] drm/i915: Limit C-states when waiting for the active request (rev2)
URL   : https://patchwork.freedesktop.org/series/47739/
State : failure

== Summary ==

Applying: drm/i915: Limit C-states when waiting for the active request
Applying: drm/i915: Reserve some priority bits for internal use
Applying: drm/i915: Combine multiple internal plists into the same i915_priolist bucket
Applying: drm/i915: Priority boost for new clients
Applying: drm/i915: Pull scheduling under standalone lock
Using index info to reconstruct a base tree...
M	drivers/gpu/drm/i915/intel_lrc.c
M	drivers/gpu/drm/i915/intel_ringbuffer.h
Falling back to patching base and 3-way merge...
Auto-merging drivers/gpu/drm/i915/intel_ringbuffer.h
Auto-merging drivers/gpu/drm/i915/intel_lrc.c
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/intel_lrc.c
error: Failed to merge in the changes.
Patch failed at 0005 drm/i915: Pull scheduling under standalone lock
Use 'git am --show-current-patch' to see the failed patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

* Re: [PATCH v2] drm/i915: Priority boost for new clients
  2018-08-07  7:29   ` [PATCH v2] " Chris Wilson
@ 2018-08-07  9:08     ` Tvrtko Ursulin
  2018-08-07 15:02       ` Chris Wilson
  0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2018-08-07  9:08 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 07/08/2018 08:29, Chris Wilson wrote:
> Taken from an idea used for FQ_CODEL, we give the first request of a
> new request flow a small priority boost. These flows are likely to
> correspond with short, interactive tasks and so be more latency sensitive
> than the longer free running queues. As soon as the client has more than
> one request in the queue, further requests are not boosted and it settles
> down into ordinary steady state behaviour.  Such small kicks dramatically
> help combat the starvation issue, by allowing each client the opportunity
> to run even when the system is under heavy throughput load (within the
> constraints of the user selected priority).
> 
> v2: Mark the preempted request as the start of a new flow, to prevent a
> single client being continually gazumped by its peers.
> 
> Testcase: igt/benchmarks/rrul
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_request.c   | 16 ++++++++++++++--
>   drivers/gpu/drm/i915/i915_scheduler.h |  4 +++-
>   drivers/gpu/drm/i915/intel_lrc.c      | 23 ++++++++++++++++++-----
>   3 files changed, 35 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index fdcd48651973..334a56daefad 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1125,8 +1125,20 @@ void i915_request_add(struct i915_request *request)
>   	 */
>   	local_bh_disable();
>   	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
> -	if (engine->schedule)
> -		engine->schedule(request, &request->gem_context->sched);
> +	if (engine->schedule) {
> +		struct i915_sched_attr attr = request->gem_context->sched;
> +
> +		/*
> +		 * Boost priorities to new clients (new request flows).
> +		 *
> +		 * Allow interactive/synchronous clients to jump ahead of
> +		 * the bulk clients. (FQ_CODEL)
> +		 */
> +		if (!prev || i915_request_completed(prev))
> +			attr.priority |= I915_PRIORITY_NEWCLIENT;
> +
> +		engine->schedule(request, &attr);
> +	}
>   	rcu_read_unlock();
>   	i915_sw_fence_commit(&request->submit);
>   	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 7edfad0abfd7..e9fb6c1d5e42 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -19,12 +19,14 @@ enum {
>   	I915_PRIORITY_INVALID = INT_MIN
>   };
>   
> -#define I915_USER_PRIORITY_SHIFT 0
> +#define I915_USER_PRIORITY_SHIFT 1
>   #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
>   
>   #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
>   #define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
>   
> +#define I915_PRIORITY_NEWCLIENT BIT(0)
> +
>   struct i915_sched_attr {
>   	/**
>   	 * @priority: execution and service priority
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 1558214eb502..6dec59872282 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -363,9 +363,9 @@ static void unwind_wa_tail(struct i915_request *rq)
>   
>   static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   {
> -	struct i915_request *rq, *rn;
> +	struct i915_request *rq, *rn, *active = NULL;
>   	struct list_head *uninitialized_var(pl);
> -	int last_prio = I915_PRIORITY_INVALID;
> +	int prio = I915_PRIORITY_INVALID;
>   
>   	lockdep_assert_held(&engine->timeline.lock);
>   
> @@ -379,12 +379,25 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   		unwind_wa_tail(rq);
>   
>   		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> -		if (rq_prio(rq) != last_prio) {
> -			last_prio = rq_prio(rq);
> -			pl = lookup_priolist(engine, last_prio);
> +		if (rq_prio(rq) != prio) {
> +			prio = rq_prio(rq);
> +			pl = lookup_priolist(engine, prio);
>   		}
>   
>   		list_add(&rq->sched.link, pl);
> +
> +		active = rq;
> +	}
> +
> +	/*
> +	 * The active request is now effectively the start of a new client
> +	 * stream, so give it the equivalent small priority bump to prevent
> +	 * it being gazumped a second time by another peer.
> +	 */
> +	if (!(prio & I915_PRIORITY_NEWCLIENT)) {
> +		list_move_tail(&active->sched.link,
> +			       lookup_priolist(engine,
> +					       prio | I915_PRIORITY_NEWCLIENT));
>   	}
>   }
>   
> 

This sounds fair, I think I'm okay with it. Does it still work well for 
mixed media workloads?

Regards,

Tvrtko


* Re: [PATCH v2] drm/i915: Priority boost for new clients
  2018-08-07  9:08     ` Tvrtko Ursulin
@ 2018-08-07 15:02       ` Chris Wilson
  2018-08-08 12:40         ` Tvrtko Ursulin
  0 siblings, 1 reply; 24+ messages in thread
From: Chris Wilson @ 2018-08-07 15:02 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2018-08-07 10:08:28)
> 
> On 07/08/2018 08:29, Chris Wilson wrote:
> > +     /*
> > +      * The active request is now effectively the start of a new client
> > +      * stream, so give it the equivalent small priority bump to prevent
> > +      * it being gazumped a second time by another peer.
> > +      */
> > +     if (active && !(prio & I915_PRIORITY_NEWCLIENT)) {
> > +             list_move_tail(&active->sched.link,
> > +                            lookup_priolist(engine,
> > +                                            prio | I915_PRIORITY_NEWCLIENT));
> >       }
> >   }
> >   
> > 
> 
> This sounds fair; I think I'm okay with it. Does it still work well for
> mixed media workloads?

Grr. Current drm-tip is scoring much, much higher in fairness than I
remember, and there's no apparent improvement, nor even much room for
possible improvement. When in doubt, blame ksoftirqd ;)

Quite surprising though, it was (at least if memory serves) a dramatic
improvement from this patch. Time to see if the metrics do resemble what
I think they should be, and to see if I can find a good example in
media-bench.pl
-Chris

* Re: [PATCH v2] drm/i915: Priority boost for new clients
  2018-08-07 15:02       ` Chris Wilson
@ 2018-08-08 12:40         ` Tvrtko Ursulin
  2018-08-08 18:53           ` Chris Wilson
  0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2018-08-08 12:40 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 07/08/2018 16:02, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-08-07 10:08:28)
>>
>> On 07/08/2018 08:29, Chris Wilson wrote:
>>> +     /*
>>> +      * The active request is now effectively the start of a new client
>>> +      * stream, so give it the equivalent small priority bump to prevent
>>> +      * it being gazumped a second time by another peer.
>>> +      */
>>> +     if (!(prio & I915_PRIORITY_NEWCLIENT)) {
>>> +             list_move_tail(&active->sched.link,
>>> +                            lookup_priolist(engine,
>>> +                                            prio | I915_PRIORITY_NEWCLIENT));
>>>        }
>>>    }
>>>    
>>>
>>
>> This sounds fair; I think I'm okay with it. Does it still work well for
>> mixed media workloads?
> 
> > Grr. Current drm-tip is scoring much, much higher in fairness than I
> > remember, and there's no apparent improvement, nor even much room for
> > possible improvement. When in doubt, blame ksoftirqd ;)

Perhaps previous testing was before direct submission?

> Quite surprising though, it was (at least if memory serves) a dramatic
> improvement from this patch. Time to see if the metrics do resemble what
> I think they should be, and to see if I can find a good example in
> media-bench.pl

I should maybe rename it to wsim-bench.pl with a switch to select 
workload groups per directory or something...

Regards,

Tvrtko

* Re: [PATCH v2] drm/i915: Priority boost for new clients
  2018-08-08 12:40         ` Tvrtko Ursulin
@ 2018-08-08 18:53           ` Chris Wilson
  2018-08-08 19:24             ` Chris Wilson
  0 siblings, 1 reply; 24+ messages in thread
From: Chris Wilson @ 2018-08-08 18:53 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2018-08-08 13:40:41)
> 
> On 07/08/2018 16:02, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-08-07 10:08:28)
> >>
> >> On 07/08/2018 08:29, Chris Wilson wrote:
> >>> +     /*
> >>> +      * The active request is now effectively the start of a new client
> >>> +      * stream, so give it the equivalent small priority bump to prevent
> >>> +      * it being gazumped a second time by another peer.
> >>> +      */
> >>> +     if (!(prio & I915_PRIORITY_NEWCLIENT)) {
> >>> +             list_move_tail(&active->sched.link,
> >>> +                            lookup_priolist(engine,
> >>> +                                            prio | I915_PRIORITY_NEWCLIENT));
> >>>        }
> >>>    }
> >>>    
> >>>
> >>
> >> This sounds fair; I think I'm okay with it. Does it still work well for
> >> mixed media workloads?
> > 
> > > Grr. Current drm-tip is scoring much, much higher in fairness than I
> > > remember, and there's no apparent improvement, nor even much room for
> > > possible improvement. When in doubt, blame ksoftirqd ;)
> 
> Perhaps previous testing was before direct submission?

I went back and checked that; unfortunately it's not that simple. The
influencing factor appears to be the choice of workload. I am seeing
some now that respond favourably (by pure chance selection) but I need
to spend more time to ensure the results are stable, and see if there's
any method to the madness in selection.
 
> > Quite surprising though, it was (at least if memory serves) a dramatic
> > improvement from this patch. Time to see if the metrics do resemble what
> > I think they should be, and to see if I can find a good example in
> > media-bench.pl
> 
> I should maybe rename it to wsim-bench.pl with a switch to select 
> workload groups per directory or something...

What I'm thinking is closer to the expression I want:

diff --git a/scripts/media-bench.pl b/scripts/media-bench.pl
index ddf9c0ec..f39b1bb1 100755
--- a/scripts/media-bench.pl
+++ b/scripts/media-bench.pl
@@ -339,7 +339,6 @@ sub find_saturation_point
                } elsif ($c == 1) {
                        $swps = $wps;
                        return ($c, $wps, $swps, $wwps) if $wcnt > 1 or
-                                                          $multi_mode or
                                                           ($wps_target_param < 0 and
                                                            $wps_target == 0);
                }

We want to find the saturated peak of each wsim. We could use sim_wsim
for this.

@@ -553,7 +552,7 @@ foreach my $wrk (@saturation_workloads) {

                                        # Normalize mixed sum with sum of
                                        # individual runs.
-                                       $w *= 100;
+                                       $w *= 100 * scalar(@multi_workloads);
                                        $w /= $tot;

Knowing the saturation point of each wsim, we should be able to saturate
the engine running them all simultaneously, each taking 1/N of the bw.
(Spherical cows and all that). Applying the factor of N here sets the
normalized target to 100%.
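
As a toy check of that target, with invented numbers: for N=5 workloads
whose standalone saturated rates sum to $tot = 160 wps, a perfectly fair
mix gives each workload a fifth of its own rate, so the mixed sum $w is
160 / 5 = 32 wps, and 32 * 100 * 5 / 160 = 100%.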

                                        # Second metric is average of each
@@ -563,10 +562,11 @@ foreach my $wrk (@saturation_workloads) {
                                        $s = 0;
                                        $widx = 0;
                                        foreach my $wrk (@multi_workloads) {
-                                               $s += 100 * $bwwps->[$widx] /
-                                                     $allwps{$wrk}->{$best_bid{$wrk}};
+                                               my $target = $allwps{$wrk}->{$best_bid{$wrk}} / scalar(@multi_workloads);
+                                               $s += 1. - abs($target - $bwwps->[$widx]) / $target;
                                                $widx++;
                                        }
+                                       $s *= 100;
                                        $s /= scalar(@multi_workloads);

This is the challenge. My idea of fairness is that each wsim gets 1/N of
the bw, and so should see a wps of its saturated max / N. Fairness is then
how close we get to our theoretical fair slice.
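
For example, with invented numbers: a wsim that saturates alone at 30 wps
among N = 3 peers has a fair slice of 30 / 3 = 10 wps; measuring 8 wps in
the mix scores 1 - |10 - 8| / 10 = 0.8, and the reported fairness is the
average of those scores across the workloads, times 100.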

Problem is, the spherical cows are multiplying. It's a box-packing (linear
optimisation) problem; probably best to quantify the ideal fair slice
using sim_wsim and then report how unfair reality was.
-Chris

* Re: [PATCH v2] drm/i915: Priority boost for new clients
  2018-08-08 18:53           ` Chris Wilson
@ 2018-08-08 19:24             ` Chris Wilson
  0 siblings, 0 replies; 24+ messages in thread
From: Chris Wilson @ 2018-08-08 19:24 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Chris Wilson (2018-08-08 19:53:33)
> Quoting Tvrtko Ursulin (2018-08-08 13:40:41)
> > 
> > On 07/08/2018 16:02, Chris Wilson wrote:
> > > Quoting Tvrtko Ursulin (2018-08-07 10:08:28)
> > >>
> > >> On 07/08/2018 08:29, Chris Wilson wrote:
> > >>> +     /*
> > >>> +      * The active request is now effectively the start of a new client
> > >>> +      * stream, so give it the equivalent small priority bump to prevent
> > >>> +      * it being gazumped a second time by another peer.
> > >>> +      */
> > >>> +     if (!(prio & I915_PRIORITY_NEWCLIENT)) {
> > >>> +             list_move_tail(&active->sched.link,
> > >>> +                            lookup_priolist(engine,
> > >>> +                                            prio | I915_PRIORITY_NEWCLIENT));
> > >>>        }
> > >>>    }
> > >>>    
> > >>>
> > >>
> > >> This sounds fair; I think I'm okay with it. Does it still work well for
> > >> mixed media workloads?
> > > 
> > > > Grr. Current drm-tip is scoring much, much higher in fairness than I
> > > > remember, and there's no apparent improvement, nor even much room for
> > > > possible improvement. When in doubt, blame ksoftirqd ;)
> > 
> > Perhaps previous testing was before direct submission?
> 
> I went back and checked that; unfortunately it's not that simple. The
> influencing factor appears to be the choice of workload. I am seeing
> some now that respond favourably (by pure chance selection) but I need
> to spend more time to ensure the results are stable, and see if there's
> any method to the madness in selection.

Early figures, suffering a bit from lack of transparency in the results.
(Using a pinned kbl gt4e)

TL;DR:
w/o boosts:    busy balancer ('-b busy -R'): Aggregate (normalized) 89.28%; fairness 79.64%
w/  boosts:    busy balancer ('-b busy -R'): Aggregate (normalized) 108.92%; fairness 64.72%

Fairness went down (grumble) but that's a large increase in
multi-wsim throughput.

Before:
ickle@kabylake:~/intel-gpu-tools$ sudo ./scripts/media-bench.pl -n 306765 -B busy -m -W media_nn_1080p_s1.wsim,media_nn_1080p_s2.wsim,media_nn_1080p_s3.wsim,media_nn_1080p.wsim,media_nn_480p.wsim
Workloads:
  media_nn_1080p_s1.wsim
  media_nn_1080p_s2.wsim
  media_nn_1080p_s3.wsim
  media_nn_1080p.wsim
  media_nn_480p.wsim
Balancers: busy,
Target workload duration is 10s.
Calibration tolerance is 0.01.
Multi-workload mode.
Nop calibration is 306765.

Evaluating 'media_nn_1080p_s1.wsim'... 10s is 177 workloads. (error=0.000300000000000011)
  Finding saturation points for 'media_nn_1080p_s1.wsim'...
    busy balancer ('-b busy -R'): 4 clients (31.844 wps, 17.671 wps single client, score=49.515).
    busy balancer ('-b busy -R -G'): 4 clients (31.957 wps, 17.667 wps single client, score=49.624).
    busy balancer ('-b busy -R -d'): 3 clients (31.904 wps, 17.677 wps single client, score=49.581).
    busy balancer ('-b busy -R -G -d'): 3 clients (32.046 wps, 17.624 wps single client, score=49.67).

Evaluating 'media_nn_1080p_s2.wsim'... 10s is 183 workloads. (error=0.00609999999999999)
  Finding saturation points for 'media_nn_1080p_s2.wsim'...
    busy balancer ('-b busy -R'): 3 clients (31.984 wps, 18.147 wps single client, score=50.131).
    busy balancer ('-b busy -R -G'): 3 clients (31.929 wps, 18.058 wps single client, score=49.987).
    busy balancer ('-b busy -R -d'): 2 clients (32.061 wps, 18.160 wps single client, score=50.221).
    busy balancer ('-b busy -R -G -d'): 3 clients (31.909 wps, 18.059 wps single client, score=49.968).

Evaluating 'media_nn_1080p_s3.wsim'... 10s is 182 workloads. (error=0.00719999999999992)
  Finding saturation points for 'media_nn_1080p_s3.wsim'...
    busy balancer ('-b busy -R'): 3 clients (31.957 wps, 18.044 wps single client, score=50.001).
    busy balancer ('-b busy -R -G'): 3 clients (31.930 wps, 18.134 wps single client, score=50.064).
    busy balancer ('-b busy -R -d'): 2 clients (32.047 wps, 18.054 wps single client, score=50.101).
    busy balancer ('-b busy -R -G -d'): 2 clients (31.972 wps, 18.040 wps single client, score=50.012).

Evaluating 'media_nn_1080p.wsim'... 10s is 156 workloads. (error=0.00470000000000006)
  Finding saturation points for 'media_nn_1080p.wsim'...
    busy balancer ('-b busy -R'): 4 clients (31.957 wps, 15.667 wps single client, score=47.624).
    busy balancer ('-b busy -R -G'): 4 clients (32.066 wps, 15.684 wps single client, score=47.75).
    busy balancer ('-b busy -R -d'): 5 clients (32.022 wps, 15.886 wps single client, score=47.908).
    busy balancer ('-b busy -R -G -d'): 4 clients (31.939 wps, 15.657 wps single client, score=47.596).

Evaluating 'media_nn_480p.wsim'... 10s is 337 workloads. (error=0.00820000000000007)
  Finding saturation points for 'media_nn_480p.wsim'...
    busy balancer ('-b busy -R'): 5 clients (74.077 wps, 33.412 wps single client, score=107.489).
    busy balancer ('-b busy -R -G'): 4 clients (73.849 wps, 33.415 wps single client, score=107.264).
    busy balancer ('-b busy -R -d'): 5 clients (73.357 wps, 33.319 wps single client, score=106.676).
    busy balancer ('-b busy -R -G -d'): 5 clients (73.540 wps, 33.497 wps single client, score=107.037).

Evaluating '-w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s1.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s2.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s3.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_480p.wsim'... 10s is 72 workloads. (error=0.00109999999999992)
  Finding saturation points for '-w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s1.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s2.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s3.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_480p.wsim'...
    busy balancer ('-b busy -R'): Aggregate (normalized) 89.28%; fairness 79.64%
    busy balancer ('-b busy -R -G'): Aggregate (normalized) 89.42%; fairness 79.52%
    busy balancer ('-b busy -R -d'): Aggregate (normalized) 89.60%; fairness 79.36%
    busy balancer ('-b busy -R -G -d'): Aggregate (normalized) 89.92%; fairness 79.15%
  Best balancer is '-b busy -R -G -d' (range=0.000886735101758962).

After:
ickle@kabylake:~/intel-gpu-tools$ sudo ./scripts/media-bench.pl -n 306765 -B busy -m -W media_nn_1080p_s1.wsim,media_nn_1080p_s2.wsim,media_nn_1080p_s3.wsim,media_nn_1080p.wsim,media_nn_480p.wsim
Workloads:
  media_nn_1080p_s1.wsim
  media_nn_1080p_s2.wsim
  media_nn_1080p_s3.wsim
  media_nn_1080p.wsim
  media_nn_480p.wsim
Balancers: busy,
Target workload duration is 10s.
Calibration tolerance is 0.01.
Multi-workload mode.
Nop calibration is 306765.

Evaluating 'media_nn_1080p_s1.wsim'... 10s is 177 workloads. (error=0.00180000000000007)
  Finding saturation points for 'media_nn_1080p_s1.wsim'...
    busy balancer ('-b busy -R'): 3 clients (31.846 wps, 17.674 wps single client, score=49.52).
    busy balancer ('-b busy -R -G'): 3 clients (31.929 wps, 17.668 wps single client, score=49.597).
    busy balancer ('-b busy -R -d'): 3 clients (31.874 wps, 17.620 wps single client, score=49.494).
    busy balancer ('-b busy -R -G -d'): 3 clients (31.888 wps, 17.676 wps single client, score=49.564).

Evaluating 'media_nn_1080p_s2.wsim'... 10s is 181 workloads. (error=0.000799999999999912)
  Finding saturation points for 'media_nn_1080p_s2.wsim'...
    busy balancer ('-b busy -R'): 3 clients (31.987 wps, 18.063 wps single client, score=50.05).
    busy balancer ('-b busy -R -G'): 3 clients (31.938 wps, 18.082 wps single client, score=50.02).
    busy balancer ('-b busy -R -d'): 3 clients (31.925 wps, 18.060 wps single client, score=49.985).
    busy balancer ('-b busy -R -G -d'): 2 clients (32.002 wps, 18.087 wps single client, score=50.089).

Evaluating 'media_nn_1080p_s3.wsim'... 10s is 182 workloads. (error=0.00779999999999994)
  Finding saturation points for 'media_nn_1080p_s3.wsim'...
    busy balancer ('-b busy -R'): 3 clients (32.005 wps, 18.042 wps single client, score=50.047).
    busy balancer ('-b busy -R -G'): 3 clients (32.013 wps, 18.156 wps single client, score=50.169).
    busy balancer ('-b busy -R -d'): 3 clients (32.090 wps, 18.079 wps single client, score=50.169).
    busy balancer ('-b busy -R -G -d'): 2 clients (32.094 wps, 18.038 wps single client, score=50.132).

Evaluating 'media_nn_1080p.wsim'... 10s is 156 workloads. (error=0.00459999999999994)
  Finding saturation points for 'media_nn_1080p.wsim'...
    busy balancer ('-b busy -R'): 5 clients (32.010 wps, 15.668 wps single client, score=47.678).
    busy balancer ('-b busy -R -G'): 4 clients (31.922 wps, 15.732 wps single client, score=47.654).
    busy balancer ('-b busy -R -d'): 3 clients (32.003 wps, 15.634 wps single client, score=47.637).
    busy balancer ('-b busy -R -G -d'): 4 clients (31.798 wps, 15.634 wps single client, score=47.432).

Evaluating 'media_nn_480p.wsim'... 10s is 336 workloads. (error=0.00510000000000002)
  Finding saturation points for 'media_nn_480p.wsim'...
    busy balancer ('-b busy -R'): 5 clients (73.326 wps, 33.979 wps single client, score=107.305).
    busy balancer ('-b busy -R -G'): 5 clients (73.588 wps, 33.410 wps single client, score=106.998).
    busy balancer ('-b busy -R -d'): 5 clients (73.734 wps, 33.280 wps single client, score=107.014).
    busy balancer ('-b busy -R -G -d'): 4 clients (73.139 wps, 33.768 wps single client, score=106.907).

Evaluating '-w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s1.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s2.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s3.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_480p.wsim'... 10s is 61 workloads. (error=0.00779999999999994)
  Finding saturation points for '-w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s1.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s2.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s3.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_480p.wsim'...
    busy balancer ('-b busy -R'): Aggregate (normalized) 108.92%; fairness 64.72%
    busy balancer ('-b busy -R -G'): Aggregate (normalized) 109.23%; fairness 64.02%
    busy balancer ('-b busy -R -d'): Aggregate (normalized) 110.08%; fairness 55.50%
    busy balancer ('-b busy -R -G -d'): Aggregate (normalized) 108.42%; fairness 55.71%
  Best balancer is '-b busy -R' (range=0.0547794021204777).
-Chris

