All of lore.kernel.org
 help / color / mirror / Atom feed
* [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency
@ 2020-05-18 14:39 Chris Wilson
  2020-05-18 15:07 ` Mika Kuoppala
                   ` (6 more replies)
  0 siblings, 7 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-18 14:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

v2: Refactor all the instruction building into emitters.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 764 ++++++++++++++++++
 1 file changed, 764 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..c6dff5145a3c 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,765 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
+
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = value;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+static void semaphore_set(u32 *sema, u32 value)
+{
+	WRITE_ONCE(*sema, value);
+	wmb(); /* flush the update to the cache, and beyond */
+}
+
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	return (i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(dw));
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	semaphore_set(sema, -1);
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_store_dw(cs, offset, 0);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		semaphore_set(sema, i);
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of a lite restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		semaphore_set(sema, i - 1);
+		preempt_enable();
+	}
+
+	wait_for(READ_ONCE(sema[i - 1]), 500);
+	semaphore_set(sema, i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_semaphore_poll(cs, mode, value, offset);
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		semaphore_set(sema, 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			semaphore_set(sema, 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 timestamp = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			cs = emit_timestamp_store(cs, ce, timestamp);
+			timestamp += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		u32 timestamp = offset + 2 * i * sizeof(u32);
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, timestamp);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, timestamp + sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, timestamp);
+		cs = emit_store_dw(cs, offset, i);
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 0] - elapsed[i - 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 1] - sema[2 * i + 0]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) to be
+	 * sent from the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		semaphore_set(sema, i);
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2805,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency Chris Wilson
@ 2020-05-18 15:07 ` Mika Kuoppala
  2020-05-18 15:14   ` Chris Wilson
  2020-05-18 16:19 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for drm/i915/selftests: Measure dispatch latency (rev7) Patchwork
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 18+ messages in thread
From: Mika Kuoppala @ 2020-05-18 15:07 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Chris Wilson

Chris Wilson <chris@chris-wilson.co.uk> writes:

> A useful metric of the system's health is how fast we can tell the GPU
> to do various actions, so measure our latency.
>
> v2: Refactor all the instruction building into emitters.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/selftests/i915_request.c | 764 ++++++++++++++++++
>  1 file changed, 764 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
> index 6014e8dfcbb1..c6dff5145a3c 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_request.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_request.c
> @@ -24,16 +24,20 @@
>  
>  #include <linux/prime_numbers.h>
>  #include <linux/pm_qos.h>
> +#include <linux/sort.h>
>  
>  #include "gem/i915_gem_pm.h"
>  #include "gem/selftests/mock_context.h"
>  
> +#include "gt/intel_engine_heartbeat.h"
>  #include "gt/intel_engine_pm.h"
>  #include "gt/intel_engine_user.h"
>  #include "gt/intel_gt.h"
> +#include "gt/intel_gt_requests.h"
>  
>  #include "i915_random.h"
>  #include "i915_selftest.h"
> +#include "igt_flush_test.h"
>  #include "igt_live_test.h"
>  #include "igt_spinner.h"
>  #include "lib_sw_fence.h"
> @@ -1524,6 +1528,765 @@ struct perf_series {
>  	struct intel_context *ce[];
>  };
>  
> +#define COUNT 5
> +
> +static int cmp_u32(const void *A, const void *B)
> +{
> +	const u32 *a = A, *b = B;
> +
> +	return *a - *b;
> +}
> +
> +static u32 trifilter(u32 *a)
> +{
> +	u64 sum;
> +
> +	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
> +
> +	sum = mul_u32_u32(a[2], 2);
> +	sum += a[1];
> +	sum += a[3];
> +
> +	return (sum + 2) >> 2;
> +}
> +
> +static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
> +{
> +	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
> +
> +	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
> +}
> +
> +static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
> +{
> +	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
> +	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
> +	*cs++ = offset;
> +	*cs++ = 0;
> +
> +	return cs;
> +}
> +
> +static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
> +{
> +	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> +	*cs++ = offset;
> +	*cs++ = 0;
> +	*cs++ = value;
> +
> +	return cs;
> +}
> +
> +static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
> +{
> +	*cs++ = MI_SEMAPHORE_WAIT |
> +		MI_SEMAPHORE_GLOBAL_GTT |
> +		MI_SEMAPHORE_POLL |
> +		mode;
> +	*cs++ = value;
> +	*cs++ = offset;
> +	*cs++ = 0;
> +
> +	return cs;
> +}
> +
> +static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
> +{
> +	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
> +}
> +
> +static void semaphore_set(u32 *sema, u32 value)
> +{
> +	WRITE_ONCE(*sema, value);
> +	wmb(); /* flush the update to the cache, and beyond */
> +}
> +
> +static u32 *hwsp_scratch(const struct intel_context *ce)
> +{
> +	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
> +}
> +
> +static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
> +{
> +	return (i915_ggtt_offset(ce->engine->status_page.vma) +
> +		offset_in_page(dw));
> +}
> +
> +static int measure_semaphore_response(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[COUNT], cycles;
> +	struct i915_request *rq;
> +	u32 *cs;
> +	int i;
> +
> +	/*
> +	 * Measure how many cycles it takes for the HW to detect the change
> +	 * in a semaphore value.
> +	 *
> +	 *    A: read CS_TIMESTAMP from CPU
> +	 *    poke semaphore
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Semaphore latency: B - A
> +	 */
> +
> +	semaphore_set(sema, -1);
> +
> +	rq = i915_request_create(ce);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
> +	if (IS_ERR(cs)) {
> +		i915_request_add(rq);
> +		return PTR_ERR(cs);
> +	}
> +
> +	cs = emit_store_dw(cs, offset, 0);
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		cs = emit_semaphore_poll_until(cs, offset, i);
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +	}
> +
> +	intel_ring_advance(rq, cs);
> +	i915_request_add(rq);
> +
> +	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return -EIO;
> +	}
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		preempt_disable();
> +		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		semaphore_set(sema, i);
> +		preempt_enable();
> +
> +		if (wait_for(READ_ONCE(sema[i]), 50)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
> +	}
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: semaphore response %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> COUNT,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_idle_dispatch(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[COUNT], cycles;
> +	u32 *cs;
> +	int i;
> +
> +	/*
> +	 * Measure how long it takes for us to submit a request while the
> +	 * engine is idle, but is resting in our context.
> +	 *
> +	 *    A: read CS_TIMESTAMP from CPU
> +	 *    submit request
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Submission latency: B - A
> +	 */
> +
> +	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
> +		struct i915_request *rq;
> +		int err;
> +
> +		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> +		if (err)
> +			return err;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 4);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +
> +		preempt_disable();
> +		local_bh_disable();
> +		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		i915_request_add(rq);
> +		local_bh_enable();
> +		preempt_enable();
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
> +		if (wait_for(READ_ONCE(sema[i]), 50)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
> +	}
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> COUNT,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_busy_dispatch(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[COUNT + 1], cycles;
> +	u32 *cs;
> +	int i;
> +
> +	/*
> +	 * Measure how long it takes for us to submit a request while the
> +	 * engine is busy, polling on a semaphore in our context. With
> +	 * direct submission, this will include the cost of a lite restore.
> +	 *
> +	 *    A: read CS_TIMESTAMP from CPU
> +	 *    submit request
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Submission latency: B - A
> +	 */
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		struct i915_request *rq;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 12);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

Is the dual writes here so that when you kick the semaphore, you get the
latest no matter which side you were on?

-Mika

> +		cs = emit_semaphore_poll_until(cs, offset, i);
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +
> +		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		preempt_disable();
> +		local_bh_disable();
> +		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		i915_request_add(rq);
> +		local_bh_enable();
> +		semaphore_set(sema, i - 1);
> +		preempt_enable();
> +	}
> +
> +	wait_for(READ_ONCE(sema[i - 1]), 500);
> +	semaphore_set(sema, i - 1);
> +
> +	for (i = 1; i <= COUNT; i++)
> +		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> COUNT,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
> +{
> +	const u32 offset =
> +		i915_ggtt_offset(engine->status_page.vma) +
> +		offset_in_page(sema);
> +	struct i915_request *rq;
> +	u32 *cs;
> +
> +	rq = i915_request_create(engine->kernel_context);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	cs = intel_ring_begin(rq, 4);
> +	if (IS_ERR(cs)) {
> +		i915_request_add(rq);
> +		return PTR_ERR(cs);
> +	}
> +
> +	cs = emit_semaphore_poll(cs, mode, value, offset);
> +
> +	intel_ring_advance(rq, cs);
> +	i915_request_add(rq);
> +
> +	return 0;
> +}
> +
> +static int measure_inter_request(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[COUNT + 1], cycles;
> +	struct i915_sw_fence *submit;
> +	int i, err;
> +
> +	/*
> +	 * Measure how long it takes to advance from one request into the
> +	 * next. Between each request we flush the GPU caches to memory,
> +	 * update the breadcrumbs, and then invalidate those caches.
> +	 * We queue up all the requests to be submitted in one batch so
> +	 * it should be one set of contiguous measurements.
> +	 *
> +	 *    A: read CS_TIMESTAMP on GPU
> +	 *    advance request
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Request latency: B - A
> +	 */
> +
> +	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
> +	if (err)
> +		return err;
> +
> +	submit = heap_fence_create(GFP_KERNEL);
> +	if (!submit) {
> +		semaphore_set(sema, 1);
> +		return -ENOMEM;
> +	}
> +
> +	intel_engine_flush_submission(ce->engine);
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		struct i915_request *rq;
> +		u32 *cs;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq)) {
> +			semaphore_set(sema, 1);
> +			return PTR_ERR(rq);
> +		}
> +
> +		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> +						       submit,
> +						       GFP_KERNEL);
> +		if (err < 0) {
> +			i915_sw_fence_commit(submit);
> +			heap_fence_put(submit);
> +			i915_request_add(rq);
> +			semaphore_set(sema, 1);
> +			return err;
> +		}
> +
> +		cs = intel_ring_begin(rq, 4);
> +		if (IS_ERR(cs)) {
> +			i915_sw_fence_commit(submit);
> +			heap_fence_put(submit);
> +			i915_request_add(rq);
> +			semaphore_set(sema, 1);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +		i915_request_add(rq);
> +	}
> +	local_bh_disable();
> +	i915_sw_fence_commit(submit);
> +	local_bh_enable();
> +	intel_engine_flush_submission(ce->engine);
> +	heap_fence_put(submit);
> +
> +	semaphore_set(sema, 1);
> +	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return -EIO;
> +	}
> +
> +	for (i = 1; i <= COUNT; i++)
> +		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: inter-request latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> COUNT,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_context_switch(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	struct i915_request *fence = NULL;
> +	u32 elapsed[COUNT + 1], cycles;
> +	int i, j, err;
> +	u32 *cs;
> +
> +	/*
> +	 * Measure how long it takes to advance from one request in one
> +	 * context to a request in another context. This allows us to
> +	 * measure how long the context save/restore take, along with all
> +	 * the inter-context setup we require.
> +	 *
> +	 *    A: read CS_TIMESTAMP on GPU
> +	 *    switch context
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Context switch latency: B - A
> +	 */
> +
> +	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
> +	if (err)
> +		return err;
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		struct intel_context *arr[] = {
> +			ce, ce->engine->kernel_context
> +		};
> +		u32 timestamp = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
> +
> +		for (j = 0; j < ARRAY_SIZE(arr); j++) {
> +			struct i915_request *rq;
> +
> +			rq = i915_request_create(arr[j]);
> +			if (IS_ERR(rq))
> +				return PTR_ERR(rq);
> +
> +			if (fence) {
> +				err = i915_request_await_dma_fence(rq,
> +								   &fence->fence);
> +				if (err) {
> +					i915_request_add(rq);
> +					return err;
> +				}
> +			}
> +
> +			cs = intel_ring_begin(rq, 4);
> +			if (IS_ERR(cs)) {
> +				i915_request_add(rq);
> +				return PTR_ERR(cs);
> +			}
> +
> +			cs = emit_timestamp_store(cs, ce, timestamp);
> +			timestamp += sizeof(u32);
> +
> +			intel_ring_advance(rq, cs);
> +
> +			i915_request_put(fence);
> +			fence = i915_request_get(rq);
> +
> +			i915_request_add(rq);
> +		}
> +	}
> +	i915_request_put(fence);
> +	intel_engine_flush_submission(ce->engine);
> +
> +	semaphore_set(sema, 1);
> +	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return -EIO;
> +	}
> +
> +	for (i = 1; i <= COUNT; i++)
> +		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: context switch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> COUNT,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_preemption(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[COUNT], cycles;
> +	u32 *cs;
> +	int i;
> +
> +	/*
> +	 * We measure two latencies while triggering preemption. The first
> +	 * latency is how long it takes for us to submit a preempting request.
> +	 * The second latency is how it takes for us to return from the
> +	 * preemption back to the original context.
> +	 *
> +	 *    A: read CS_TIMESTAMP from CPU
> +	 *    submit preemption
> +	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
> +	 *    context switch
> +	 *    C: read CS_TIMESTAMP on GPU (in original context)
> +	 *
> +	 * Preemption dispatch latency: B - A
> +	 * Preemption switch latency: C - B
> +	 */
> +
> +	if (!intel_engine_has_preemption(ce->engine))
> +		return 0;
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		u32 timestamp = offset + 2 * i * sizeof(u32);
> +		struct i915_request *rq;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 12);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, timestamp);
> +		cs = emit_semaphore_poll_until(cs, offset, i);
> +		cs = emit_timestamp_store(cs, ce, timestamp + sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +		i915_request_add(rq);
> +
> +		if (wait_for(READ_ONCE(sema[2 * i]), 500)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		rq = i915_request_create(ce->engine->kernel_context);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 8);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, timestamp);
> +		cs = emit_store_dw(cs, offset, i);
> +
> +		intel_ring_advance(rq, cs);
> +		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
> +
> +		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		i915_request_add(rq);
> +	}
> +
> +	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return -EIO;
> +	}
> +
> +	for (i = 1; i <= COUNT; i++)
> +		elapsed[i - 1] = (sema[2 * i + 0] - elapsed[i - 1]) << COUNT;
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> COUNT,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	for (i = 1; i <= COUNT; i++)
> +		elapsed[i - 1] = (sema[2 * i + 1] - sema[2 * i + 0]) << COUNT;
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> COUNT,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +struct signal_cb {
> +	struct dma_fence_cb base;
> +	bool seen;
> +};
> +
> +static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> +{
> +	struct signal_cb *s = container_of(cb, typeof(*s), base);
> +
> +	smp_store_mb(s->seen, true); /* be safe, be strong */
> +}
> +
> +static int measure_completion(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[COUNT], cycles;
> +	u32 *cs;
> +	int i;
> +
> +	/*
> +	 * Measure how long it takes for the signal (interrupt) to be
> +	 * sent from the GPU to be processed by the CPU.
> +	 *
> +	 *    A: read CS_TIMESTAMP on GPU
> +	 *    signal
> +	 *    B: read CS_TIMESTAMP from CPU
> +	 *
> +	 * Completion latency: B - A
> +	 */
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		struct signal_cb cb = { .seen = false };
> +		struct i915_request *rq;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 12);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +		cs = emit_semaphore_poll_until(cs, offset, i);
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +
> +		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
> +
> +		local_bh_disable();
> +		i915_request_add(rq);
> +		local_bh_enable();
> +
> +		if (wait_for(READ_ONCE(sema[i]), 50)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		preempt_disable();
> +		semaphore_set(sema, i);
> +		while (!READ_ONCE(cb.seen))
> +			cpu_relax();
> +
> +		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		preempt_enable();
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
> +		elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: completion latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> COUNT,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static void rps_pin(struct intel_gt *gt)
> +{
> +	/* Pin the frequency to max */
> +	atomic_inc(&gt->rps.num_waiters);
> +	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
> +
> +	mutex_lock(&gt->rps.lock);
> +	intel_rps_set(&gt->rps, gt->rps.max_freq);
> +	mutex_unlock(&gt->rps.lock);
> +}
> +
> +static void rps_unpin(struct intel_gt *gt)
> +{
> +	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
> +	atomic_dec(&gt->rps.num_waiters);
> +}
> +
> +static void engine_heartbeat_disable(struct intel_engine_cs *engine)
> +{
> +	engine->props.heartbeat_interval_ms = 0;
> +
> +	intel_engine_pm_get(engine);
> +	intel_engine_park_heartbeat(engine);
> +}
> +
> +static void engine_heartbeat_enable(struct intel_engine_cs *engine)
> +{
> +	intel_engine_pm_put(engine);
> +
> +	engine->props.heartbeat_interval_ms =
> +		engine->defaults.heartbeat_interval_ms;
> +}
> +
> +static int perf_request_latency(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_engine_cs *engine;
> +	struct pm_qos_request qos;
> +	int err = 0;
> +
> +	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
> +		return 0;
> +
> +	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
> +
> +	for_each_uabi_engine(engine, i915) {
> +		struct intel_context *ce;
> +
> +		ce = intel_context_create(engine);
> +		if (IS_ERR(ce))
> +			goto out;
> +
> +		err = intel_context_pin(ce);
> +		if (err) {
> +			intel_context_put(ce);
> +			goto out;
> +		}
> +
> +		engine_heartbeat_disable(engine);
> +		rps_pin(engine->gt);
> +
> +		if (err == 0)
> +			err = measure_semaphore_response(ce);
> +		if (err == 0)
> +			err = measure_idle_dispatch(ce);
> +		if (err == 0)
> +			err = measure_busy_dispatch(ce);
> +		if (err == 0)
> +			err = measure_inter_request(ce);
> +		if (err == 0)
> +			err = measure_context_switch(ce);
> +		if (err == 0)
> +			err = measure_preemption(ce);
> +		if (err == 0)
> +			err = measure_completion(ce);
> +
> +		rps_unpin(engine->gt);
> +		engine_heartbeat_enable(engine);
> +
> +		intel_context_unpin(ce);
> +		intel_context_put(ce);
> +		if (err)
> +			goto out;
> +	}
> +
> +out:
> +	if (igt_flush_test(i915))
> +		err = -EIO;
> +
> +	cpu_latency_qos_remove_request(&qos);
> +	return err;
> +}
> +
>  static int s_sync0(void *arg)
>  {
>  	struct perf_series *ps = arg;
> @@ -2042,6 +2805,7 @@ static int perf_parallel_engines(void *arg)
>  int i915_request_perf_selftests(struct drm_i915_private *i915)
>  {
>  	static const struct i915_subtest tests[] = {
> +		SUBTEST(perf_request_latency),
>  		SUBTEST(perf_series_engines),
>  		SUBTEST(perf_parallel_engines),
>  	};
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency
  2020-05-18 15:07 ` Mika Kuoppala
@ 2020-05-18 15:14   ` Chris Wilson
  2020-05-18 15:17     ` Chris Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Chris Wilson @ 2020-05-18 15:14 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2020-05-18 16:07:47)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> > +             cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> 
> Is the dual writes here so that when you kick the semaphore, you get the
> latest no matter which side you were on?

We wait on the first write in the CPU before releasing the semaphore. It
was easier to copy the code, but now it can be an emit_store_dw() to
make it clearer that we are not using it as timestamp -- and avoid the
infinite wait if we hit CS_TIMESTAMP == 0.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency
  2020-05-18 15:14   ` Chris Wilson
@ 2020-05-18 15:17     ` Chris Wilson
  0 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-18 15:17 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Chris Wilson (2020-05-18 16:14:43)
> Quoting Mika Kuoppala (2020-05-18 16:07:47)
> > Chris Wilson <chris@chris-wilson.co.uk> writes:
> > > +             cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> > 
> > Is the dual writes here so that when you kick the semaphore, you get the
> > latest no matter which side you were on?
> 
> We wait on the first write in the CPU before releasing the semaphore. It
> was easier to copy the code, but now it can be an emit_store_dw() to
> make it clearer that we are not using it as timestamp -- and avoid the
> infinite wait if we hit CS_TIMESTAMP == 0.

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index c6dff5145a3c..887171ff21a0 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -1779,7 +1779,7 @@ static int measure_busy_dispatch(struct intel_context *ce)
                        return PTR_ERR(cs);
                }

-               cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+               cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
                cs = emit_semaphore_poll_until(cs, offset, i);
                cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

@@ -1802,8 +1802,10 @@ static int measure_busy_dispatch(struct intel_context *ce)
        wait_for(READ_ONCE(sema[i - 1]), 500);
        semaphore_set(sema, i - 1);

-       for (i = 1; i <= COUNT; i++)
+       for (i = 1; i <= COUNT; i++) {
+               GEM_BUG_ON(sema[i] == -1);
                elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+       }
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Intel-gfx] ✗ Fi.CI.BUILD: failure for drm/i915/selftests: Measure dispatch latency (rev7)
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency Chris Wilson
  2020-05-18 15:07 ` Mika Kuoppala
@ 2020-05-18 16:19 ` Patchwork
  2020-05-18 16:22 ` [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 18+ messages in thread
From: Patchwork @ 2020-05-18 16:19 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/selftests: Measure dispatch latency (rev7)
URL   : https://patchwork.freedesktop.org/series/77308/
State : failure

== Summary ==

Applying: drm/i915/selftests: Measure dispatch latency
error: corrupt patch at line 23
error: could not build fake ancestor
hint: Use 'git am --show-current-patch=diff' to see the failed patch
Patch failed at 0001 drm/i915/selftests: Measure dispatch latency
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency Chris Wilson
  2020-05-18 15:07 ` Mika Kuoppala
  2020-05-18 16:19 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for drm/i915/selftests: Measure dispatch latency (rev7) Patchwork
@ 2020-05-18 16:22 ` Chris Wilson
  2020-05-18 18:15 ` [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev8) Patchwork
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-18 16:22 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

v2: Refactor all the instruction building into emitters.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 767 ++++++++++++++++++
 1 file changed, 767 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..3e8169c9b081 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,768 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+#define TF_COUNT 5
+	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	GEM_BUG_ON(sum > U32_MAX);
+	return sum;
+#define TF_BIAS 2
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
+}
+
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = value;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+static void semaphore_set(u32 *sema, u32 value)
+{
+	WRITE_ONCE(*sema, value);
+	wmb(); /* flush the update to the cache, and beyond */
+}
+
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	return (i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(dw));
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	semaphore_set(sema, -1);
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_store_dw(cs, offset, 0);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		semaphore_set(sema, i);
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = sema[i] - cycles;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = sema[i] - elapsed[i];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of a lite restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		semaphore_set(sema, i - 1);
+		preempt_enable();
+	}
+
+	wait_for(READ_ONCE(sema[i - 1]), 500);
+	semaphore_set(sema, i - 1);
+
+	for (i = 1; i <= TF_COUNT; i++) {
+		GEM_BUG_ON(sema[i] == -1);
+		elapsed[i - 1] = sema[i] - elapsed[i];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_semaphore_poll(cs, mode, value, offset);
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		semaphore_set(sema, 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			semaphore_set(sema, 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[TF_COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[i + 1] - sema[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[TF_COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			cs = emit_timestamp_store(cs, ce, addr);
+			addr += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		u32 addr = offset + 2 * i * sizeof(u32);
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, addr, -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, addr);
+		cs = emit_store_dw(cs, offset, i);
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) to be
+	 * sent from the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		semaphore_set(sema, i);
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = elapsed[i] - sema[i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2808,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev8)
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency Chris Wilson
                   ` (2 preceding siblings ...)
  2020-05-18 16:22 ` [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
@ 2020-05-18 18:15 ` Patchwork
  2020-05-18 20:02 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 18+ messages in thread
From: Patchwork @ 2020-05-18 18:15 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/selftests: Measure dispatch latency (rev8)
URL   : https://patchwork.freedesktop.org/series/77308/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_8495 -> Patchwork_17695
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/index.html

Known issues
------------

  Here are the changes found in Patchwork_17695 that come from known issues:

### IGT changes ###

#### Possible fixes ####

  * igt@debugfs_test@read_all_entries:
    - fi-bsw-nick:        [INCOMPLETE][1] ([i915#1250] / [i915#1436]) -> [PASS][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/fi-bsw-nick/igt@debugfs_test@read_all_entries.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/fi-bsw-nick/igt@debugfs_test@read_all_entries.html

  * igt@i915_selftest@live@execlists:
    - fi-skl-lmem:        [INCOMPLETE][3] ([i915#1874]) -> [PASS][4]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/fi-skl-lmem/igt@i915_selftest@live@execlists.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/fi-skl-lmem/igt@i915_selftest@live@execlists.html
    - fi-whl-u:           [INCOMPLETE][5] ([i915#656]) -> [PASS][6]
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/fi-whl-u/igt@i915_selftest@live@execlists.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/fi-whl-u/igt@i915_selftest@live@execlists.html
    - fi-kbl-guc:         [INCOMPLETE][7] ([i915#656]) -> [PASS][8]
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/fi-kbl-guc/igt@i915_selftest@live@execlists.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/fi-kbl-guc/igt@i915_selftest@live@execlists.html

  
#### Warnings ####

  * igt@i915_pm_rpm@module-reload:
    - fi-kbl-x1275:       [SKIP][9] ([fdo#109271]) -> [FAIL][10] ([i915#62])
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/fi-kbl-x1275/igt@i915_pm_rpm@module-reload.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/fi-kbl-x1275/igt@i915_pm_rpm@module-reload.html

  
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [i915#1250]: https://gitlab.freedesktop.org/drm/intel/issues/1250
  [i915#1436]: https://gitlab.freedesktop.org/drm/intel/issues/1436
  [i915#1874]: https://gitlab.freedesktop.org/drm/intel/issues/1874
  [i915#62]: https://gitlab.freedesktop.org/drm/intel/issues/62
  [i915#656]: https://gitlab.freedesktop.org/drm/intel/issues/656


Participating hosts (52 -> 44)
------------------------------

  Missing    (8): fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-ctg-p8600 fi-kbl-7560u fi-byt-clapper fi-bdw-samus 


Build changes
-------------

  * Linux: CI_DRM_8495 -> Patchwork_17695

  CI-20190529: 20190529
  CI_DRM_8495: 695b7d214a475060bb448c2609e4184fc450d69a @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5659: 66ab5e42811fee3dea8c21ab29e70e323a0650de @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_17695: 2633bb73f44c35def35771444a87a1cc99411d8a @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

2633bb73f44c drm/i915/selftests: Measure dispatch latency

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/index.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Intel-gfx] ✓ Fi.CI.IGT: success for drm/i915/selftests: Measure dispatch latency (rev8)
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency Chris Wilson
                   ` (3 preceding siblings ...)
  2020-05-18 18:15 ` [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev8) Patchwork
@ 2020-05-18 20:02 ` Patchwork
  2020-05-19 11:41 ` [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
  2020-05-19 13:25 ` [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev9) Patchwork
  6 siblings, 0 replies; 18+ messages in thread
From: Patchwork @ 2020-05-18 20:02 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/selftests: Measure dispatch latency (rev8)
URL   : https://patchwork.freedesktop.org/series/77308/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_8495_full -> Patchwork_17695_full
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  

Known issues
------------

  Here are the changes found in Patchwork_17695_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_softpin@noreloc-s3:
    - shard-kbl:          [PASS][1] -> [DMESG-WARN][2] ([i915#180]) +1 similar issue
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-kbl7/igt@gem_softpin@noreloc-s3.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-kbl3/igt@gem_softpin@noreloc-s3.html

  * igt@i915_suspend@fence-restore-tiled2untiled:
    - shard-skl:          [PASS][3] -> [INCOMPLETE][4] ([i915#69])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-skl4/igt@i915_suspend@fence-restore-tiled2untiled.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-skl5/igt@i915_suspend@fence-restore-tiled2untiled.html

  * igt@kms_cursor_crc@pipe-b-cursor-256x256-sliding:
    - shard-skl:          [PASS][5] -> [FAIL][6] ([i915#54])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-skl4/igt@kms_cursor_crc@pipe-b-cursor-256x256-sliding.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-skl6/igt@kms_cursor_crc@pipe-b-cursor-256x256-sliding.html

  * igt@kms_flip_tiling@flip-changes-tiling:
    - shard-apl:          [PASS][7] -> [FAIL][8] ([i915#95])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-apl4/igt@kms_flip_tiling@flip-changes-tiling.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-apl4/igt@kms_flip_tiling@flip-changes-tiling.html

  * igt@kms_plane_alpha_blend@pipe-a-coverage-7efc:
    - shard-skl:          [PASS][9] -> [FAIL][10] ([fdo#108145] / [i915#265])
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-skl4/igt@kms_plane_alpha_blend@pipe-a-coverage-7efc.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-skl5/igt@kms_plane_alpha_blend@pipe-a-coverage-7efc.html

  * igt@kms_vblank@pipe-a-ts-continuation-suspend:
    - shard-apl:          [PASS][11] -> [DMESG-WARN][12] ([i915#180]) +1 similar issue
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-apl7/igt@kms_vblank@pipe-a-ts-continuation-suspend.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-apl1/igt@kms_vblank@pipe-a-ts-continuation-suspend.html

  
#### Possible fixes ####

  * {igt@gem_ctx_isolation@preservation-s3@bcs0}:
    - shard-skl:          [INCOMPLETE][13] ([i915#198]) -> [PASS][14]
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-skl2/igt@gem_ctx_isolation@preservation-s3@bcs0.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-skl4/igt@gem_ctx_isolation@preservation-s3@bcs0.html

  * {igt@gem_ctx_isolation@preservation-s3@rcs0}:
    - shard-apl:          [DMESG-WARN][15] ([i915#180]) -> [PASS][16] +1 similar issue
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-apl6/igt@gem_ctx_isolation@preservation-s3@rcs0.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-apl2/igt@gem_ctx_isolation@preservation-s3@rcs0.html

  * igt@gem_workarounds@suspend-resume-context:
    - shard-kbl:          [DMESG-WARN][17] ([i915#180]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-kbl2/igt@gem_workarounds@suspend-resume-context.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-kbl6/igt@gem_workarounds@suspend-resume-context.html

  * igt@kms_big_fb@y-tiled-32bpp-rotate-90:
    - shard-tglb:         [FAIL][19] ([i915#1172] / [i915#1897] / [i915#402]) -> [PASS][20]
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb3/igt@kms_big_fb@y-tiled-32bpp-rotate-90.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb7/igt@kms_big_fb@y-tiled-32bpp-rotate-90.html

  * igt@kms_color@pipe-b-ctm-max:
    - shard-tglb:         [FAIL][21] ([i915#1149]) -> [PASS][22]
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb5/igt@kms_color@pipe-b-ctm-max.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb1/igt@kms_color@pipe-b-ctm-max.html

  * igt@kms_cursor_crc@pipe-c-cursor-64x64-onscreen:
    - shard-tglb:         [FAIL][23] ([i915#1897]) -> [PASS][24] +9 similar issues
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb7/igt@kms_cursor_crc@pipe-c-cursor-64x64-onscreen.html
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb6/igt@kms_cursor_crc@pipe-c-cursor-64x64-onscreen.html

  * igt@kms_fbcon_fbt@fbc-suspend:
    - shard-tglb:         [FAIL][25] ([i915#64]) -> [PASS][26]
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb7/igt@kms_fbcon_fbt@fbc-suspend.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb6/igt@kms_fbcon_fbt@fbc-suspend.html

  * {igt@kms_flip@plain-flip-fb-recreate-interruptible@b-edp1}:
    - shard-skl:          [FAIL][27] ([i915#1883]) -> [PASS][28] +1 similar issue
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-skl6/igt@kms_flip@plain-flip-fb-recreate-interruptible@b-edp1.html
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-skl3/igt@kms_flip@plain-flip-fb-recreate-interruptible@b-edp1.html

  * igt@kms_frontbuffer_tracking@fbcpsr-1p-offscren-pri-indfb-draw-blt:
    - shard-tglb:         [FAIL][29] ([i915#1897] / [i915#402]) -> [PASS][30] +8 similar issues
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb3/igt@kms_frontbuffer_tracking@fbcpsr-1p-offscren-pri-indfb-draw-blt.html
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb7/igt@kms_frontbuffer_tracking@fbcpsr-1p-offscren-pri-indfb-draw-blt.html

  * igt@kms_plane_alpha_blend@pipe-b-constant-alpha-min:
    - shard-skl:          [FAIL][31] ([fdo#108145] / [i915#265]) -> [PASS][32]
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-skl7/igt@kms_plane_alpha_blend@pipe-b-constant-alpha-min.html
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-skl2/igt@kms_plane_alpha_blend@pipe-b-constant-alpha-min.html

  * igt@kms_plane_lowres@pipe-c-tiling-none:
    - shard-tglb:         [FAIL][33] ([i915#1897] / [i915#899]) -> [PASS][34]
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb2/igt@kms_plane_lowres@pipe-c-tiling-none.html
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb7/igt@kms_plane_lowres@pipe-c-tiling-none.html

  * igt@kms_psr@psr2_primary_page_flip:
    - shard-iclb:         [SKIP][35] ([fdo#109441]) -> [PASS][36]
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-iclb6/igt@kms_psr@psr2_primary_page_flip.html
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-iclb2/igt@kms_psr@psr2_primary_page_flip.html

  * igt@kms_psr@sprite_mmap_cpu:
    - shard-tglb:         [SKIP][37] ([i915#668]) -> [PASS][38] +1 similar issue
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb7/igt@kms_psr@sprite_mmap_cpu.html
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb6/igt@kms_psr@sprite_mmap_cpu.html

  
#### Warnings ####

  * igt@i915_pm_dc@dc6-dpms:
    - shard-tglb:         [FAIL][39] ([i915#402] / [i915#454]) -> [FAIL][40] ([i915#454])
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb7/igt@i915_pm_dc@dc6-dpms.html
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb6/igt@i915_pm_dc@dc6-dpms.html

  * igt@kms_frontbuffer_tracking@fbcpsr-1p-primscrn-indfb-msflip-blt:
    - shard-tglb:         [FAIL][41] ([i915#1897] / [i915#402]) -> [SKIP][42] ([i915#668])
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb8/igt@kms_frontbuffer_tracking@fbcpsr-1p-primscrn-indfb-msflip-blt.html
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb2/igt@kms_frontbuffer_tracking@fbcpsr-1p-primscrn-indfb-msflip-blt.html

  * igt@kms_pipe_crc_basic@read-crc-pipe-b:
    - shard-tglb:         [FAIL][43] ([i915#1897]) -> [FAIL][44] ([i915#1897] / [i915#402]) +1 similar issue
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb7/igt@kms_pipe_crc_basic@read-crc-pipe-b.html
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb3/igt@kms_pipe_crc_basic@read-crc-pipe-b.html

  * igt@kms_plane@plane-panning-top-left-pipe-c-planes:
    - shard-tglb:         [FAIL][45] ([i915#1897] / [i915#402]) -> [FAIL][46] ([i915#1897])
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb6/igt@kms_plane@plane-panning-top-left-pipe-c-planes.html
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb6/igt@kms_plane@plane-panning-top-left-pipe-c-planes.html

  * igt@kms_psr2_su@page_flip:
    - shard-tglb:         [SKIP][47] ([i915#668]) -> [FAIL][48] ([i915#608])
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8495/shard-tglb7/igt@kms_psr2_su@page_flip.html
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/shard-tglb6/igt@kms_psr2_su@page_flip.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#108145]: https://bugs.freedesktop.org/show_bug.cgi?id=108145
  [fdo#109441]: https://bugs.freedesktop.org/show_bug.cgi?id=109441
  [i915#1149]: https://gitlab.freedesktop.org/drm/intel/issues/1149
  [i915#1172]: https://gitlab.freedesktop.org/drm/intel/issues/1172
  [i915#180]: https://gitlab.freedesktop.org/drm/intel/issues/180
  [i915#1883]: https://gitlab.freedesktop.org/drm/intel/issues/1883
  [i915#1897]: https://gitlab.freedesktop.org/drm/intel/issues/1897
  [i915#198]: https://gitlab.freedesktop.org/drm/intel/issues/198
  [i915#265]: https://gitlab.freedesktop.org/drm/intel/issues/265
  [i915#402]: https://gitlab.freedesktop.org/drm/intel/issues/402
  [i915#454]: https://gitlab.freedesktop.org/drm/intel/issues/454
  [i915#54]: https://gitlab.freedesktop.org/drm/intel/issues/54
  [i915#608]: https://gitlab.freedesktop.org/drm/intel/issues/608
  [i915#64]: https://gitlab.freedesktop.org/drm/intel/issues/64
  [i915#668]: https://gitlab.freedesktop.org/drm/intel/issues/668
  [i915#69]: https://gitlab.freedesktop.org/drm/intel/issues/69
  [i915#79]: https://gitlab.freedesktop.org/drm/intel/issues/79
  [i915#899]: https://gitlab.freedesktop.org/drm/intel/issues/899
  [i915#95]: https://gitlab.freedesktop.org/drm/intel/issues/95


Participating hosts (11 -> 11)
------------------------------

  No changes in participating hosts


Build changes
-------------

  * Linux: CI_DRM_8495 -> Patchwork_17695

  CI-20190529: 20190529
  CI_DRM_8495: 695b7d214a475060bb448c2609e4184fc450d69a @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5659: 66ab5e42811fee3dea8c21ab29e70e323a0650de @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_17695: 2633bb73f44c35def35771444a87a1cc99411d8a @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17695/index.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency Chris Wilson
                   ` (4 preceding siblings ...)
  2020-05-18 20:02 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
@ 2020-05-19 11:41 ` Chris Wilson
  2020-05-19 12:47   ` Mika Kuoppala
  2020-05-19 13:25 ` [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev9) Patchwork
  6 siblings, 1 reply; 18+ messages in thread
From: Chris Wilson @ 2020-05-19 11:41 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

v2: Refactor all the instruction building into emitters.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 779 ++++++++++++++++++
 1 file changed, 779 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..db09e9cb54b8 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,780 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+#define TF_COUNT 5
+	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	GEM_BUG_ON(sum > U32_MAX);
+	return sum;
+#define TF_BIAS 2
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
+}
+
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = value;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+static void semaphore_set(u32 *sema, u32 value)
+{
+	WRITE_ONCE(*sema, value);
+	wmb(); /* flush the update to the cache, and beyond */
+}
+
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	return (i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(dw));
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	semaphore_set(sema, -1);
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_store_dw(cs, offset, 0);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+		cs = emit_store_dw(cs, offset, 0);
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		semaphore_set(sema, i);
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = sema[i] - cycles;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = sema[i] - elapsed[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of a lite restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		semaphore_set(sema, i - 1);
+		preempt_enable();
+	}
+
+	wait_for(READ_ONCE(sema[i - 1]), 500);
+	semaphore_set(sema, i - 1);
+
+	for (i = 1; i <= TF_COUNT; i++) {
+		GEM_BUG_ON(sema[i] == -1);
+		elapsed[i - 1] = sema[i] - elapsed[i];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_semaphore_poll(cs, mode, value, offset);
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		semaphore_set(sema, 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			semaphore_set(sema, 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[i + 1] - sema[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[TF_COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			cs = emit_timestamp_store(cs, ce, addr);
+			addr += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		u32 addr = offset + 2 * i * sizeof(u32);
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, addr, -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, addr);
+		cs = emit_store_dw(cs, offset, i);
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) to be
+	 * sent from the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		semaphore_set(sema, i);
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		GEM_BUG_ON(sema[i + 1] == -1);
+		elapsed[i] = elapsed[i] - sema[i + 1];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2820,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-19 11:41 ` [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
@ 2020-05-19 12:47   ` Mika Kuoppala
  2020-05-19 12:56     ` Chris Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Mika Kuoppala @ 2020-05-19 12:47 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Chris Wilson

Chris Wilson <chris@chris-wilson.co.uk> writes:

> A useful metric of the system's health is how fast we can tell the GPU
> to do various actions, so measure our latency.
>
> v2: Refactor all the instruction building into emitters.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Not much nitpicking left. Could have used one goto in the fence
using tests on error paths but meh.

Lots of tests poking hw from different angles.
With a clear comments, it is like a guided tour of our
submission/scheduling front.

Analyzing of differences between different sets will
be interesting.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/selftests/i915_request.c | 779 ++++++++++++++++++
>  1 file changed, 779 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
> index 6014e8dfcbb1..db09e9cb54b8 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_request.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_request.c
> @@ -24,16 +24,20 @@
>  
>  #include <linux/prime_numbers.h>
>  #include <linux/pm_qos.h>
> +#include <linux/sort.h>
>  
>  #include "gem/i915_gem_pm.h"
>  #include "gem/selftests/mock_context.h"
>  
> +#include "gt/intel_engine_heartbeat.h"
>  #include "gt/intel_engine_pm.h"
>  #include "gt/intel_engine_user.h"
>  #include "gt/intel_gt.h"
> +#include "gt/intel_gt_requests.h"
>  
>  #include "i915_random.h"
>  #include "i915_selftest.h"
> +#include "igt_flush_test.h"
>  #include "igt_live_test.h"
>  #include "igt_spinner.h"
>  #include "lib_sw_fence.h"
> @@ -1524,6 +1528,780 @@ struct perf_series {
>  	struct intel_context *ce[];
>  };
>  
> +static int cmp_u32(const void *A, const void *B)
> +{
> +	const u32 *a = A, *b = B;
> +
> +	return *a - *b;
> +}
> +
> +static u32 trifilter(u32 *a)
> +{
> +	u64 sum;
> +
> +#define TF_COUNT 5
> +	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
> +
> +	sum = mul_u32_u32(a[2], 2);
> +	sum += a[1];
> +	sum += a[3];
> +
> +	GEM_BUG_ON(sum > U32_MAX);
> +	return sum;
> +#define TF_BIAS 2
> +}
> +
> +static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
> +{
> +	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
> +
> +	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
> +}
> +
> +static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
> +{
> +	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
> +	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
> +	*cs++ = offset;
> +	*cs++ = 0;
> +
> +	return cs;
> +}
> +
> +static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
> +{
> +	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> +	*cs++ = offset;
> +	*cs++ = 0;
> +	*cs++ = value;
> +
> +	return cs;
> +}
> +
> +static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
> +{
> +	*cs++ = MI_SEMAPHORE_WAIT |
> +		MI_SEMAPHORE_GLOBAL_GTT |
> +		MI_SEMAPHORE_POLL |
> +		mode;
> +	*cs++ = value;
> +	*cs++ = offset;
> +	*cs++ = 0;
> +
> +	return cs;
> +}
> +
> +static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
> +{
> +	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
> +}
> +
> +static void semaphore_set(u32 *sema, u32 value)
> +{
> +	WRITE_ONCE(*sema, value);
> +	wmb(); /* flush the update to the cache, and beyond */
> +}
> +
> +static u32 *hwsp_scratch(const struct intel_context *ce)
> +{
> +	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
> +}
> +
> +static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
> +{
> +	return (i915_ggtt_offset(ce->engine->status_page.vma) +
> +		offset_in_page(dw));
> +}
> +
> +static int measure_semaphore_response(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[TF_COUNT], cycles;
> +	struct i915_request *rq;
> +	u32 *cs;
> +	int i;
> +
> +	/*
> +	 * Measure how many cycles it takes for the HW to detect the change
> +	 * in a semaphore value.
> +	 *
> +	 *    A: read CS_TIMESTAMP from CPU
> +	 *    poke semaphore
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Semaphore latency: B - A
> +	 */
> +
> +	semaphore_set(sema, -1);
> +
> +	rq = i915_request_create(ce);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
> +	if (IS_ERR(cs)) {
> +		i915_request_add(rq);
> +		return PTR_ERR(cs);
> +	}
> +
> +	cs = emit_store_dw(cs, offset, 0);
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		cs = emit_semaphore_poll_until(cs, offset, i);
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +		cs = emit_store_dw(cs, offset, 0);
> +	}
> +
> +	intel_ring_advance(rq, cs);
> +	i915_request_add(rq);
> +
> +	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return -EIO;
> +	}
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		preempt_disable();
> +		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		semaphore_set(sema, i);
> +		preempt_enable();
> +
> +		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		elapsed[i - 1] = sema[i] - cycles;
> +	}
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: semaphore response %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> TF_BIAS,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_idle_dispatch(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[TF_COUNT], cycles;
> +	u32 *cs;
> +	int err;
> +	int i;
> +
> +	/*
> +	 * Measure how long it takes for us to submit a request while the
> +	 * engine is idle, but is resting in our context.
> +	 *
> +	 *    A: read CS_TIMESTAMP from CPU
> +	 *    submit request
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Submission latency: B - A
> +	 */
> +
> +	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
> +		struct i915_request *rq;
> +
> +		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> +		if (err)
> +			return err;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 4);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +
> +		preempt_disable();
> +		local_bh_disable();
> +		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		i915_request_add(rq);
> +		local_bh_enable();
> +		preempt_enable();
> +	}
> +
> +	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> +	if (err) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return err;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
> +		elapsed[i] = sema[i] - elapsed[i];
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> TF_BIAS,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_busy_dispatch(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[TF_COUNT + 1], cycles;
> +	u32 *cs;
> +	int i;
> +
> +	/*
> +	 * Measure how long it takes for us to submit a request while the
> +	 * engine is busy, polling on a semaphore in our context. With
> +	 * direct submission, this will include the cost of a lite restore.
> +	 *
> +	 *    A: read CS_TIMESTAMP from CPU
> +	 *    submit request
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Submission latency: B - A
> +	 */
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		struct i915_request *rq;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 12);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
> +		cs = emit_semaphore_poll_until(cs, offset, i);
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +
> +		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		preempt_disable();
> +		local_bh_disable();
> +		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		i915_request_add(rq);
> +		local_bh_enable();
> +		semaphore_set(sema, i - 1);
> +		preempt_enable();
> +	}
> +
> +	wait_for(READ_ONCE(sema[i - 1]), 500);
> +	semaphore_set(sema, i - 1);
> +
> +	for (i = 1; i <= TF_COUNT; i++) {
> +		GEM_BUG_ON(sema[i] == -1);
> +		elapsed[i - 1] = sema[i] - elapsed[i];
> +	}
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> TF_BIAS,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
> +{
> +	const u32 offset =
> +		i915_ggtt_offset(engine->status_page.vma) +
> +		offset_in_page(sema);
> +	struct i915_request *rq;
> +	u32 *cs;
> +
> +	rq = i915_request_create(engine->kernel_context);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	cs = intel_ring_begin(rq, 4);
> +	if (IS_ERR(cs)) {
> +		i915_request_add(rq);
> +		return PTR_ERR(cs);
> +	}
> +
> +	cs = emit_semaphore_poll(cs, mode, value, offset);
> +
> +	intel_ring_advance(rq, cs);
> +	i915_request_add(rq);
> +
> +	return 0;
> +}
> +
> +static int measure_inter_request(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[TF_COUNT + 1], cycles;
> +	struct i915_sw_fence *submit;
> +	int i, err;
> +
> +	/*
> +	 * Measure how long it takes to advance from one request into the
> +	 * next. Between each request we flush the GPU caches to memory,
> +	 * update the breadcrumbs, and then invalidate those caches.
> +	 * We queue up all the requests to be submitted in one batch so
> +	 * it should be one set of contiguous measurements.
> +	 *
> +	 *    A: read CS_TIMESTAMP on GPU
> +	 *    advance request
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Request latency: B - A
> +	 */
> +
> +	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
> +	if (err)
> +		return err;
> +
> +	submit = heap_fence_create(GFP_KERNEL);
> +	if (!submit) {
> +		semaphore_set(sema, 1);
> +		return -ENOMEM;
> +	}
> +
> +	intel_engine_flush_submission(ce->engine);
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		struct i915_request *rq;
> +		u32 *cs;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq)) {
> +			semaphore_set(sema, 1);
> +			return PTR_ERR(rq);
> +		}
> +
> +		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> +						       submit,
> +						       GFP_KERNEL);
> +		if (err < 0) {
> +			i915_sw_fence_commit(submit);
> +			heap_fence_put(submit);
> +			i915_request_add(rq);
> +			semaphore_set(sema, 1);
> +			return err;
> +		}
> +
> +		cs = intel_ring_begin(rq, 4);
> +		if (IS_ERR(cs)) {
> +			i915_sw_fence_commit(submit);
> +			heap_fence_put(submit);
> +			i915_request_add(rq);
> +			semaphore_set(sema, 1);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +		i915_request_add(rq);
> +	}
> +	local_bh_disable();
> +	i915_sw_fence_commit(submit);
> +	local_bh_enable();
> +	intel_engine_flush_submission(ce->engine);
> +	heap_fence_put(submit);
> +
> +	semaphore_set(sema, 1);
> +	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> +	if (err) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return err;
> +	}
> +
> +	for (i = 1; i <= TF_COUNT; i++)
> +		elapsed[i - 1] = sema[i + 1] - sema[i];
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: inter-request latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> TF_BIAS,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_context_switch(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	struct i915_request *fence = NULL;
> +	u32 elapsed[TF_COUNT + 1], cycles;
> +	int i, j, err;
> +	u32 *cs;
> +
> +	/*
> +	 * Measure how long it takes to advance from one request in one
> +	 * context to a request in another context. This allows us to
> +	 * measure how long the context save/restore take, along with all
> +	 * the inter-context setup we require.
> +	 *
> +	 *    A: read CS_TIMESTAMP on GPU
> +	 *    switch context
> +	 *    B: read CS_TIMESTAMP on GPU
> +	 *
> +	 * Context switch latency: B - A
> +	 */
> +
> +	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
> +	if (err)
> +		return err;
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		struct intel_context *arr[] = {
> +			ce, ce->engine->kernel_context
> +		};
> +		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
> +
> +		for (j = 0; j < ARRAY_SIZE(arr); j++) {
> +			struct i915_request *rq;
> +
> +			rq = i915_request_create(arr[j]);
> +			if (IS_ERR(rq))
> +				return PTR_ERR(rq);
> +
> +			if (fence) {
> +				err = i915_request_await_dma_fence(rq,
> +								   &fence->fence);
> +				if (err) {
> +					i915_request_add(rq);
> +					return err;
> +				}
> +			}
> +
> +			cs = intel_ring_begin(rq, 4);
> +			if (IS_ERR(cs)) {
> +				i915_request_add(rq);
> +				return PTR_ERR(cs);
> +			}
> +
> +			cs = emit_timestamp_store(cs, ce, addr);
> +			addr += sizeof(u32);
> +
> +			intel_ring_advance(rq, cs);
> +
> +			i915_request_put(fence);
> +			fence = i915_request_get(rq);
> +
> +			i915_request_add(rq);
> +		}
> +	}
> +	i915_request_put(fence);
> +	intel_engine_flush_submission(ce->engine);
> +
> +	semaphore_set(sema, 1);
> +	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> +	if (err) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return err;
> +	}
> +
> +	for (i = 1; i <= TF_COUNT; i++)
> +		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: context switch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> TF_BIAS,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_preemption(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[TF_COUNT], cycles;
> +	u32 *cs;
> +	int i;
> +
> +	/*
> +	 * We measure two latencies while triggering preemption. The first
> +	 * latency is how long it takes for us to submit a preempting request.
> +	 * The second latency is how it takes for us to return from the
> +	 * preemption back to the original context.
> +	 *
> +	 *    A: read CS_TIMESTAMP from CPU
> +	 *    submit preemption
> +	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
> +	 *    context switch
> +	 *    C: read CS_TIMESTAMP on GPU (in original context)
> +	 *
> +	 * Preemption dispatch latency: B - A
> +	 * Preemption switch latency: C - B
> +	 */
> +
> +	if (!intel_engine_has_preemption(ce->engine))
> +		return 0;
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		u32 addr = offset + 2 * i * sizeof(u32);
> +		struct i915_request *rq;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 12);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_store_dw(cs, addr, -1);
> +		cs = emit_semaphore_poll_until(cs, offset, i);
> +		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +		i915_request_add(rq);
> +
> +		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		rq = i915_request_create(ce->engine->kernel_context);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 8);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_timestamp_store(cs, ce, addr);
> +		cs = emit_store_dw(cs, offset, i);
> +
> +		intel_ring_advance(rq, cs);
> +		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
> +
> +		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		i915_request_add(rq);
> +	}
> +
> +	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return -EIO;
> +	}
> +
> +	for (i = 1; i <= TF_COUNT; i++)
> +		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> TF_BIAS,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	for (i = 1; i <= TF_COUNT; i++)
> +		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> TF_BIAS,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +struct signal_cb {
> +	struct dma_fence_cb base;
> +	bool seen;
> +};
> +
> +static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> +{
> +	struct signal_cb *s = container_of(cb, typeof(*s), base);
> +
> +	smp_store_mb(s->seen, true); /* be safe, be strong */
> +}
> +
> +static int measure_completion(struct intel_context *ce)
> +{
> +	u32 *sema = hwsp_scratch(ce);
> +	const u32 offset = hwsp_offset(ce, sema);
> +	u32 elapsed[TF_COUNT], cycles;
> +	u32 *cs;
> +	int err;
> +	int i;
> +
> +	/*
> +	 * Measure how long it takes for the signal (interrupt) to be
> +	 * sent from the GPU to be processed by the CPU.
> +	 *
> +	 *    A: read CS_TIMESTAMP on GPU
> +	 *    signal
> +	 *    B: read CS_TIMESTAMP from CPU
> +	 *
> +	 * Completion latency: B - A
> +	 */
> +
> +	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> +		struct signal_cb cb = { .seen = false };
> +		struct i915_request *rq;
> +
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		cs = intel_ring_begin(rq, 12);
> +		if (IS_ERR(cs)) {
> +			i915_request_add(rq);
> +			return PTR_ERR(cs);
> +		}
> +
> +		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
> +		cs = emit_semaphore_poll_until(cs, offset, i);
> +		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> +		intel_ring_advance(rq, cs);
> +
> +		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
> +
> +		local_bh_disable();
> +		i915_request_add(rq);
> +		local_bh_enable();
> +
> +		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
> +			intel_gt_set_wedged(ce->engine->gt);
> +			return -EIO;
> +		}
> +
> +		preempt_disable();
> +		semaphore_set(sema, i);
> +		while (!READ_ONCE(cb.seen))
> +			cpu_relax();
> +
> +		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> +		preempt_enable();
> +	}
> +
> +	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> +	if (err) {
> +		intel_gt_set_wedged(ce->engine->gt);
> +		return err;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
> +		GEM_BUG_ON(sema[i + 1] == -1);
> +		elapsed[i] = elapsed[i] - sema[i + 1];
> +	}
> +
> +	cycles = trifilter(elapsed);
> +	pr_info("%s: completion latency %d cycles, %lluns\n",
> +		ce->engine->name, cycles >> TF_BIAS,
> +		cycles_to_ns(ce->engine, cycles));
> +
> +	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static void rps_pin(struct intel_gt *gt)
> +{
> +	/* Pin the frequency to max */
> +	atomic_inc(&gt->rps.num_waiters);
> +	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
> +
> +	mutex_lock(&gt->rps.lock);
> +	intel_rps_set(&gt->rps, gt->rps.max_freq);
> +	mutex_unlock(&gt->rps.lock);
> +}
> +
> +static void rps_unpin(struct intel_gt *gt)
> +{
> +	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
> +	atomic_dec(&gt->rps.num_waiters);
> +}
> +
> +static void engine_heartbeat_disable(struct intel_engine_cs *engine)
> +{
> +	engine->props.heartbeat_interval_ms = 0;
> +
> +	intel_engine_pm_get(engine);
> +	intel_engine_park_heartbeat(engine);
> +}
> +
> +static void engine_heartbeat_enable(struct intel_engine_cs *engine)
> +{
> +	intel_engine_pm_put(engine);
> +
> +	engine->props.heartbeat_interval_ms =
> +		engine->defaults.heartbeat_interval_ms;
> +}
> +
> +static int perf_request_latency(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_engine_cs *engine;
> +	struct pm_qos_request qos;
> +	int err = 0;
> +
> +	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
> +		return 0;
> +
> +	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
> +
> +	for_each_uabi_engine(engine, i915) {
> +		struct intel_context *ce;
> +
> +		ce = intel_context_create(engine);
> +		if (IS_ERR(ce))
> +			goto out;
> +
> +		err = intel_context_pin(ce);
> +		if (err) {
> +			intel_context_put(ce);
> +			goto out;
> +		}
> +
> +		engine_heartbeat_disable(engine);
> +		rps_pin(engine->gt);
> +
> +		if (err == 0)
> +			err = measure_semaphore_response(ce);
> +		if (err == 0)
> +			err = measure_idle_dispatch(ce);
> +		if (err == 0)
> +			err = measure_busy_dispatch(ce);
> +		if (err == 0)
> +			err = measure_inter_request(ce);
> +		if (err == 0)
> +			err = measure_context_switch(ce);
> +		if (err == 0)
> +			err = measure_preemption(ce);
> +		if (err == 0)
> +			err = measure_completion(ce);
> +
> +		rps_unpin(engine->gt);
> +		engine_heartbeat_enable(engine);
> +
> +		intel_context_unpin(ce);
> +		intel_context_put(ce);
> +		if (err)
> +			goto out;
> +	}
> +
> +out:
> +	if (igt_flush_test(i915))
> +		err = -EIO;
> +
> +	cpu_latency_qos_remove_request(&qos);
> +	return err;
> +}
> +
>  static int s_sync0(void *arg)
>  {
>  	struct perf_series *ps = arg;
> @@ -2042,6 +2820,7 @@ static int perf_parallel_engines(void *arg)
>  int i915_request_perf_selftests(struct drm_i915_private *i915)
>  {
>  	static const struct i915_subtest tests[] = {
> +		SUBTEST(perf_request_latency),
>  		SUBTEST(perf_series_engines),
>  		SUBTEST(perf_parallel_engines),
>  	};
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-19 12:47   ` Mika Kuoppala
@ 2020-05-19 12:56     ` Chris Wilson
  0 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-19 12:56 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2020-05-19 13:47:31)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > A useful metric of the system's health is how fast we can tell the GPU
> > to do various actions, so measure our latency.
> >
> > v2: Refactor all the instruction building into emitters.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> 
> Not much nitpicking left. Could have used one goto in the fence
> using tests on error paths but meh.

Error handling is not great here, I agree.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev9)
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency Chris Wilson
                   ` (5 preceding siblings ...)
  2020-05-19 11:41 ` [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
@ 2020-05-19 13:25 ` Patchwork
  6 siblings, 0 replies; 18+ messages in thread
From: Patchwork @ 2020-05-19 13:25 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/selftests: Measure dispatch latency (rev9)
URL   : https://patchwork.freedesktop.org/series/77308/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_8502 -> Patchwork_17709
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17709/index.html

Known issues
------------

  Here are the changes found in Patchwork_17709 that come from known issues:

### IGT changes ###

#### Possible fixes ####

  * igt@i915_selftest@live@execlists:
    - fi-cfl-guc:         [INCOMPLETE][1] ([i915#656]) -> [PASS][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8502/fi-cfl-guc/igt@i915_selftest@live@execlists.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17709/fi-cfl-guc/igt@i915_selftest@live@execlists.html

  
  [i915#656]: https://gitlab.freedesktop.org/drm/intel/issues/656


Participating hosts (50 -> 43)
------------------------------

  Missing    (7): fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-ctg-p8600 fi-kbl-7560u fi-byt-clapper 


Build changes
-------------

  * Linux: CI_DRM_8502 -> Patchwork_17709

  CI-20190529: 20190529
  CI_DRM_8502: 5bafb3de802a8dd663009250b587c1a78ad298c9 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5659: 66ab5e42811fee3dea8c21ab29e70e323a0650de @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_17709: a072e6b6f5c787b3051f16f09afc6cd411b99af9 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

a072e6b6f5c7 drm/i915/selftests: Measure dispatch latency

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17709/index.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-19 13:08 Chris Wilson
  0 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-19 13:08 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

v2: Refactor all the instruction building into emitters.
v3: Mark the error handling if not perfect, at least consistent.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 823 ++++++++++++++++++
 1 file changed, 823 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..92c628f18c60 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,824 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+#define TF_COUNT 5
+	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	GEM_BUG_ON(sum > U32_MAX);
+	return sum;
+#define TF_BIAS 2
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
+}
+
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = value;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+static void semaphore_set(u32 *sema, u32 value)
+{
+	WRITE_ONCE(*sema, value);
+	wmb(); /* flush the update to the cache, and beyond */
+}
+
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	return (i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(dw));
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	semaphore_set(sema, -1);
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		err = PTR_ERR(cs);
+		goto err;
+	}
+
+	cs = emit_store_dw(cs, offset, 0);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+		cs = emit_store_dw(cs, offset, 0);
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+		err = -EIO;
+		goto err;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		semaphore_set(sema, i);
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+			err = -EIO;
+			goto err;
+		}
+
+		elapsed[i - 1] = sema[i] - cycles;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err)
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = sema[i] - elapsed[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of a lite restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			err = -EIO;
+			goto err;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		semaphore_set(sema, i - 1);
+		preempt_enable();
+	}
+
+	wait_for(READ_ONCE(sema[i - 1]), 500);
+	semaphore_set(sema, i - 1);
+
+	for (i = 1; i <= TF_COUNT; i++) {
+		GEM_BUG_ON(sema[i] == -1);
+		elapsed[i - 1] = sema[i] - elapsed[i];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_semaphore_poll(cs, mode, value, offset);
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		semaphore_set(sema, 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err_submit;
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_request_add(rq);
+			goto err_submit;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err_submit;
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err)
+		goto err;
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[i + 1] - sema[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_submit:
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+	semaphore_set(sema, 1);
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[TF_COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				err = PTR_ERR(rq);
+				goto err_fence;
+			}
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					goto err_fence;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				err = PTR_ERR(cs);
+				goto err_fence;
+			}
+
+			cs = emit_timestamp_store(cs, ce, addr);
+			addr += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err)
+		goto err;
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+	i915_request_put(fence);
+	semaphore_set(sema, 1);
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		u32 addr = offset + 2 * i * sizeof(u32);
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_store_dw(cs, addr, -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+			err = -EIO;
+			goto err;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_timestamp_store(cs, ce, addr);
+		cs = emit_store_dw(cs, offset, i);
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+		err = -EIO;
+		goto err;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) to be
+	 * sent from the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
+			err = -EIO;
+			goto err;
+		}
+
+		preempt_disable();
+		semaphore_set(sema, i);
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err)
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		GEM_BUG_ON(sema[i + 1] == -1);
+		elapsed[i] = elapsed[i] - sema[i + 1];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2864,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-18  8:57 Chris Wilson
  0 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-18  8:57 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 802 ++++++++++++++++++
 1 file changed, 802 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..f9631526730d 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,803 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb(); /* flush the update to the cache, and beyond */
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of an idle restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		WRITE_ONCE(sema[0], i - 1);
+		wmb(); /* flush the update to the cache, and beyond */
+		preempt_enable();
+	}
+	WRITE_ONCE(sema[0], i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 0);
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 1);
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 0);
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = offset;
+		*cs++ = 0;
+		*cs++ = i;
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 0] - elapsed[i - 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 1] - sema[2 * i + 0]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) to be
+	 * sent from the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		WRITE_ONCE(sema[0], i);
+		wmb();
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2843,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-15 15:10 Chris Wilson
  2020-05-15 17:32 ` Chris Wilson
  2020-05-15 17:58 ` Chris Wilson
@ 2020-05-15 18:02 ` Chris Wilson
  2 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-15 18:02 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 612 ++++++++++++++++++
 1 file changed, 612 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..bfb601214c4a 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,613 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb(); /* flush the update to the cache, and beyond */
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		WRITE_ONCE(sema[0], i - 1);
+		wmb(); /* flush the update to the cache, and beyond */
+		preempt_enable();
+	}
+	WRITE_ONCE(sema[0], i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 *i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		WRITE_ONCE(sema[0], i);
+		wmb();
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	unsigned long old;
+
+	old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+
+	return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+				    unsigned long saved)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms = saved;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2653,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-15 15:10 Chris Wilson
  2020-05-15 17:32 ` Chris Wilson
@ 2020-05-15 17:58 ` Chris Wilson
  2020-05-15 18:02 ` Chris Wilson
  2 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-15 17:58 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 612 ++++++++++++++++++
 1 file changed, 612 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..c265501c26a8 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,613 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb(); /* flush the update to the cache, and beyond */
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		WRITE_ONCE(sema[0], i - 1);
+		wmb(); /* flush the update to the cache, and beyond */
+		preempt_enable();
+	}
+	WRITE_ONCE(sema[0], i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 *i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		WRITE_ONCE(sema[0], i);
+		wmb();
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = (sema[i + 1] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	unsigned long old;
+
+	old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+
+	return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+				    unsigned long saved)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms = saved;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2653,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-15 15:10 Chris Wilson
@ 2020-05-15 17:32 ` Chris Wilson
  2020-05-15 17:58 ` Chris Wilson
  2020-05-15 18:02 ` Chris Wilson
  2 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-15 17:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 521 ++++++++++++++++++
 1 file changed, 521 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..a47ac68955ec 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,522 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb(); /* flush the update to the cache, and beyond */
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: semaphore response %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: idle dispatch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		WRITE_ONCE(sema[0], i - 1);
+		wmb(); /* flush the update to the cache, and beyond */
+	}
+	WRITE_ONCE(sema[0], i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: busy dispatch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: inter-request latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 *i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: context switch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	unsigned long old;
+
+	old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+
+	return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+				    unsigned long saved)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms = saved;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2562,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-15 15:10 Chris Wilson
  2020-05-15 17:32 ` Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Chris Wilson @ 2020-05-15 15:10 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 505 ++++++++++++++++++
 1 file changed, 505 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..94c79066f82a 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,13 +24,16 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
@@ -1524,6 +1527,507 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return sum >> 2;
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: semaphore response %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: idle dispatch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	i915_sw_fence_commit(submit);
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: inter-request latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static int measure_lite_restore(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 0);
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 1);
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+		intel_engine_flush_submission(ce->engine);
+
+		WRITE_ONCE(sema[0], i - 1);
+		if (i > 1 && wait_for(READ_ONCE(sema[2 * (i - 1)]), 100)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+	}
+
+	WRITE_ONCE(sema[0], i - 1);
+	if (wait_for(READ_ONCE(sema[2 *i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: lite restore latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	if (wait_for(READ_ONCE(sema[2 *i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: context switch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	unsigned long old;
+
+	old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+
+	return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+				    unsigned long saved)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms = saved;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_lite_restore(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+
+		rps_unpin(engine->gt);
+		intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2546,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2020-05-19 13:25 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-18 14:39 [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency Chris Wilson
2020-05-18 15:07 ` Mika Kuoppala
2020-05-18 15:14   ` Chris Wilson
2020-05-18 15:17     ` Chris Wilson
2020-05-18 16:19 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for drm/i915/selftests: Measure dispatch latency (rev7) Patchwork
2020-05-18 16:22 ` [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
2020-05-18 18:15 ` [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev8) Patchwork
2020-05-18 20:02 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
2020-05-19 11:41 ` [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
2020-05-19 12:47   ` Mika Kuoppala
2020-05-19 12:56     ` Chris Wilson
2020-05-19 13:25 ` [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev9) Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2020-05-19 13:08 [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
2020-05-18  8:57 Chris Wilson
2020-05-15 15:10 Chris Wilson
2020-05-15 17:32 ` Chris Wilson
2020-05-15 17:58 ` Chris Wilson
2020-05-15 18:02 ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.