* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-18  8:57 Chris Wilson
  2020-05-18 12:46 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915/selftests: Measure dispatch latency (rev5) Patchwork
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-18  8:57 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 802 ++++++++++++++++++
 1 file changed, 802 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..f9631526730d 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,803 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
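
An aside on the fixed-point arithmetic above: each sample in elapsed[] is
pre-scaled by COUNT (5) bits before filtering so that the final report can
carry sub-cycle precision, and trifilter() keeps only the middle three of
the five sorted samples, double-weighting the median, so the two extremes
never influence the result. A minimal userspace sketch of the same filter
(a standalone illustration, not part of the patch; it uses an overflow-safe
comparator where the kernel code relies on the deltas staying small):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define COUNT 5

	static int cmp_u32(const void *A, const void *B)
	{
		const uint32_t *a = A, *b = B;

		return (*a > *b) - (*a < *b); /* no wrap on large deltas */
	}

	int main(void)
	{
		/* five cycle deltas, pre-scaled by << COUNT as in the test */
		uint32_t s[COUNT] = {
			100u << COUNT, 105u << COUNT, 110u << COUNT,
			115u << COUNT, 400u << COUNT, /* one outlier */
		};
		uint64_t sum;

		qsort(s, COUNT, sizeof(*s), cmp_u32);

		/* weighted mean of the middle three: (s1 + 2*s2 + s3 + 2) / 4 */
		sum = (uint64_t)2 * s[2] + s[1] + s[3];
		sum = (sum + 2) >> 2;

		/* prints 110: the 400-cycle outlier is discarded */
		printf("filtered: %u cycles\n", (unsigned int)(sum >> COUNT));
		return 0;
	}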
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb(); /* flush the update to the cache, and beyond */
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of a lite restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		WRITE_ONCE(sema[0], i - 1);
+		wmb(); /* flush the update to the cache, and beyond */
+		preempt_enable();
+	}
+	WRITE_ONCE(sema[0], i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
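
A note on plug(): it queues a kernel-context request that spins in
MI_SEMAPHORE_WAIT until the poll condition is satisfied, so every request
submitted afterwards backs up behind it. The callers below pass
MI_SEMAPHORE_SAD_NEQ_SDD with value 0, i.e. the engine stays blocked for as
long as sema[0] reads 0. The usage pattern, paraphrasing
measure_inter_request() below, is roughly:

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	/* ... build and queue the requests to be measured ... */
	WRITE_ONCE(sema[0], 1);	/* release the plug */
	wmb();			/* flush the update to the cache, and beyond */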
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how long it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 0);
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 1);
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 0);
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = offset;
+		*cs++ = 0;
+		*cs++ = i;
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 0] - elapsed[i - 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 1] - sema[2 * i + 0]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) sent by
+	 * the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		WRITE_ONCE(sema[0], i);
+		wmb();
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2843,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

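For reference: these perf selftests are only built with
CONFIG_DRM_I915_SELFTEST. Assuming the perf category follows the same
module-parameter convention as the mock and live selftests (the parameter
name here is inferred, not taken from the patch), they would be kicked off
at module load with something like:

	# hypothetical, by analogy with i915.mock_selftests/live_selftests
	modprobe i915 perf_selftests=1

with the measured latencies then reported in dmesg through the pr_info()
calls above.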

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915/selftests: Measure dispatch latency (rev5)
  2020-05-18  8:57 [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
@ 2020-05-18 12:46 ` Patchwork
  2020-05-18 13:20 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
  2020-05-18 15:13 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
  2 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2020-05-18 12:46 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/selftests: Measure dispatch latency (rev5)
URL   : https://patchwork.freedesktop.org/series/77308/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
960f5e78842f drm/i915/selftests: Measure dispatch latency
-:727: WARNING:MEMORY_BARRIER: memory barrier without comment
#727: FILE: drivers/gpu/drm/i915/selftests/i915_request.c:2216:
+		wmb();

total: 0 errors, 1 warnings, 0 checks, 830 lines checked


* [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/selftests: Measure dispatch latency (rev5)
  2020-05-18  8:57 [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
  2020-05-18 12:46 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915/selftests: Measure dispatch latency (rev5) Patchwork
@ 2020-05-18 13:20 ` Patchwork
  2020-05-18 15:13 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
  2 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2020-05-18 13:20 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/selftests: Measure dispatch latency (rev5)
URL   : https://patchwork.freedesktop.org/series/77308/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_8494 -> Patchwork_17686
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/index.html

Known issues
------------

  Here are the changes found in Patchwork_17686 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live@execlists:
    - fi-cfl-8700k:       [PASS][1] -> [INCOMPLETE][2] ([i915#656])
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/fi-cfl-8700k/igt@i915_selftest@live@execlists.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/fi-cfl-8700k/igt@i915_selftest@live@execlists.html

  
#### Possible fixes ####

  * igt@i915_selftest@live@execlists:
    - fi-whl-u:           [INCOMPLETE][3] ([i915#656]) -> [PASS][4]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/fi-whl-u/igt@i915_selftest@live@execlists.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/fi-whl-u/igt@i915_selftest@live@execlists.html

  
  [i915#656]: https://gitlab.freedesktop.org/drm/intel/issues/656


Participating hosts (51 -> 44)
------------------------------

  Missing    (7): fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-ctg-p8600 fi-byt-clapper fi-bdw-samus 


Build changes
-------------

  * Linux: CI_DRM_8494 -> Patchwork_17686

  CI-20190529: 20190529
  CI_DRM_8494: 3d15348fde9b998e754da0b0655baf02b98e7f17 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5657: 649eae5c905a7460b44305800f95db83a6dd47cb @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_17686: 960f5e78842fef918f6f8172c2a7eefeb38adc7f @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

960f5e78842f drm/i915/selftests: Measure dispatch latency

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/index.html

* [Intel-gfx] ✓ Fi.CI.IGT: success for drm/i915/selftests: Measure dispatch latency (rev5)
  2020-05-18  8:57 [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
  2020-05-18 12:46 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915/selftests: Measure dispatch latency (rev5) Patchwork
  2020-05-18 13:20 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
@ 2020-05-18 15:13 ` Patchwork
  2 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2020-05-18 15:13 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/selftests: Measure dispatch latency (rev5)
URL   : https://patchwork.freedesktop.org/series/77308/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_8494_full -> Patchwork_17686_full
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  

Known issues
------------

  Here are the changes found in Patchwork_17686_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gen9_exec_parse@allowed-all:
    - shard-apl:          [PASS][1] -> [DMESG-WARN][2] ([i915#1436] / [i915#716])
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-apl7/igt@gen9_exec_parse@allowed-all.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-apl6/igt@gen9_exec_parse@allowed-all.html

  * igt@kms_cursor_crc@pipe-a-cursor-suspend:
    - shard-kbl:          [PASS][3] -> [DMESG-WARN][4] ([i915#180])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-kbl3/igt@kms_cursor_crc@pipe-a-cursor-suspend.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-kbl2/igt@kms_cursor_crc@pipe-a-cursor-suspend.html

  * igt@kms_hdr@bpc-switch-dpms:
    - shard-skl:          [PASS][5] -> [FAIL][6] ([i915#1188]) +1 similar issue
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-skl8/igt@kms_hdr@bpc-switch-dpms.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-skl3/igt@kms_hdr@bpc-switch-dpms.html

  * igt@kms_plane_alpha_blend@pipe-b-constant-alpha-min:
    - shard-skl:          [PASS][7] -> [FAIL][8] ([fdo#108145] / [i915#265])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-skl4/igt@kms_plane_alpha_blend@pipe-b-constant-alpha-min.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-skl6/igt@kms_plane_alpha_blend@pipe-b-constant-alpha-min.html

  * igt@kms_psr@no_drrs:
    - shard-iclb:         [PASS][9] -> [FAIL][10] ([i915#173])
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-iclb8/igt@kms_psr@no_drrs.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-iclb1/igt@kms_psr@no_drrs.html

  * igt@kms_psr@psr2_sprite_blt:
    - shard-iclb:         [PASS][11] -> [SKIP][12] ([fdo#109441])
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-iclb2/igt@kms_psr@psr2_sprite_blt.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-iclb5/igt@kms_psr@psr2_sprite_blt.html

  * igt@kms_vblank@pipe-c-ts-continuation-suspend:
    - shard-skl:          [PASS][13] -> [INCOMPLETE][14] ([i915#69])
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-skl5/igt@kms_vblank@pipe-c-ts-continuation-suspend.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-skl3/igt@kms_vblank@pipe-c-ts-continuation-suspend.html

  
#### Possible fixes ####

  * {igt@gem_exec_schedule@pi-shared-iova@rcs0}:
    - shard-tglb:         [INCOMPLETE][15] ([i915#1193]) -> [PASS][16]
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb2/igt@gem_exec_schedule@pi-shared-iova@rcs0.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb2/igt@gem_exec_schedule@pi-shared-iova@rcs0.html

  * {igt@kms_atomic_transition@plane-all-transition-nonblocking@pipe-b}:
    - shard-kbl:          [DMESG-WARN][17] ([i915#78]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-kbl2/igt@kms_atomic_transition@plane-all-transition-nonblocking@pipe-b.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-kbl6/igt@kms_atomic_transition@plane-all-transition-nonblocking@pipe-b.html

  * igt@kms_cursor_crc@pipe-b-cursor-128x42-offscreen:
    - shard-tglb:         [FAIL][19] ([i915#1897]) -> [PASS][20] +4 similar issues
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb1/igt@kms_cursor_crc@pipe-b-cursor-128x42-offscreen.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb5/igt@kms_cursor_crc@pipe-b-cursor-128x42-offscreen.html

  * igt@kms_cursor_crc@pipe-c-cursor-64x64-onscreen:
    - shard-skl:          [FAIL][21] ([i915#54]) -> [PASS][22]
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-skl4/igt@kms_cursor_crc@pipe-c-cursor-64x64-onscreen.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-skl6/igt@kms_cursor_crc@pipe-c-cursor-64x64-onscreen.html

  * {igt@kms_flip@flip-vs-expired-vblank-interruptible@b-edp1}:
    - shard-skl:          [FAIL][23] ([i915#79]) -> [PASS][24]
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-skl2/igt@kms_flip@flip-vs-expired-vblank-interruptible@b-edp1.html
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-skl7/igt@kms_flip@flip-vs-expired-vblank-interruptible@b-edp1.html

  * {igt@kms_flip@flip-vs-suspend-interruptible@a-dp1}:
    - shard-apl:          [DMESG-WARN][25] ([i915#180]) -> [PASS][26] +1 similar issue
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-apl4/igt@kms_flip@flip-vs-suspend-interruptible@a-dp1.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-apl8/igt@kms_flip@flip-vs-suspend-interruptible@a-dp1.html

  * igt@kms_frontbuffer_tracking@fbcpsr-rgb565-draw-mmap-cpu:
    - shard-tglb:         [FAIL][27] ([i915#1897] / [i915#402]) -> [PASS][28] +4 similar issues
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb1/igt@kms_frontbuffer_tracking@fbcpsr-rgb565-draw-mmap-cpu.html
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb5/igt@kms_frontbuffer_tracking@fbcpsr-rgb565-draw-mmap-cpu.html

  * igt@kms_frontbuffer_tracking@psr-1p-pri-indfb-multidraw:
    - shard-skl:          [FAIL][29] ([i915#49]) -> [PASS][30]
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-skl5/igt@kms_frontbuffer_tracking@psr-1p-pri-indfb-multidraw.html
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-skl9/igt@kms_frontbuffer_tracking@psr-1p-pri-indfb-multidraw.html

  * igt@kms_hdr@bpc-switch-suspend:
    - shard-kbl:          [DMESG-WARN][31] ([i915#180]) -> [PASS][32] +7 similar issues
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-kbl2/igt@kms_hdr@bpc-switch-suspend.html
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-kbl6/igt@kms_hdr@bpc-switch-suspend.html

  * igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b:
    - shard-kbl:          [INCOMPLETE][33] ([i915#155]) -> [PASS][34]
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-kbl3/igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b.html
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-kbl6/igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b.html

  * igt@kms_plane_alpha_blend@pipe-a-coverage-7efc:
    - shard-skl:          [FAIL][35] ([fdo#108145] / [i915#265]) -> [PASS][36]
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-skl5/igt@kms_plane_alpha_blend@pipe-a-coverage-7efc.html
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-skl9/igt@kms_plane_alpha_blend@pipe-a-coverage-7efc.html

  * igt@kms_plane_cursor@pipe-a-primary-size-256:
    - shard-glk:          [FAIL][37] ([i915#1559]) -> [PASS][38]
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-glk5/igt@kms_plane_cursor@pipe-a-primary-size-256.html
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-glk7/igt@kms_plane_cursor@pipe-a-primary-size-256.html

  * igt@kms_psr@psr2_cursor_render:
    - shard-iclb:         [SKIP][39] ([fdo#109441]) -> [PASS][40] +2 similar issues
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-iclb3/igt@kms_psr@psr2_cursor_render.html
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-iclb2/igt@kms_psr@psr2_cursor_render.html

  
#### Warnings ####

  * igt@i915_pm_dc@dc6-psr:
    - shard-snb:          [SKIP][41] ([fdo#109271]) -> [INCOMPLETE][42] ([i915#82])
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-snb1/igt@i915_pm_dc@dc6-psr.html
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-snb5/igt@i915_pm_dc@dc6-psr.html

  * igt@kms_big_fb@x-tiled-8bpp-rotate-0:
    - shard-tglb:         [FAIL][43] ([i915#1172] / [i915#1897] / [i915#402]) -> [FAIL][44] ([i915#1172] / [i915#1897]) +1 similar issue
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb8/igt@kms_big_fb@x-tiled-8bpp-rotate-0.html
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb2/igt@kms_big_fb@x-tiled-8bpp-rotate-0.html

  * igt@kms_big_fb@y-tiled-16bpp-rotate-90:
    - shard-tglb:         [FAIL][45] ([i915#1172] / [i915#1897]) -> [FAIL][46] ([i915#1172] / [i915#1897] / [i915#402])
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb3/igt@kms_big_fb@y-tiled-16bpp-rotate-90.html
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb1/igt@kms_big_fb@y-tiled-16bpp-rotate-90.html

  * igt@kms_color@pipe-b-ctm-0-5:
    - shard-tglb:         [FAIL][47] ([i915#1149] / [i915#402]) -> [FAIL][48] ([i915#1149])
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb3/igt@kms_color@pipe-b-ctm-0-5.html
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb6/igt@kms_color@pipe-b-ctm-0-5.html

  * igt@kms_color@pipe-c-gamma:
    - shard-tglb:         [FAIL][49] ([i915#1149] / [i915#1897]) -> [FAIL][50] ([i915#1149] / [i915#1897] / [i915#402]) +1 similar issue
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb7/igt@kms_color@pipe-c-gamma.html
   [50]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb8/igt@kms_color@pipe-c-gamma.html

  * igt@kms_content_protection@atomic-dpms:
    - shard-apl:          [FAIL][51] ([fdo#110321] / [fdo#110336]) -> [TIMEOUT][52] ([i915#1319])
   [51]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-apl2/igt@kms_content_protection@atomic-dpms.html
   [52]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-apl7/igt@kms_content_protection@atomic-dpms.html

  * igt@kms_content_protection@lic:
    - shard-apl:          [DMESG-FAIL][53] ([fdo#110321]) -> [TIMEOUT][54] ([i915#1319])
   [53]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-apl8/igt@kms_content_protection@lic.html
   [54]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-apl3/igt@kms_content_protection@lic.html

  * igt@kms_frontbuffer_tracking@fbcpsr-rgb101010-draw-mmap-gtt:
    - shard-tglb:         [FAIL][55] ([i915#1897]) -> [FAIL][56] ([i915#1897] / [i915#402]) +3 similar issues
   [55]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb2/igt@kms_frontbuffer_tracking@fbcpsr-rgb101010-draw-mmap-gtt.html
   [56]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb6/igt@kms_frontbuffer_tracking@fbcpsr-rgb101010-draw-mmap-gtt.html

  * igt@kms_plane@plane-panning-top-left-pipe-d-planes:
    - shard-tglb:         [FAIL][57] ([i915#1897] / [i915#402]) -> [FAIL][58] ([i915#1897]) +3 similar issues
   [57]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8494/shard-tglb5/igt@kms_plane@plane-panning-top-left-pipe-d-planes.html
   [58]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/shard-tglb3/igt@kms_plane@plane-panning-top-left-pipe-d-planes.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#108145]: https://bugs.freedesktop.org/show_bug.cgi?id=108145
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109441]: https://bugs.freedesktop.org/show_bug.cgi?id=109441
  [fdo#110321]: https://bugs.freedesktop.org/show_bug.cgi?id=110321
  [fdo#110336]: https://bugs.freedesktop.org/show_bug.cgi?id=110336
  [i915#1149]: https://gitlab.freedesktop.org/drm/intel/issues/1149
  [i915#1172]: https://gitlab.freedesktop.org/drm/intel/issues/1172
  [i915#1188]: https://gitlab.freedesktop.org/drm/intel/issues/1188
  [i915#1193]: https://gitlab.freedesktop.org/drm/intel/issues/1193
  [i915#1319]: https://gitlab.freedesktop.org/drm/intel/issues/1319
  [i915#1436]: https://gitlab.freedesktop.org/drm/intel/issues/1436
  [i915#155]: https://gitlab.freedesktop.org/drm/intel/issues/155
  [i915#1559]: https://gitlab.freedesktop.org/drm/intel/issues/1559
  [i915#173]: https://gitlab.freedesktop.org/drm/intel/issues/173
  [i915#180]: https://gitlab.freedesktop.org/drm/intel/issues/180
  [i915#1897]: https://gitlab.freedesktop.org/drm/intel/issues/1897
  [i915#265]: https://gitlab.freedesktop.org/drm/intel/issues/265
  [i915#402]: https://gitlab.freedesktop.org/drm/intel/issues/402
  [i915#49]: https://gitlab.freedesktop.org/drm/intel/issues/49
  [i915#54]: https://gitlab.freedesktop.org/drm/intel/issues/54
  [i915#69]: https://gitlab.freedesktop.org/drm/intel/issues/69
  [i915#716]: https://gitlab.freedesktop.org/drm/intel/issues/716
  [i915#78]: https://gitlab.freedesktop.org/drm/intel/issues/78
  [i915#79]: https://gitlab.freedesktop.org/drm/intel/issues/79
  [i915#82]: https://gitlab.freedesktop.org/drm/intel/issues/82


Participating hosts (11 -> 11)
------------------------------

  No changes in participating hosts


Build changes
-------------

  * Linux: CI_DRM_8494 -> Patchwork_17686

  CI-20190529: 20190529
  CI_DRM_8494: 3d15348fde9b998e754da0b0655baf02b98e7f17 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5657: 649eae5c905a7460b44305800f95db83a6dd47cb @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_17686: 960f5e78842fef918f6f8172c2a7eefeb38adc7f @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17686/index.html

* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-19 13:08 Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-19 13:08 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

v2: Refactor all the instruction building into emitters.
v3: Make the error handling, if not perfect, at least consistent.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 823 ++++++++++++++++++
 1 file changed, 823 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..92c628f18c60 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,824 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+#define TF_COUNT 5
+	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	GEM_BUG_ON(sum > U32_MAX);
+	return sum;
+#define TF_BIAS 2
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
+}
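
Note the fixed-point scheme has changed from v1: the samples are no longer
pre-shifted, and trifilter() now returns the undivided weighted sum
a[1] + 2*a[2] + a[3], leaving a bias of 2 bits (TF_BIAS) that the callers
strip with ">> TF_BIAS" and that cycles_to_ns() removes after the ns
conversion. For example, sorted samples {100, 105, 110, 115, 400} give
105 + 2*110 + 115 = 440, and 440 >> 2 = 110 cycles reported; the outlier
never enters the sum.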
+
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = value;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+static void semaphore_set(u32 *sema, u32 value)
+{
+	WRITE_ONCE(*sema, value);
+	wmb(); /* flush the update to the cache, and beyond */
+}
+
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	return (i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(dw));
+}
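
Each emitter above appends exactly four dwords, which is how the ring
reservations in the measurement loops are sized: measure_semaphore_response()
below, for instance, asks intel_ring_begin() for 4 + 12 * ARRAY_SIZE(elapsed)
dwords, that is one emit_store_dw() up front, plus a poll/timestamp/store
triplet (3 * 4 dwords) per sample.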
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	semaphore_set(sema, -1);
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		err = PTR_ERR(cs);
+		goto err;
+	}
+
+	cs = emit_store_dw(cs, offset, 0);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+		cs = emit_store_dw(cs, offset, 0);
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+		err = -EIO;
+		goto err;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		semaphore_set(sema, i);
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+			err = -EIO;
+			goto err;
+		}
+
+		elapsed[i - 1] = sema[i] - cycles;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err)
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = sema[i] - elapsed[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of a lite restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			err = -EIO;
+			goto err;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		semaphore_set(sema, i - 1);
+		preempt_enable();
+	}
+
+	wait_for(READ_ONCE(sema[i - 1]), 500);
+	semaphore_set(sema, i - 1);
+
+	for (i = 1; i <= TF_COUNT; i++) {
+		GEM_BUG_ON(sema[i] == -1);
+		elapsed[i - 1] = sema[i] - elapsed[i];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_semaphore_poll(cs, mode, value, offset);
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		semaphore_set(sema, 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err_submit;
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_request_add(rq);
+			goto err_submit;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err_submit;
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err)
+		goto err;
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[i + 1] - sema[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_submit:
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+	semaphore_set(sema, 1);
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[TF_COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				err = PTR_ERR(rq);
+				goto err_fence;
+			}
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					goto err_fence;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				err = PTR_ERR(cs);
+				goto err_fence;
+			}
+
+			cs = emit_timestamp_store(cs, ce, addr);
+			addr += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err)
+		goto err;
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+	i915_request_put(fence);
+	semaphore_set(sema, 1);
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how long it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		u32 addr = offset + 2 * i * sizeof(u32);
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_store_dw(cs, addr, -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+			err = -EIO;
+			goto err;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_timestamp_store(cs, ce, addr);
+		cs = emit_store_dw(cs, offset, i);
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+		err = -EIO;
+		goto err;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) sent from
+	 * the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err;
+		}
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err;
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
+			err = -EIO;
+			goto err;
+		}
+
+		preempt_disable();
+		semaphore_set(sema, i);
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err)
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		GEM_BUG_ON(sema[i + 1] == -1);
+		elapsed[i] = elapsed[i] - sema[i + 1];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+	intel_gt_set_wedged(ce->engine->gt);
+	return err;
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2864,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1
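
For readers skimming the numbers above: every measure_*() feeds its five
samples through trifilter(), which sorts them, drops the extremes and sums
the middle three with the median counted twice, i.e. 4x a smoothed median.
That is why each report shifts by TF_BIAS before printing, and why
cycles_to_ns() divides by 1 << TF_BIAS. A minimal userspace sketch of the
same arithmetic (plain C with qsort() standing in for the kernel's sort();
note the comparator here avoids the u32 subtraction wrap that the in-kernel
cmp_u32() tolerates because its deltas stay small):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TF_COUNT 5
#define TF_BIAS  2 /* trifilter() returns a 4x fixed-point value */

static int cmp_u32(const void *A, const void *B)
{
	const uint32_t *a = A, *b = B;

	return (*a > *b) - (*a < *b); /* wrap-safe comparison */
}

/* Weighted sum of the middle three of five sorted samples. */
static uint32_t trifilter(uint32_t *a)
{
	qsort(a, TF_COUNT, sizeof(*a), cmp_u32);

	return a[1] + 2 * a[2] + a[3];
}

int main(void)
{
	/* Five latency samples in CS timestamp cycles, one outlier. */
	uint32_t elapsed[TF_COUNT] = { 110, 90, 100, 95, 4000 };
	uint32_t cycles = trifilter(elapsed);

	/* sorted: 90 95 100 110 4000 -> 95 + 2*100 + 110 = 405 */
	printf("latency %u cycles\n", cycles >> TF_BIAS); /* 101 */
	return 0;
}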

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


* Re: [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-19 12:47   ` Mika Kuoppala
@ 2020-05-19 12:56     ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-19 12:56 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2020-05-19 13:47:31)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > A useful metric of the system's health is how fast we can tell the GPU
> > to do various actions, so measure our latency.
> >
> > v2: Refactor all the instruction building into emitters.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> 
> Not much nitpicking left. Could have used one goto in the fence-using
> tests' error paths, but meh.

Error handling is not great here, I agree.
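
For what it's worth, the single-goto shape being suggested would look
something like this (a self-contained sketch with made-up acquire/release
helpers, not the selftest code itself):

/* Hypothetical stand-ins for the real setup steps; each may fail. */
static int acquire_fence(void) { return 0; }
static int acquire_request(void) { return 0; }
static void release_request(void) { }
static void release_fence(void) { }

/*
 * Each failure jumps to the label that unwinds exactly what has been
 * acquired so far, so the cleanup is written once instead of being
 * duplicated at every error check.
 */
static int fence_test(void)
{
	int err;

	err = acquire_fence();
	if (err)
		return err;

	err = acquire_request();
	if (err)
		goto err_fence;

	/* ... body of the test ... */

	release_request();
	release_fence();
	return 0;

err_fence:
	release_fence();
	return err;
}

int main(void) { return fence_test(); }
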
-Chris

* Re: [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-19 11:41 ` Chris Wilson
@ 2020-05-19 12:47   ` Mika Kuoppala
  2020-05-19 12:56     ` Chris Wilson
  0 siblings, 1 reply; 13+ messages in thread
From: Mika Kuoppala @ 2020-05-19 12:47 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Chris Wilson

Chris Wilson <chris@chris-wilson.co.uk> writes:

> A useful metric of the system's health is how fast we can tell the GPU
> to do various actions, so measure our latency.
>
> v2: Refactor all the instruction building into emitters.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Not much nitpicking left. Could have used one goto in the fence-using
tests' error paths, but meh.

Lots of tests poking the hw from different angles.
With clear comments, it is like a guided tour of our
submission/scheduling front.

Analyzing the differences between different sets will
be interesting.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>


* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] " Chris Wilson
  2020-05-18 16:22 ` [Intel-gfx] [PATCH] " Chris Wilson
@ 2020-05-19 11:41 ` Chris Wilson
  2020-05-19 12:47   ` Mika Kuoppala
  1 sibling, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2020-05-19 11:41 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

v2: Refactor all the instruction building into emitters.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 779 ++++++++++++++++++
 1 file changed, 779 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..db09e9cb54b8 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,780 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+#define TF_COUNT 5
+	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	GEM_BUG_ON(sum > U32_MAX);
+	return sum;
+#define TF_BIAS 2
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
+}
+
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = value;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+static void semaphore_set(u32 *sema, u32 value)
+{
+	WRITE_ONCE(*sema, value);
+	wmb(); /* flush the update to the cache, and beyond */
+}
+
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	return (i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(dw));
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	semaphore_set(sema, -1);
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_store_dw(cs, offset, 0);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+		cs = emit_store_dw(cs, offset, 0);
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		semaphore_set(sema, i);
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = sema[i] - cycles;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = sema[i] - elapsed[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of a lite restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		semaphore_set(sema, i - 1);
+		preempt_enable();
+	}
+
+	wait_for(READ_ONCE(sema[i - 1]), 500);
+	semaphore_set(sema, i - 1);
+
+	for (i = 1; i <= TF_COUNT; i++) {
+		GEM_BUG_ON(sema[i] == -1);
+		elapsed[i - 1] = sema[i] - elapsed[i];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_semaphore_poll(cs, mode, value, offset);
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		semaphore_set(sema, 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			semaphore_set(sema, 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[i + 1] - sema[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[TF_COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore takes, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			cs = emit_timestamp_store(cs, ce, addr);
+			addr += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how long it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		u32 addr = offset + 2 * i * sizeof(u32);
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, addr, -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, addr);
+		cs = emit_store_dw(cs, offset, i);
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int err;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) sent from
+	 * the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		semaphore_set(sema, i);
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		GEM_BUG_ON(sema[i + 1] == -1);
+		elapsed[i] = elapsed[i] - sema[i + 1];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce))
+			goto out;
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2820,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1
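
As an aside for anyone reading measure_semaphore_response() without a GPU
to hand, the pattern is just a release/acquire handshake and can be
modelled entirely on the CPU. Below, a thread stands in for the command
streamer's MI_SEMAPHORE_WAIT poll; this is illustrative userspace C under
that assumption, not driver code (build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static atomic_uint ready, sema;
static uint64_t ts_b;

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Stand-in for MI_SEMAPHORE_WAIT: spin until the dword matches. */
static void *cs_poll(void *arg)
{
	atomic_store(&ready, 1); /* cf. waiting for *sema == 0 above */
	while (atomic_load_explicit(&sema, memory_order_acquire) != 1)
		;
	ts_b = now_ns(); /* B: "GPU" timestamp once the write is seen */
	return NULL;
}

int main(void)
{
	pthread_t thread;
	uint64_t ts_a;

	pthread_create(&thread, NULL, cs_poll, NULL);
	while (!atomic_load(&ready)) /* don't poke before the poll is live */
		;

	ts_a = now_ns(); /* A: CPU timestamp */
	atomic_store_explicit(&sema, 1, memory_order_release); /* poke */

	pthread_join(thread, NULL);
	printf("semaphore response: %llu ns\n",
	       (unsigned long long)(ts_b - ts_a));
	return 0;
}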


* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-18 14:39 [Intel-gfx] [PATCH v2] " Chris Wilson
@ 2020-05-18 16:22 ` Chris Wilson
  2020-05-19 11:41 ` Chris Wilson
  1 sibling, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-18 16:22 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

v2: Refactor all the instruction building into emitters.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 767 ++++++++++++++++++
 1 file changed, 767 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..3e8169c9b081 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,768 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+#define TF_COUNT 5
+	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	GEM_BUG_ON(sum > U32_MAX);
+	return sum;
+#define TF_BIAS 2
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
+}
+
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = value;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	return cs;
+}
+
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+static void semaphore_set(u32 *sema, u32 value)
+{
+	WRITE_ONCE(*sema, value);
+	wmb(); /* flush the update to the cache, and beyond */
+}
+
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	return (i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(dw));
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how many cycles it takes for the HW to detect the change
+	 * in a semaphore value.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    poke semaphore
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Semaphore latency: B - A
+	 */
+
+	semaphore_set(sema, -1);
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_store_dw(cs, offset, 0);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		preempt_disable();
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		semaphore_set(sema, i);
+		preempt_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = sema[i] - cycles;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is idle, but is resting in our context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = sema[i] - elapsed[i];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for us to submit a request while the
+	 * engine is busy, polling on a semaphore in our context. With
+	 * direct submission, this will include the cost of a lite restore.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Submission latency: B - A
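+	 *
+	 * Each request below marks its slot with -1 and then spins on the
+	 * shared semaphore, so the engine is still busy executing the
+	 * previous request whenever the next one is submitted.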
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		semaphore_set(sema, i - 1);
+		preempt_enable();
+	}
+
+	wait_for(READ_ONCE(sema[i - 1]), 500);
+	semaphore_set(sema, i - 1);
+
+	for (i = 1; i <= TF_COUNT; i++) {
+		GEM_BUG_ON(sema[i] == -1);
+		elapsed[i - 1] = sema[i] - elapsed[i];
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
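+/*
+ * plug() blocks the engine with a kernel context semaphore wait; work
+ * queued behind it builds up unexecuted until the CPU pokes the
+ * semaphore to satisfy the chosen comparison (here, any non-zero value
+ * releases a MI_SEMAPHORE_SAD_NEQ_SDD wait against 0).
+ */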
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs = emit_semaphore_poll(cs, mode, value, offset);
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * that the measurements form one contiguous sequence.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
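+	 *
+	 * A plugged kernel context request holds the engine while the
+	 * whole set of requests is built behind a single submit fence;
+	 * releasing the plug then lets them run back-to-back.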
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		semaphore_set(sema, 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			semaphore_set(sema, 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			semaphore_set(sema, 1);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[TF_COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[i + 1] - sema[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[TF_COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore takes, along with all
+	 * the inter-context setup we require.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 *    B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
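+	 *
+	 * The requests below alternate between our context and the
+	 * kernel context, each fenced on its predecessor, so every
+	 * adjacent pair of timestamps straddles exactly one switch.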
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			cs = emit_timestamp_store(cs, ce, addr);
+			addr += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * We measure two latencies while triggering preemption. The first
+	 * latency is how long it takes for us to submit a preempting request.
+	 * The second latency is how long it takes for us to return from the
+	 * preemption back to the original context.
+	 *
+	 *    A: read CS_TIMESTAMP from CPU
+	 *    submit preemption
+	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
+	 *    context switch
+	 *    C: read CS_TIMESTAMP on GPU (in original context)
+	 *
+	 * Preemption dispatch latency: B - A
+	 * Preemption switch latency: C - B
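+	 *
+	 * Each pass arms a spinner in our context and then injects a
+	 * priority barrier request from the kernel context: the barrier
+	 * preempts the spinner, timestamps itself, and hands back to the
+	 * original context by bumping the semaphore.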
+	 */
+
+	if (!intel_engine_has_preemption(ce->engine))
+		return 0;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		u32 addr = offset + 2 * i * sizeof(u32);
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_store_dw(cs, addr, -1);
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+
+		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		rq = i915_request_create(ce->engine->kernel_context);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, addr);
+		cs = emit_store_dw(cs, offset, i);
+
+		intel_ring_advance(rq, cs);
+		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	/*
+	 * Measure how long it takes for the signal (interrupt) sent
+	 * from the GPU to be processed by the CPU.
+	 *
+	 *    A: read CS_TIMESTAMP on GPU
+	 *    signal
+	 *    B: read CS_TIMESTAMP from CPU
+	 *
+	 * Completion latency: B - A
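+	 *
+	 * The batch timestamps itself just before completing, while the
+	 * CPU spins on the fence callback and samples RING_TIMESTAMP as
+	 * soon as the signal is seen; both reads use the engine's clock.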
+	 */
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+		cs = emit_semaphore_poll_until(cs, offset, i);
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		semaphore_set(sema, i);
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = elapsed[i] - sema[i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
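+/*
+ * Pinning the GPU to its maximum frequency keeps the measured latencies
+ * from being inflated by mid-test clock ramping; rps.num_waiters is
+ * raised so the frequency is treated as boosted for the duration.
+ */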
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	engine->props.heartbeat_interval_ms = 0;
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms =
+		engine->defaults.heartbeat_interval_ms;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2808,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1



* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-15 15:10 Chris Wilson
  2020-05-15 17:32 ` Chris Wilson
  2020-05-15 17:58 ` Chris Wilson
@ 2020-05-15 18:02 ` Chris Wilson
  2 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-15 18:02 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 612 ++++++++++++++++++
 1 file changed, 612 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..bfb601214c4a 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,613 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
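+
+/*
+ * For example, given the samples {9, 8, 12, 49, 10}, sort() yields
+ * {8, 9, 10, 12, 49} and the filter returns (9 + 2 * 10 + 12 + 2) >> 2,
+ * i.e. 10, ignoring the outlying 49 entirely.
+ */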
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
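+
+/*
+ * Note the fixed point scheme: each elapsed[] sample is scaled by
+ * << COUNT before filtering, so cycles_to_ns() divides the converted
+ * value by 1 << COUNT again to report the unbiased result.
+ */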
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb(); /* flush the update to the cache, and beyond */
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		WRITE_ONCE(sema[0], i - 1);
+		wmb(); /* flush the update to the cache, and beyond */
+		preempt_enable();
+	}
+	WRITE_ONCE(sema[0], i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		WRITE_ONCE(sema[0], i);
+		wmb();
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	unsigned long old;
+
+	old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+
+	return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+				    unsigned long saved)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms = saved;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2653,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1



* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-15 15:10 Chris Wilson
  2020-05-15 17:32 ` Chris Wilson
@ 2020-05-15 17:58 ` Chris Wilson
  2020-05-15 18:02 ` Chris Wilson
  2 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-15 17:58 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 612 ++++++++++++++++++
 1 file changed, 612 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..c265501c26a8 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,613 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb(); /* flush the update to the cache, and beyond */
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: semaphore response %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		local_bh_disable();
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+		local_bh_enable();
+		WRITE_ONCE(sema[0], i - 1);
+		wmb(); /* flush the update to the cache, and beyond */
+		preempt_enable();
+	}
+	WRITE_ONCE(sema[0], i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq))
+				return PTR_ERR(rq);
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+	struct dma_fence_cb base;
+	bool seen;
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+	smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct signal_cb cb = { .seen = false };
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		preempt_disable();
+		WRITE_ONCE(sema[0], i);
+		wmb();
+		while (!READ_ONCE(cb.seen))
+			cpu_relax();
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		preempt_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+		elapsed[i] = (sema[i + 1] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: completion latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	unsigned long old;
+
+	old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+
+	return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+				    unsigned long saved)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms = saved;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2653,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1



* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
  2020-05-15 15:10 Chris Wilson
@ 2020-05-15 17:32 ` Chris Wilson
  2020-05-15 17:58 ` Chris Wilson
  2020-05-15 18:02 ` Chris Wilson
  2 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-15 17:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 521 ++++++++++++++++++
 1 file changed, 521 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..a47ac68955ec 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
+#include "igt_flush_test.h"
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "lib_sw_fence.h"
@@ -1524,6 +1528,522 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	return *a - *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb(); /* flush the update to the cache, and beyond */
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: semaphore response %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: idle dispatch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+
+		local_bh_disable();
+		i915_request_add(rq);
+		local_bh_enable();
+
+		WRITE_ONCE(sema[0], i - 1);
+		wmb(); /* flush the update to the cache, and beyond */
+	}
+	WRITE_ONCE(sema[0], i - 1);
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: busy dispatch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: inter-request latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				i915_request_put(fence);
+				WRITE_ONCE(sema[0], 1);
+				return PTR_ERR(rq);
+			}
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					i915_request_put(fence);
+					WRITE_ONCE(sema[0], 1);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				i915_request_put(fence);
+				WRITE_ONCE(sema[0], 1);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: context switch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
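+/*
+ * Park the heartbeat, holding an explicit engine-pm reference in its
+ * stead, so that no background kernel-context requests are emitted
+ * while we measure.
+ */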
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	unsigned long old;
+
+	old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+
+	return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+				    unsigned long saved)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms = saved;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
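+		/*
+		 * With the heartbeat parked and the frequency pinned to
+		 * max, run each latency probe in turn.
+		 */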
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2562,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-15 15:10 Chris Wilson
  2020-05-15 17:32 ` Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-15 15:10 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_request.c | 505 ++++++++++++++++++
 1 file changed, 505 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..94c79066f82a 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,13 +24,16 @@
 
 #include <linux/prime_numbers.h>
 #include <linux/pm_qos.h>
+#include <linux/sort.h>
 
 #include "gem/i915_gem_pm.h"
 #include "gem/selftests/mock_context.h"
 
+#include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
@@ -1524,6 +1527,507 @@ struct perf_series {
 	struct intel_context *ce[];
 };
 
+#define COUNT 5
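+
+/*
+ * The raw samples are scaled up by 2^COUNT before filtering and scaled
+ * back down when reported, keeping some fractional precision through
+ * the weighted average and the tick-to-ns conversion.
+ */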
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
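+/*
+ * Reject outliers: sort the samples and return a weighted average of
+ * the median and its neighbours, (a[1] + 2 * a[2] + a[3]) / 4.
+ */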
+static u32 trifilter(u32 *a)
+{
+	u64 sum;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	sum = mul_u32_u32(a[2], 2);
+	sum += a[1];
+	sum += a[3];
+
+	return sum >> 2;
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	struct i915_request *rq;
+	u32 *cs;
+	int i;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset;
+	*cs++ = 0;
+	*cs++ = 0xffffffff;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_EQ_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+	}
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		WRITE_ONCE(sema[0], i);
+		wmb();
+
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: semaphore response %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT], cycles;
+	u32 *cs;
+	int i;
+
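+	/*
+	 * Measure dispatch-from-idle: sample CS_TIMESTAMP on the CPU just
+	 * before submitting each request, whose payload stores
+	 * CS_TIMESTAMP from the GPU; the delta covers submission, engine
+	 * wakeup and the start of execution.
+	 */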
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		int err;
+
+		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+		if (err)
+			return err;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+
+		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+		i915_request_add(rq);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+		if (wait_for(READ_ONCE(sema[i]), 50)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+
+		elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+	}
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: idle dispatch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 offset =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	*cs++ = value;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(rq);
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return err;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			i915_request_add(rq);
+			WRITE_ONCE(sema[0], 1);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	i915_sw_fence_commit(submit);
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: inter-request latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static int measure_lite_restore(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	u32 *cs;
+	int i;
+
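+	/*
+	 * Measure the cost of a lite restore: each request spins on a
+	 * semaphore so that its successor is appended while the context
+	 * is still active, forcing a resubmission of the running context
+	 * with an updated tail; the CS_TIMESTAMP stores either side of
+	 * the request boundary sample the cost.
+	 */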
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq))
+			return PTR_ERR(rq);
+
+		cs = intel_ring_begin(rq, 12);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			return PTR_ERR(cs);
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 0);
+		*cs++ = 0;
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = i;
+		*cs++ = offset;
+		*cs++ = 0;
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * (2 * i + 1);
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+		intel_engine_flush_submission(ce->engine);
+
+		WRITE_ONCE(sema[0], i - 1);
+		if (i > 1 && wait_for(READ_ONCE(sema[2 * (i - 1)]), 100)) {
+			intel_gt_set_wedged(ce->engine->gt);
+			return -EIO;
+		}
+	}
+
+	WRITE_ONCE(sema[0], i - 1);
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: lite restore latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				i915_request_put(fence);
+				WRITE_ONCE(sema[0], 1);
+				return PTR_ERR(rq);
+			}
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					i915_request_put(fence);
+					WRITE_ONCE(sema[0], 1);
+					return err;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				i915_request_put(fence);
+				WRITE_ONCE(sema[0], 1);
+				return PTR_ERR(cs);
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: context switch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+}
+
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+static void rps_unpin(struct intel_gt *gt)
+{
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+	unsigned long old;
+
+	old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+	intel_engine_pm_get(engine);
+	intel_engine_park_heartbeat(engine);
+
+	return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+				    unsigned long saved)
+{
+	intel_engine_pm_put(engine);
+
+	engine->props.heartbeat_interval_ms = saved;
+}
+
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_lite_restore(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+
+		rps_unpin(engine->gt);
+		intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
 static int s_sync0(void *arg)
 {
 	struct perf_series *ps = arg;
@@ -2042,6 +2546,7 @@ static int perf_parallel_engines(void *arg)
 int i915_request_perf_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(perf_request_latency),
 		SUBTEST(perf_series_engines),
 		SUBTEST(perf_parallel_engines),
 	};
-- 
2.20.1


end of thread, other threads:[~2020-05-19 13:08 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-18  8:57 [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
2020-05-18 12:46 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915/selftests: Measure dispatch latency (rev5) Patchwork
2020-05-18 13:20 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2020-05-18 15:13 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2020-05-19 13:08 [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
2020-05-18 14:39 [Intel-gfx] [PATCH v2] " Chris Wilson
2020-05-18 16:22 ` [Intel-gfx] [PATCH] " Chris Wilson
2020-05-19 11:41 ` Chris Wilson
2020-05-19 12:47   ` Mika Kuoppala
2020-05-19 12:56     ` Chris Wilson
2020-05-15 15:10 Chris Wilson
2020-05-15 17:32 ` Chris Wilson
2020-05-15 17:58 ` Chris Wilson
2020-05-15 18:02 ` Chris Wilson
