* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-18 8:57 Chris Wilson
2020-05-18 12:46 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915/selftests: Measure dispatch latency (rev5) Patchwork
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-18 8:57 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 802 ++++++++++++++++++
1 file changed, 802 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..f9631526730d 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
+#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
@@ -1524,6 +1528,803 @@ struct perf_series {
struct intel_context *ce[];
};
+#define COUNT 5 /* samples per measurement; also reused as the fixed-point shift (<< COUNT) applied to each sample */
+
+/*
+ * sort() comparator ordering u32 samples ascending.
+ *
+ * Compare explicitly instead of returning *a - *b: that difference is
+ * computed as a u32 and then converted to int, so it reports the wrong
+ * sign whenever the two values are more than INT_MAX apart.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+
+	return *a > *b;
+}
+
+/*
+ * Sort the COUNT samples in @a in place and return the rounded 1:2:1
+ * weighted average of a[1], a[2] and a[3], i.e. of the values around
+ * the median once the array is ordered.
+ */
+static u32 trifilter(u32 *a)
+{
+	u64 acc;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	/* lower neighbour + 2 * median + upper neighbour */
+	acc = a[1];
+	acc += mul_u32_u32(a[2], 2);
+	acc += a[3];
+
+	/* divide by the weight sum (4), rounding to nearest */
+	acc += 2;
+	return acc >> 2;
+}
+
+/*
+ * Convert a filtered cycle count into nanoseconds. Callers pass the
+ * value still scaled by << COUNT, so the scaling is divided back out
+ * here (rounding to nearest).
+ */
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 scaled_ns;
+
+	scaled_ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(scaled_ns, 1 << COUNT);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how many cycles it takes for the HW to detect the change
+ * in a semaphore value.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * poke semaphore
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Semaphore latency: B - A
+ *
+ * A single request is used: it first writes 0xffffffff into sema[0]
+ * so the CPU can see that it has started, then for each sample i it
+ * polls until sema[0] == i and stores the register at
+ * mmio_base + 0x358 into sema[i].
+ *
+ * Returns 0 on success or a negative error code; the GT is wedged
+ * when one of the waits on the GPU times out.
+ */
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+ *cs++ = offset;
+ *cs++ = 0;
+ *cs++ = 0xffffffff;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) { /* request never started */
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ preempt_disable();
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ WRITE_ONCE(sema[0], i);
+ wmb(); /* flush the update to the cache, and beyond */
+ preempt_enable();
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i - 1] = (sema[i] - cycles) << COUNT; /* scaled by << COUNT; undone when printing */
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is idle, but is resting in our context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ *
+ * Request i only stores the register at mmio_base + 0x358 into
+ * sema[i]; the CPU-side sample is taken with preemption and bottom
+ * halves disabled, immediately before i915_request_add().
+ *
+ * Returns 0 on success or a negative error code; the GT is wedged
+ * when a timestamp fails to appear within the wait.
+ */
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ int err;
+
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); /* start every sample from idle */
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ preempt_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i] = (sema[i] - elapsed[i]) << COUNT; /* scaled by << COUNT; undone when printing */
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT + 1], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is busy, polling on a semaphore in our context. With
+ * direct submission, this will include the cost of a lite restore.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ *
+ * Request i stores the timestamp into sema[i], polls until
+ * sema[0] >= i, then stores the timestamp into sema[i] again.
+ * The CPU writes i - 1 into sema[0] right after submitting request i,
+ * so each submission releases the previous request. COUNT + 1
+ * requests are queued and COUNT samples are computed.
+ *
+ * NOTE(review): nothing after the submission loop waits for the
+ * final timestamp stores before sema[] is read - confirm that this
+ * cannot race with the GPU.
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_GTE_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { /* previous request must have started */
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ WRITE_ONCE(sema[0], i - 1);
+ wmb(); /* flush the update to the cache, and beyond */
+ preempt_enable();
+ }
+ WRITE_ONCE(sema[0], i - 1); /* i is ARRAY_SIZE(elapsed) + 1 here: releases the last request */
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Queue a request on @engine's kernel context consisting of a single
+ * semaphore poll: @mode selects the comparison, @value the operand and
+ * @sema (a CPU pointer into the engine's status page) the location.
+ *
+ * Returns 0 once the request has been added, or a negative error code.
+ */
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 ggtt_addr =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		/* a created request must still be added */
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	cs[0] = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	cs[1] = value;
+	cs[2] = ggtt_addr;
+	cs[3] = 0;
+
+	intel_ring_advance(rq, cs + 4);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+/*
+ * Measure the latency between consecutive requests of one context.
+ *
+ * Returns 0 on success or a negative error code. Every failure inside
+ * the submission loop goes through err_submit, which commits and releases
+ * the submit fence and releases the plug; previously a failure of
+ * i915_request_create() returned without doing either for the fence,
+ * leaking it and leaving the already queued requests waiting on it.
+ */
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 * A: read CS_TIMESTAMP on GPU
+	 * advance request
+	 * B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		WRITE_ONCE(sema[0], 1); /* release the plug */
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err_submit;
+		}
+
+		/* hold the request back until the whole batch is built */
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_request_add(rq);
+			goto err_submit;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err_submit;
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_submit:
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+	WRITE_ONCE(sema[0], 1); /* release the plug */
+	return err;
+}
+
+/*
+ * Measure the latency of switching between @ce and the kernel context.
+ *
+ * Returns 0 on success or a negative error code. Failures while
+ * building the request chain go through err_fence, which drops the
+ * reference held on the previous request and releases the plug;
+ * previously those paths returned directly, leaking the reference and
+ * leaving the plug request polling.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 * A: read CS_TIMESTAMP on GPU
+	 * switch context
+	 * B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				err = PTR_ERR(rq);
+				goto err_fence;
+			}
+
+			/* chain each request behind the previous one */
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					goto err_fence;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				err = PTR_ERR(cs);
+				goto err_fence;
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+	i915_request_put(fence);
+	WRITE_ONCE(sema[0], 1); /* release the plug */
+	wmb(); /* flush the update to the cache, and beyond */
+	return err;
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * We measure two latencies while triggering preemption. The first
+ * latency is how long it takes for us to submit a preempting request.
+ * The second latency is how long it takes for us to return from the
+ * preemption back to the original context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit preemption
+ * B: read CS_TIMESTAMP on GPU (in preempting context)
+ * context switch
+ * C: read CS_TIMESTAMP on GPU (in original context)
+ *
+ * Preemption dispatch latency: B - A
+ * Preemption switch latency: C - B
+ *
+ * For sample i the request on @ce stores a timestamp into
+ * sema[2 * i], polls until sema[0] >= i and stores a timestamp into
+ * sema[2 * i + 1]. The kernel context request, submitted with
+ * I915_PRIORITY_BARRIER, overwrites sema[2 * i] with its own
+ * timestamp (B) and writes i into sema[0] to release the poll.
+ *
+ * Returns 0 without measuring when the engine has no preemption,
+ * otherwise 0 on success or a negative error code.
+ */
+
+ if (!intel_engine_has_preemption(ce->engine))
+ return 0;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * (2 * i + 0);
+ *cs++ = 0;
+
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_GTE_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * (2 * i + 1);
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ if (wait_for(READ_ONCE(sema[2 * i]), 500)) { /* original request must be running first */
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ rq = i915_request_create(ce->engine->kernel_context);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 8);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * (2 * i + 0);
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+ *cs++ = offset;
+ *cs++ = 0;
+ *cs++ = i;
+
+ intel_ring_advance(rq, cs);
+ rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ }
+
+ if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) { /* i is COUNT + 1 here: last C timestamp */
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[2 * i + 0] - elapsed[i - 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[2 * i + 1] - sema[2 * i + 0]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb {
+ struct dma_fence_cb base; /* node handed to dma_fence_add_callback() */
+ bool seen; /* set to true by signal_cb() when the callback runs */
+};
+
+/* Fence callback: mark the struct signal_cb embedding @cb as seen. */
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *waiter = container_of(cb, struct signal_cb, base);
+
+	smp_store_mb(waiter->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for the signal (interrupt) to be
+ * sent from the GPU to be processed by the CPU.
+ *
+ * A: read CS_TIMESTAMP on GPU
+ * signal
+ * B: read CS_TIMESTAMP from CPU
+ *
+ * Completion latency: B - A
+ *
+ * Request i stores a timestamp into sema[i], polls until
+ * sema[0] == i and stores a timestamp into sema[i] again. The CPU
+ * writes i into sema[0], spins until the fence callback has set
+ * cb.seen and then reads the timestamp register itself.
+ *
+ * NOTE(review): the return value of dma_fence_add_callback() is not
+ * checked and the spin on cb.seen has no timeout - confirm that both
+ * are acceptable here.
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct signal_cb cb = { .seen = false };
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) { /* request must be running before we release it */
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ preempt_disable();
+ WRITE_ONCE(sema[0], i);
+ wmb();
+ while (!READ_ONCE(cb.seen))
+ cpu_relax();
+
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ preempt_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+ elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: completion latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Take a waiter count and full forcewake on @gt, then request the
+ * maximum RPS frequency under the rps lock. Undone by rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+/* Drop the forcewake and the waiter count taken by rps_pin(). */
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+ engine->props.heartbeat_interval_ms = 0; /* restored from engine->defaults by engine_heartbeat_enable() */
+
+ intel_engine_pm_get(engine); /* released by intel_engine_pm_put() in engine_heartbeat_enable() */
+ intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+ intel_engine_pm_put(engine); /* drops the reference taken in engine_heartbeat_disable() */
+
+ engine->props.heartbeat_interval_ms =
+ engine->defaults.heartbeat_interval_ms; /* undo the 0 written by engine_heartbeat_disable() */
+}
+
+/*
+ * Selftest entry point: run every latency measurement on each uabi
+ * engine, with the heartbeat disabled and the GPU frequency pinned.
+ *
+ * @arg is the struct drm_i915_private under test. Returns 0 on success
+ * (or when the device is older than gen8 and the test is skipped), a
+ * negative error code from the first failing step, or -EIO if the final
+ * igt_flush_test() fails.
+ */
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			/* report the failure instead of returning 0 */
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		/* stop at the first measurement that fails */
+		err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2843,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-19 13:08 Chris Wilson
0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-19 13:08 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
v2: Refactor all the instruction building into emitters.
v3: Make the error handling, if not perfect, at least consistent.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 823 ++++++++++++++++++
1 file changed, 823 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..92c628f18c60 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
+#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
@@ -1524,6 +1528,824 @@ struct perf_series {
struct intel_context *ce[];
};
+/*
+ * sort() comparator ordering u32 samples ascending.
+ *
+ * Compare explicitly instead of returning *a - *b: that difference is
+ * computed as a u32 and then converted to int, so it reports the wrong
+ * sign whenever the two values are more than INT_MAX apart.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+
+	return *a > *b;
+}
+
+#define TF_COUNT 5 /* number of samples sorted by trifilter() */
+#define TF_BIAS 2 /* the result stays multiplied by 1 << TF_BIAS (the 1:2:1 weight sum) */
+
+/*
+ * Sort the TF_COUNT samples in @a in place and return the 1:2:1
+ * weighted sum of a[1], a[2] and a[3]. The sum is not divided here;
+ * users shift it down by TF_BIAS.
+ */
+static u32 trifilter(u32 *a)
+{
+	u64 acc;
+
+	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+	/* lower neighbour + 2 * median + upper neighbour */
+	acc = a[1];
+	acc += mul_u32_u32(a[2], 2);
+	acc += a[3];
+
+	GEM_BUG_ON(acc > U32_MAX);
+	return acc;
+}
+
+/*
+ * Convert a trifilter() result into nanoseconds, dividing out the
+ * 1 << TF_BIAS scaling (rounding to nearest).
+ */
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 scaled_ns;
+
+	scaled_ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(scaled_ns, 1 << TF_BIAS);
+}
+
+/*
+ * Emit four dwords at @cs that store the RING_TIMESTAMP register of
+ * @ce's engine to the GGTT address @offset. Returns the advanced pointer.
+ */
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	cs[0] = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	cs[1] = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	cs[2] = offset;
+	cs[3] = 0;
+
+	return cs + 4;
+}
+
+/*
+ * Emit four dwords at @cs that store the immediate @value to the GGTT
+ * address @offset. Returns the advanced pointer.
+ */
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	cs[0] = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	cs[1] = offset;
+	cs[2] = 0;
+	cs[3] = value;
+
+	return cs + 4;
+}
+
+/*
+ * Emit a four-dword polling MI_SEMAPHORE_WAIT at @cs on the GGTT address
+ * @offset, comparing against @value with the comparison selected by
+ * @mode. Returns the advanced pointer.
+ */
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	cs[0] = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		mode;
+	cs[1] = value;
+	cs[2] = offset;
+	cs[3] = 0;
+
+	return cs + 4;
+}
+
+/*
+ * emit_semaphore_poll() with the MI_SEMAPHORE_SAD_EQ_SDD comparison;
+ * note the (offset, value) argument order of this wrapper.
+ */
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	const u32 mode = MI_SEMAPHORE_SAD_EQ_SDD;
+
+	return emit_semaphore_poll(cs, mode, value, offset);
+}
+
+/* Write @value to the semaphore dword at @sema, followed by a write barrier. */
+static void semaphore_set(u32 *sema, u32 value)
+{
+	WRITE_ONCE(sema[0], value);
+	wmb(); /* flush the update to the cache, and beyond */
+}
+
+/*
+ * Zero the 21-dword scratch area at status_page.addr + 1000 of @ce's
+ * engine and return a pointer to it.
+ */
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	u32 *scratch = ce->engine->status_page.addr + 1000;
+
+	return memset32(scratch, 0, 21);
+}
+
+/*
+ * Translate @dw, a CPU pointer into the status page of @ce's engine,
+ * into the matching GGTT address.
+ */
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	const u32 page_base = i915_ggtt_offset(ce->engine->status_page.vma);
+
+	return page_base + offset_in_page(dw);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int err;
+ int i;
+
+ /*
+ * Measure how many cycles it takes for the HW to detect the change
+ * in a semaphore value.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * poke semaphore
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Semaphore latency: B - A
+ *
+ * sema[0] is the handshake: the CPU presets it to -1, the request
+ * writes 0 when it starts and again after every timestamp store,
+ * and the CPU waits for that 0 before using sema[i].
+ *
+ * Returns 0 on success or a negative error code; failures after the
+ * request has been created wedge the GT.
+ */
+
+ semaphore_set(sema, -1);
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ err = PTR_ERR(cs);
+ goto err;
+ }
+
+ cs = emit_store_dw(cs, offset, 0);
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+ cs = emit_store_dw(cs, offset, 0);
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ if (wait_for(READ_ONCE(*sema) == 0, 50)) { /* request never started */
+ err = -EIO;
+ goto err;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ preempt_disable();
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ semaphore_set(sema, i);
+ preempt_enable();
+
+ if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+ err = -EIO;
+ goto err;
+ }
+
+ elapsed[i - 1] = sema[i] - cycles;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int err;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is idle, but is resting in our context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ *
+ * Request i only stores the timestamp into sema[i]; the CPU-side
+ * sample is taken with preemption and bottom halves disabled,
+ * immediately before i915_request_add().
+ *
+ * Returns 0 on success or a negative error code. A failed idle wait
+ * inside the loop returns directly; the other failures wedge the GT.
+ */
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); /* start every sample from idle */
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err;
+ }
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ err = PTR_ERR(cs);
+ goto err;
+ }
+
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ preempt_enable();
+ }
+
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); /* wait for the last request before reading sema[] */
+ if (err)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+ elapsed[i] = sema[i] - elapsed[i];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT + 1], cycles;
+ u32 *cs;
+ int err;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is busy, polling on a semaphore in our context. With
+ * direct submission, this will include the cost of a lite restore.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ *
+ * Request i writes -1 into sema[i] to show it has started, polls
+ * until sema[0] == i and then stores the timestamp into sema[i].
+ * The CPU writes i - 1 into sema[0] right after submitting request i,
+ * so each submission releases the previous request. TF_COUNT + 1
+ * requests are queued and TF_COUNT samples are computed.
+ *
+ * NOTE(review): the result of the wait_for() after the loop is
+ * ignored; if it timed out, the GEM_BUG_ON() below could presumably
+ * trigger - confirm whether that wait should jump to err instead.
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err;
+ }
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ err = PTR_ERR(cs);
+ goto err;
+ }
+
+ cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { /* previous request must have started */
+ err = -EIO;
+ goto err;
+ }
+
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ semaphore_set(sema, i - 1);
+ preempt_enable();
+ }
+
+ wait_for(READ_ONCE(sema[i - 1]), 500);
+ semaphore_set(sema, i - 1); /* i is ARRAY_SIZE(elapsed) + 1 here: releases the last request */
+
+ for (i = 1; i <= TF_COUNT; i++) {
+ GEM_BUG_ON(sema[i] == -1);
+ elapsed[i - 1] = sema[i] - elapsed[i];
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+}
+
+/*
+ * Queue a request on @engine's kernel context consisting of a single
+ * semaphore poll: @mode selects the comparison, @value the operand and
+ * @sema (a CPU pointer into the engine's status page) the location.
+ *
+ * Returns 0 once the request has been added, or a negative error code.
+ */
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 ggtt_addr =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		/* a created request must still be added */
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	intel_ring_advance(rq, emit_semaphore_poll(cs, mode, value, ggtt_addr));
+	i915_request_add(rq);
+
+	return 0;
+}
+
+static int measure_inter_request(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT + 1], cycles;
+ struct i915_sw_fence *submit;
+ int i, err;
+
+ /*
+ * Measure how long it takes to advance from one request into the
+ * next. Between each request we flush the GPU caches to memory,
+ * update the breadcrumbs, and then invalidate those caches.
+ * We queue up all the requests to be submitted in one batch so
+ * it should be one set of contiguous measurements.
+ *
+ * A: read CS_TIMESTAMP on GPU
+ * advance request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Request latency: B - A
+ *
+ * A plug request on the kernel context polls sema[0] while the
+ * TF_COUNT + 1 timestamp requests are built, each awaiting the
+ * shared submit fence. Committing the fence and then writing 1 into
+ * sema[0] lets them run; request i stores its timestamp in sema[i].
+ *
+ * Returns 0 on success or a negative error code. Failures inside the
+ * loop commit and release the submit fence, release the plug and
+ * wedge the GT.
+ */
+
+ err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+ if (err)
+ return err;
+
+ submit = heap_fence_create(GFP_KERNEL);
+ if (!submit) {
+ semaphore_set(sema, 1);
+ return -ENOMEM;
+ }
+
+ intel_engine_flush_submission(ce->engine);
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ u32 *cs;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err_submit;
+ }
+
+ err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+ submit,
+ GFP_KERNEL);
+ if (err < 0) {
+ i915_request_add(rq);
+ goto err_submit;
+ }
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ err = PTR_ERR(cs);
+ goto err_submit;
+ }
+
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+ }
+ local_bh_disable();
+ i915_sw_fence_commit(submit);
+ local_bh_enable();
+ intel_engine_flush_submission(ce->engine);
+ heap_fence_put(submit);
+
+ semaphore_set(sema, 1); /* release the plug */
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ goto err;
+
+ for (i = 1; i <= TF_COUNT; i++)
+ elapsed[i - 1] = sema[i + 1] - sema[i];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: inter-request latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_submit:
+ i915_sw_fence_commit(submit);
+ heap_fence_put(submit);
+ semaphore_set(sema, 1);
+err:
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+}
+
+/*
+ * Report the latency of switching between @ce and the engine's kernel
+ * context.
+ *
+ * After a plug() request, each outer iteration queues one request on @ce and
+ * one on the kernel context. Every request awaits the fence of the request
+ * queued before it and stores the engine timestamp into the next scratch
+ * dword. Once the plug semaphore is released and the GT idles, the samples
+ * are the differences sema[2i + 2] - sema[2i + 1].
+ *
+ * Returns the result of the final intel_gt_wait_for_idle() on success. On
+ * failure after plug() succeeded, the GT is wedged and the error returned.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ struct i915_request *fence = NULL;
+ u32 elapsed[TF_COUNT + 1], cycles;
+ int i, j, err;
+ u32 *cs;
+
+ /*
+ * Measure how long it takes to advance from one request in one
+ * context to a request in another context. This allows us to
+ * measure how long the context save/restore take, along with all
+ * the inter-context setup we require.
+ *
+ * A: read CS_TIMESTAMP on GPU
+ * switch context
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Context switch latency: B - A
+ */
+
+ err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+ if (err)
+ return err;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct intel_context *arr[] = {
+ ce, ce->engine->kernel_context
+ };
+ /* iteration i uses dwords 2i (for ce) and 2i + 1 (kernel context) */
+ u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+ for (j = 0; j < ARRAY_SIZE(arr); j++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(arr[j]);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err_fence;
+ }
+
+ /* chain onto the request queued before this one, if any */
+ if (fence) {
+ err = i915_request_await_dma_fence(rq,
+ &fence->fence);
+ if (err) {
+ i915_request_add(rq);
+ goto err_fence;
+ }
+ }
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ err = PTR_ERR(cs);
+ goto err_fence;
+ }
+
+ cs = emit_timestamp_store(cs, ce, addr);
+ addr += sizeof(u32);
+
+ intel_ring_advance(rq, cs);
+
+ /* swap the tracked reference over to the newest request */
+ i915_request_put(fence);
+ fence = i915_request_get(rq);
+
+ i915_request_add(rq);
+ }
+ }
+ i915_request_put(fence);
+ intel_engine_flush_submission(ce->engine);
+
+ /* release the plug and wait for the whole chain to execute */
+ semaphore_set(sema, 1);
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ goto err;
+
+ for (i = 1; i <= TF_COUNT; i++)
+ elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: context switch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+ /* drop the tracked request reference and release the plug */
+ i915_request_put(fence);
+ semaphore_set(sema, 1);
+err:
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+}
+
+/*
+ * Report two latencies around preempting @ce with the kernel context.
+ *
+ * Each iteration queues a request on @ce that writes -1 to sema[2i], polls
+ * until sema[0] == i and then stores the timestamp in sema[2i + 1]. Once the
+ * CPU sees the -1, it queues a kernel-context request at
+ * I915_PRIORITY_BARRIER that stores the timestamp in sema[2i] and writes i
+ * to sema[0]. The CPU reads the timestamp just before adding that request.
+ *
+ * Returns 0 immediately if the engine has no preemption, otherwise the
+ * result of the final intel_gt_wait_for_idle(). On any failure the GT is
+ * wedged and the error returned.
+ */
+static int measure_preemption(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int err;
+ int i;
+
+ /*
+ * We measure two latencies while triggering preemption. The first
+ * latency is how long it takes for us to submit a preempting request.
+ * The second latency is how long it takes for us to return from the
+ * preemption back to the original context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit preemption
+ * B: read CS_TIMESTAMP on GPU (in preempting context)
+ * context switch
+ * C: read CS_TIMESTAMP on GPU (in original context)
+ *
+ * Preemption dispatch latency: B - A
+ * Preemption switch latency: C - B
+ */
+
+ if (!intel_engine_has_preemption(ce->engine))
+ return 0;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ u32 addr = offset + 2 * i * sizeof(u32);
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err;
+ }
+
+ /* three emitters of 4 dwords each */
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ err = PTR_ERR(cs);
+ goto err;
+ }
+
+ cs = emit_store_dw(cs, addr, -1);
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ /* wait for the -1 marker written by the request just queued */
+ if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+ err = -EIO;
+ goto err;
+ }
+
+ rq = i915_request_create(ce->engine->kernel_context);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err;
+ }
+
+ cs = intel_ring_begin(rq, 8);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ err = PTR_ERR(cs);
+ goto err;
+ }
+
+ /* B overwrites the -1 marker; writing i releases the poll above */
+ cs = emit_timestamp_store(cs, ce, addr);
+ cs = emit_store_dw(cs, offset, i);
+
+ intel_ring_advance(rq, cs);
+ rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+ /* A: CPU-side timestamp taken right before submission */
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ }
+
+ /* i is now one past the last iteration: wait for that iteration's B */
+ if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+ err = -EIO;
+ goto err;
+ }
+
+ for (i = 1; i <= TF_COUNT; i++)
+ elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ /* NOTE(review): nothing above waits explicitly for the last C sample */
+ for (i = 1; i <= TF_COUNT; i++)
+ elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+}
+
+/* Fence callback node plus the flag that signal_cb() raises when it runs. */
+struct signal_cb {
+ struct dma_fence_cb base; /* node handed to dma_fence_add_callback() */
+ bool seen; /* set to true by signal_cb() */
+};
+
+/* Fence callback: raise the 'seen' flag of the struct signal_cb embedding @cb. */
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *waiter = container_of(cb, struct signal_cb, base);
+
+	smp_store_mb(waiter->seen, true); /* be safe, be strong */
+}
+
+/*
+ * Report the latency between a request completing on the GPU and the CPU
+ * running its fence callback.
+ *
+ * Each request writes -1 to sema[i], polls until sema[0] == i and then
+ * stores the timestamp in sema[i]. The CPU registers signal_cb() on the
+ * request's fence, waits for the -1 marker, releases the semaphore and spins
+ * until the callback has run before sampling the timestamp itself.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle() on success; on
+ * failure the GT is wedged and the error returned.
+ */
+static int measure_completion(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int err;
+ int i;
+
+ /*
+ * Measure how long it takes for the signal (interrupt) to be
+ * sent from the GPU to be processed by the CPU.
+ *
+ * A: read CS_TIMESTAMP on GPU
+ * signal
+ * B: read CS_TIMESTAMP from CPU
+ *
+ * Completion latency: B - A
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct signal_cb cb = { .seen = false };
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err;
+ }
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ err = PTR_ERR(cs);
+ goto err;
+ }
+
+ /* marker, then wait for the CPU, then A into the same dword */
+ cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ /* NOTE(review): the return value is ignored; the spin below relies on the callback running */
+ dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+
+ /* wait until the request is executing and has written its marker */
+ if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
+ err = -EIO;
+ goto err;
+ }
+
+ /* release the request and busy-wait for its fence callback */
+ preempt_disable();
+ semaphore_set(sema, i);
+ while (!READ_ONCE(cb.seen))
+ cpu_relax();
+
+ /* B: CPU-side timestamp once the callback has been seen */
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ preempt_enable();
+ }
+
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ /* every marker must have been replaced by a timestamp by now */
+ GEM_BUG_ON(sema[i + 1] == -1);
+ elapsed[i] = elapsed[i] - sema[i + 1];
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: completion latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err:
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+}
+
+/*
+ * Register as an RPS waiter, take all forcewake domains and, under the rps
+ * lock, request gt->rps.max_freq. Paired with rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+/* Undo rps_pin(): release forcewake and drop the RPS waiter count. */
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+/*
+ * Zero the engine's heartbeat interval, then take an engine-pm reference
+ * and park the heartbeat. Paired with engine_heartbeat_enable().
+ */
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+ engine->props.heartbeat_interval_ms = 0;
+
+ intel_engine_pm_get(engine);
+ intel_engine_park_heartbeat(engine);
+}
+
+/*
+ * Undo engine_heartbeat_disable(): drop the engine-pm reference and restore
+ * the heartbeat interval from the engine's defaults.
+ */
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+ intel_engine_pm_put(engine);
+
+ engine->props.heartbeat_interval_ms =
+ engine->defaults.heartbeat_interval_ms;
+}
+
+/*
+ * Selftest entry point: run every latency measurement on each uabi engine.
+ *
+ * For each engine a fresh context is created and pinned, the heartbeat is
+ * disabled and the frequency pinned via rps_pin() while the measure_*()
+ * helpers run in sequence; the first failure stops the run. Returns 0 on
+ * success (or when the device is older than gen8), a negative error code
+ * otherwise, and -EIO if the final igt_flush_test() fails.
+ */
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			/* propagate the failure rather than returning a stale 0 */
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2864,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Intel-gfx] [PATCH v2] drm/i915/selftests: Measure dispatch latency
@ 2020-05-18 14:39 Chris Wilson
2020-05-18 16:22 ` [Intel-gfx] [PATCH] " Chris Wilson
2020-05-19 11:41 ` Chris Wilson
0 siblings, 2 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-18 14:39 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
v2: Refactor all the instruction building into emitters.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 764 ++++++++++++++++++
1 file changed, 764 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..c6dff5145a3c 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
+#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
@@ -1524,6 +1528,765 @@ struct perf_series {
struct intel_context *ce[];
};
+/* Number of samples sorted by trifilter(); also reused as the shift applied to each sample */
+#define COUNT 5
+
+/*
+ * sort() comparator for u32 values: returns <0, 0 or >0 when *A is less
+ * than, equal to or greater than *B.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	/*
+	 * Compare explicitly: the unsigned difference converted to int has
+	 * the wrong sign once the values are more than INT_MAX apart.
+	 */
+	if (*a < *b)
+		return -1;
+
+	return *a > *b;
+}
+
+/*
+ * Sort the COUNT samples in @a in place and return the rounded 1:2:1
+ * weighted mean of a[1], a[2] and a[3].
+ */
+static u32 trifilter(u32 *a)
+{
+	u64 weighted;
+
+	sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+	/* middle sample counts twice, its two neighbours once each */
+	weighted = mul_u32_u32(a[2], 2);
+	weighted += a[1];
+	weighted += a[3];
+	weighted += 2; /* round to nearest when dividing by 4 */
+
+	return weighted >> 2;
+}
+
+/*
+ * Convert @cycles of the CS timestamp to nanoseconds and divide the result
+ * by 1 << COUNT, rounding to nearest.
+ */
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+	u64 nsecs;
+
+	nsecs = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+	return DIV_ROUND_CLOSEST(nsecs, 1 << COUNT);
+}
+
+/*
+ * Emit 4 dwords at @cs that store the RING_TIMESTAMP register of @ce's
+ * engine to GGTT address @offset; returns the advanced write pointer.
+ */
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	cs[0] = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	cs[1] = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+	cs[2] = offset;
+	cs[3] = 0;
+
+	return cs + 4;
+}
+
+/*
+ * Emit 4 dwords at @cs that store the immediate @value to GGTT address
+ * @offset; returns the advanced write pointer.
+ */
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	cs[0] = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	cs[1] = offset;
+	cs[2] = 0;
+	cs[3] = value;
+
+	return cs + 4;
+}
+
+/*
+ * Emit a 4-dword polling MI_SEMAPHORE_WAIT on GGTT address @offset using
+ * compare @mode against @value; returns the advanced write pointer.
+ */
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	const u32 cmd = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			mode;
+
+	cs[0] = cmd;
+	cs[1] = value;
+	cs[2] = offset;
+	cs[3] = 0;
+
+	return cs + 4;
+}
+
+/* Emit a semaphore poll on @offset using the MI_SEMAPHORE_SAD_EQ_SDD compare mode against @value. */
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+	const u32 mode = MI_SEMAPHORE_SAD_EQ_SDD;
+
+	return emit_semaphore_poll(cs, mode, value, offset);
+}
+
+/* Store @value into *@sema from the CPU and follow it with a write barrier. */
+static void semaphore_set(u32 *sema, u32 value)
+{
+ WRITE_ONCE(*sema, value);
+ wmb(); /* flush the update to the cache, and beyond */
+}
+
+/*
+ * Zero the 21-entry scratch area that starts at status_page.addr + 1000 on
+ * @ce's engine, and return a pointer to it.
+ */
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+	u32 *scratch = ce->engine->status_page.addr + 1000;
+
+	return memset32(scratch, 0, 21);
+}
+
+/*
+ * Return the GGTT address of @dw: the GGTT offset of the engine's status
+ * page vma plus @dw's offset within its page.
+ */
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	const u32 page = i915_ggtt_offset(ce->engine->status_page.vma);
+
+	return page + offset_in_page(dw);
+}
+
+/*
+ * Report how long the GPU takes to notice a semaphore written by the CPU.
+ *
+ * One request on @ce writes 0 to sema[0] and then, for each sample i, polls
+ * until sema[0] == i and stores the timestamp in sema[i]. The CPU reads the
+ * timestamp, writes i and waits for sema[i] to become non-zero.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), an error from
+ * building the request, or -EIO (after wedging the GT) if a wait times out.
+ */
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how many cycles it takes for the HW to detect the change
+ * in a semaphore value.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * poke semaphore
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Semaphore latency: B - A
+ */
+
+ /* the request replaces this with 0 once it starts executing */
+ semaphore_set(sema, -1);
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ /* 4 dwords for the initial store, 8 for each poll + timestamp pair */
+ cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ cs = emit_store_dw(cs, offset, 0);
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ /* wait for the request to start (it writes 0 to the semaphore) */
+ if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ /* A: sample the timestamp and poke the semaphore back to back */
+ preempt_disable();
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ semaphore_set(sema, i);
+ preempt_enable();
+
+ /* B: the GPU stores its timestamp into sema[i] */
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Report the latency from submitting a request on @ce to the GPU executing
+ * it, with the GT waited idle before each submission.
+ *
+ * Request i stores the timestamp in sema[i]; the CPU samples the timestamp
+ * immediately before i915_request_add() with preemption and bottom halves
+ * disabled.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), an error from
+ * the idle wait or request construction, or -EIO (after wedging the GT) if
+ * a sample never arrives.
+ */
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is idle, but is resting in our context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ */
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ int err;
+
+ /* wait for idle before building each request */
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ /* A: CPU timestamp taken right before the request is added */
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ preempt_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ /* B: wait for the GPU timestamp of request i to land */
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Report the submission latency while @ce is already busy on the engine.
+ *
+ * Request i stores the timestamp in sema[i], polls until sema[0] == i and
+ * then stores the timestamp in sema[i] again. The CPU waits for the previous
+ * request to have written its first timestamp, samples the timestamp, adds
+ * the new request and writes i - 1 to the semaphore.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), an error from
+ * request construction, or -EIO (after wedging the GT) on a timeout.
+ */
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[COUNT + 1], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is busy, polling on a semaphore in our context. With
+ * direct submission, this will include the cost of a lite restore.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* first store marks "started"; the second replaces it after the poll */
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ /* wait until the previous request has written its first timestamp */
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* sample the CPU timestamp, submit, then write i - 1 to the semaphore */
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ semaphore_set(sema, i - 1);
+ preempt_enable();
+ }
+
+ /* NOTE(review): the result of this wait is not checked */
+ wait_for(READ_ONCE(sema[i - 1]), 500);
+ semaphore_set(sema, i - 1);
+
+ /* sample i pairs sema[i] with the CPU timestamp taken one iteration later */
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Queue a kernel-context request on @engine containing a single semaphore
+ * poll on @sema with the given compare @mode and @value.
+ *
+ * Returns 0 once the request has been added, or a negative error code if
+ * the request or its ring space could not be obtained.
+ */
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 hwsp = i915_ggtt_offset(engine->status_page.vma);
+	const u32 offset = hwsp + offset_in_page(sema);
+	struct i915_request *rq;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (!IS_ERR(cs)) {
+		cs = emit_semaphore_poll(cs, mode, value, offset);
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+		return 0;
+	}
+
+	/* no ring space: still add the request before reporting the error */
+	i915_request_add(rq);
+	return PTR_ERR(cs);
+}
+
+/*
+ * Report the GPU-side latency between consecutive requests on @ce.
+ *
+ * A plug() request is queued first, then ARRAY_SIZE(elapsed) requests that
+ * each store the engine timestamp into their own scratch dword. All of them
+ * await one shared submit fence, which is committed in one go before the
+ * plug semaphore is released.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle() on success, a
+ * negative error code if a request could not be built (the submit fence is
+ * then committed and released, and the plug semaphore set to 1), or -EIO
+ * after wedging the GT if the last timestamp never arrives.
+ */
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 * A: read CS_TIMESTAMP on GPU
+	 *    advance request
+	 * B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		semaphore_set(sema, 1);
+		return -ENOMEM;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			/* earlier requests still await @submit: release it too */
+			err = PTR_ERR(rq);
+			goto err_submit;
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_request_add(rq);
+			goto err_submit;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err_submit;
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_submit:
+	/* single unwind: commit and drop the fence, then release the plug */
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+	semaphore_set(sema, 1);
+	return err;
+}
+
+/*
+ * Report the latency of switching between @ce and the engine's kernel
+ * context.
+ *
+ * After a plug() request, each outer iteration queues one request on @ce and
+ * one on the kernel context. Every request awaits the fence of the request
+ * queued before it and stores the engine timestamp into the next scratch
+ * dword.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle() on success, a
+ * negative error code if a request could not be built (the tracked request
+ * reference is dropped and the plug semaphore set to 1), or -EIO after
+ * wedging the GT if the last timestamp never arrives.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 * A: read CS_TIMESTAMP on GPU
+	 *    switch context
+	 * B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 timestamp = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				err = PTR_ERR(rq);
+				goto err_fence;
+			}
+
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					goto err_fence;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				err = PTR_ERR(cs);
+				goto err_fence;
+			}
+
+			cs = emit_timestamp_store(cs, ce, timestamp);
+			timestamp += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			/* swap the tracked reference over to the newest request */
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> COUNT,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+	/*
+	 * Drop the reference on the last queued request and release the plug.
+	 * @fence may still be NULL here, exactly as on the first pass through
+	 * the loop body above.
+	 */
+	i915_request_put(fence);
+	semaphore_set(sema, 1);
+	return err;
+}
+
+/*
+ * Report two latencies around preempting @ce with the kernel context.
+ *
+ * Each iteration queues a request on @ce that stores the timestamp in
+ * sema[2i], polls until sema[0] == i and then stores the timestamp in
+ * sema[2i + 1]. Once the CPU sees sema[2i] become non-zero, it queues a
+ * kernel-context request at I915_PRIORITY_BARRIER that overwrites sema[2i]
+ * with its own timestamp and writes i to sema[0].
+ *
+ * Returns 0 immediately if the engine has no preemption, otherwise the
+ * result of the final intel_gt_wait_for_idle(), an error from request
+ * construction, or -EIO (after wedging the GT) on a timeout.
+ *
+ * NOTE(review): the early error returns inside the loop can leave the first
+ * request polling on the semaphore; presumably the caller's flush handles
+ * that - confirm.
+ */
+static int measure_preemption(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * We measure two latencies while triggering preemption. The first
+ * latency is how long it takes for us to submit a preempting request.
+ * The second latency is how long it takes for us to return from the
+ * preemption back to the original context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit preemption
+ * B: read CS_TIMESTAMP on GPU (in preempting context)
+ * context switch
+ * C: read CS_TIMESTAMP on GPU (in original context)
+ *
+ * Preemption dispatch latency: B - A
+ * Preemption switch latency: C - B
+ */
+
+ if (!intel_engine_has_preemption(ce->engine))
+ return 0;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ u32 timestamp = offset + 2 * i * sizeof(u32);
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ /* three emitters of 4 dwords each */
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ cs = emit_timestamp_store(cs, ce, timestamp);
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, timestamp + sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ /* wait for the first timestamp store of the request just queued */
+ if (wait_for(READ_ONCE(sema[2 * i]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ rq = i915_request_create(ce->engine->kernel_context);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 8);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* B overwrites sema[2i]; writing i releases the poll above */
+ cs = emit_timestamp_store(cs, ce, timestamp);
+ cs = emit_store_dw(cs, offset, i);
+
+ intel_ring_advance(rq, cs);
+ rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+ /* A: CPU-side timestamp taken right before submission */
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ }
+
+ /* i is now one past the last iteration: wait for that iteration's C */
+ if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[2 * i + 0] - elapsed[i - 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[2 * i + 1] - sema[2 * i + 0]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/* Fence callback node plus the flag that signal_cb() raises when it runs. */
+struct signal_cb {
+ struct dma_fence_cb base; /* node handed to dma_fence_add_callback() */
+ bool seen; /* set to true by signal_cb() */
+};
+
+/* Fence callback: raise the 'seen' flag of the struct signal_cb embedding @cb. */
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct signal_cb *waiter = container_of(cb, struct signal_cb, base);
+
+	smp_store_mb(waiter->seen, true); /* be safe, be strong */
+}
+
+/*
+ * Report the latency between a request completing on the GPU and the CPU
+ * running its fence callback.
+ *
+ * Each request stores the timestamp in sema[i], polls until sema[0] == i
+ * and then stores the timestamp in sema[i] again. The CPU registers
+ * signal_cb() on the request's fence, waits for the first store, releases
+ * the semaphore and spins until the callback has run before sampling the
+ * timestamp itself.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), an error from
+ * request construction, or -EIO (after wedging the GT) on a timeout.
+ */
+static int measure_completion(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for the signal (interrupt) to be
+ * sent from the GPU to be processed by the CPU.
+ *
+ * A: read CS_TIMESTAMP on GPU
+ * signal
+ * B: read CS_TIMESTAMP from CPU
+ *
+ * Completion latency: B - A
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct signal_cb cb = { .seen = false };
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* first store marks "started"; the second (A) replaces it after the poll */
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ /* NOTE(review): the return value is ignored; the spin below relies on the callback running */
+ dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+
+ /* wait until the request is executing */
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* release the request and busy-wait for its fence callback */
+ preempt_disable();
+ semaphore_set(sema, i);
+ while (!READ_ONCE(cb.seen))
+ cpu_relax();
+
+ /* B: CPU-side timestamp once the callback has been seen */
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ preempt_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+ elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: completion latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Register as an RPS waiter, take all forcewake domains and, under the rps
+ * lock, request gt->rps.max_freq. Paired with rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+/* Undo rps_pin(): release forcewake and drop the RPS waiter count. */
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+/*
+ * Zero the engine's heartbeat interval, then take an engine-pm reference
+ * and park the heartbeat. Paired with engine_heartbeat_enable().
+ */
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+ engine->props.heartbeat_interval_ms = 0;
+
+ intel_engine_pm_get(engine);
+ intel_engine_park_heartbeat(engine);
+}
+
+/*
+ * Undo engine_heartbeat_disable(): drop the engine-pm reference and restore
+ * the heartbeat interval from the engine's defaults.
+ */
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+ intel_engine_pm_put(engine);
+
+ engine->props.heartbeat_interval_ms =
+ engine->defaults.heartbeat_interval_ms;
+}
+
+/*
+ * Selftest entry point: run every latency measurement on each uabi engine.
+ *
+ * For each engine a fresh context is created and pinned, the heartbeat is
+ * disabled and the frequency pinned via rps_pin() while the measure_*()
+ * helpers run in sequence; the first failure stops the run. Returns 0 on
+ * success (or when the device is older than gen8), a negative error code
+ * otherwise, and -EIO if the final igt_flush_test() fails.
+ */
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			/* propagate the failure rather than returning a stale 0 */
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2805,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
2020-05-18 14:39 [Intel-gfx] [PATCH v2] " Chris Wilson
@ 2020-05-18 16:22 ` Chris Wilson
2020-05-19 11:41 ` Chris Wilson
1 sibling, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-18 16:22 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
v2: Refactor all the instruction building into emitters.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 767 ++++++++++++++++++
1 file changed, 767 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..3e8169c9b081 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
+#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
@@ -1524,6 +1528,768 @@ struct perf_series {
struct intel_context *ce[];
};
+/*
+ * sort() comparator for u32 samples: returns a negative value, zero or a
+ * positive value when *A is less than, equal to or greater than *B.
+ *
+ * Compare explicitly instead of returning *a - *b: that u32 difference is
+ * converted to int, so it reports the wrong sign whenever the two samples
+ * are 2^31 or more apart.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+
+	return *a > *b;
+}
+
+/*
+ * Reduce TF_COUNT samples to a single filtered value.
+ *
+ * The samples are sorted in place (the caller's array is reordered), a[0]
+ * and a[4] are ignored, and the middle three are combined as
+ * a[1] + 2 * a[2] + a[3]. The weights sum to 4, i.e. 1 << TF_BIAS, so the
+ * return value is the filtered sample scaled by that factor; callers undo
+ * the scaling with ">> TF_BIAS" or inside cycles_to_ns().
+ */
+static u32 trifilter(u32 *a)
+{
+ u64 sum;
+
+#define TF_COUNT 5
+ sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+ sum = mul_u32_u32(a[2], 2);
+ sum += a[1];
+ sum += a[3];
+
+ /* The scaled sum is returned as a u32, so it must still fit in one */
+ GEM_BUG_ON(sum > U32_MAX);
+ return sum;
+#define TF_BIAS 2
+}
+
+/*
+ * Convert a trifilter() result into nanoseconds.
+ *
+ * @cycles is expected to still carry the 1 << TF_BIAS scaling applied by
+ * trifilter(); the scaling is removed here, after the conversion to ns,
+ * with rounding to the closest value.
+ */
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+ u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+ return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
+}
+
+/*
+ * Write a four-dword MI_STORE_REGISTER_MEM command at @cs that copies the
+ * RING_TIMESTAMP register of @ce's engine to the GGTT address @offset.
+ *
+ * Returns the position in the ring immediately after the command.
+ */
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	const u32 reg =
+		i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+
+	cs[0] = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	cs[1] = reg;
+	cs[2] = offset;	/* destination address, low dword */
+	cs[3] = 0;	/* destination address, high dword */
+
+	return cs + 4;
+}
+
+/*
+ * Write a four-dword MI_STORE_DWORD_IMM command at @cs that stores @value
+ * to the GGTT address @offset.
+ *
+ * Returns the position in the ring immediately after the command.
+ */
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	cs[0] = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	cs[1] = offset;	/* destination address, low dword */
+	cs[2] = 0;	/* destination address, high dword */
+	cs[3] = value;
+
+	return cs + 4;
+}
+
+/*
+ * Write a four-dword MI_SEMAPHORE_WAIT command at @cs, in polling mode, that
+ * compares the dword at GGTT address @offset against @value using the
+ * comparison selected by @mode (one of the MI_SEMAPHORE_SAD_* flags).
+ *
+ * Returns the position in the ring immediately after the command.
+ */
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	const u32 cmd = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			mode;
+
+	cs[0] = cmd;
+	cs[1] = value;
+	cs[2] = offset;	/* semaphore address, low dword */
+	cs[3] = 0;	/* semaphore address, high dword */
+
+	return cs + 4;
+}
+
+/*
+ * Emit a semaphore poll using the MI_SEMAPHORE_SAD_EQ_SDD comparison between
+ * the dword at GGTT address @offset and @value. Note the argument order
+ * differs from emit_semaphore_poll().
+ */
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+ return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+/*
+ * Update the semaphore dword from the CPU: a single WRITE_ONCE() store
+ * followed by a write barrier.
+ */
+static void semaphore_set(u32 *sema, u32 value)
+{
+ WRITE_ONCE(*sema, value);
+ wmb(); /* flush the update to the cache, and beyond */
+}
+
+/*
+ * Zero and return the 21-dword scratch area inside the engine's status page
+ * that the latency measurements use for their semaphore (index 0) and
+ * timestamp slots.
+ *
+ * NOTE(review): the "+ 1000" is presumably in dwords (status_page.addr looks
+ * like a u32 pointer) - confirm against the status page layout.
+ */
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+ return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+/*
+ * Translate @dw, a CPU pointer into the engine's status page, into the GGTT
+ * address of the same dword: the GGTT offset of the status page vma plus
+ * the offset of @dw within its page.
+ */
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	const u32 page = i915_ggtt_offset(ce->engine->status_page.vma);
+
+	return page + offset_in_page(dw);
+}
+
+/*
+ * Measure and report (via pr_info) the semaphore response latency of @ce's
+ * engine.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. If the GPU does not respond within a timeout the GT
+ * is declared wedged and -EIO is returned.
+ */
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how many cycles it takes for the HW to detect the change
+ * in a semaphore value.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * poke semaphore
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Semaphore latency: B - A
+ */
+
+ semaphore_set(sema, -1);
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ /* 4 dwords for the initial store, 8 per (poll + timestamp) sample */
+ cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* Overwriting the -1 with 0 tells the CPU the request is executing */
+ cs = emit_store_dw(cs, offset, 0);
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ /* Keep the timestamp read and the poke back to back */
+ preempt_disable();
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ semaphore_set(sema, i);
+ preempt_enable();
+
+ /* sema[i] was zeroed by hwsp_scratch(); non-zero means B landed */
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i - 1] = sema[i] - cycles;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure and report (via pr_info) the latency of submitting a request to
+ * @ce while the GT is idle.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. If a timestamp does not appear within the timeout
+ * the GT is declared wedged and -EIO is returned.
+ */
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is idle, but is resting in our context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ */
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ int err;
+
+ /* Each sample must start from an idle GT */
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ /* elapsed[i] temporarily holds A, the CPU-side timestamp */
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ preempt_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* Replace A with the latency B - A */
+ elapsed[i] = sema[i] - elapsed[i];
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure and report (via pr_info) the latency of submitting a request to
+ * @ce while the previous request on @ce is still polling a semaphore.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. If a request fails to start within the timeout the
+ * GT is declared wedged and -EIO is returned.
+ */
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT + 1], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is busy, polling on a semaphore in our context. With
+ * direct submission, this will include the cost of a lite restore.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /*
+ * Request i marks sema[i] with -1 when it starts, waits for the
+ * semaphore to reach i, then overwrites sema[i] with B.
+ */
+ cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ /* Wait until the previous request has started (and is polling) */
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* Sample A, queue request i, then release request i - 1 */
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ semaphore_set(sema, i - 1);
+ preempt_enable();
+ }
+
+ /* Here i == ARRAY_SIZE(elapsed) + 1: release the final request too */
+ wait_for(READ_ONCE(sema[i - 1]), 500);
+ semaphore_set(sema, i - 1);
+
+ /*
+ * The semaphore is set to i right after the CPU timestamp is stored in
+ * elapsed[i], so sema[i] (B) pairs with elapsed[i] (A).
+ */
+ for (i = 1; i <= TF_COUNT; i++) {
+ GEM_BUG_ON(sema[i] == -1);
+ elapsed[i - 1] = sema[i] - elapsed[i];
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Queue a request on @engine's kernel context that consists solely of a
+ * semaphore poll on @sema (a dword inside the engine's status page) using
+ * comparison @mode against @value.
+ *
+ * Returns 0 once the request has been queued, or a negative error code.
+ */
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 ggtt_addr =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	int err = 0;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		err = PTR_ERR(cs);
+	} else {
+		cs = emit_semaphore_poll(cs, mode, value, ggtt_addr);
+		intel_ring_advance(rq, cs);
+	}
+
+	/* A created request must always be added, even on failure */
+	i915_request_add(rq);
+
+	return err;
+}
+
+/*
+ * Measure and report (via pr_info) the latency of advancing from one request
+ * to the next within @ce.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. On every error path the submit fence is committed
+ * and released and the plug semaphore is set, so that nothing queued so far
+ * is left waiting on either of them.
+ */
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 * A: read CS_TIMESTAMP on GPU
+	 * advance request
+	 * B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		err = -ENOMEM;
+		goto err_unplug;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			/* Must still commit and release the submit fence */
+			err = PTR_ERR(rq);
+			goto err_submit;
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_request_add(rq);
+			goto err_submit;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err_submit;
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	/* Release every queued request at once */
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	/* Pull the plug, then wait for the last timestamp to be written */
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[TF_COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[i + 1] - sema[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_submit:
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+err_unplug:
+	semaphore_set(sema, 1);
+	return err;
+}
+
+/*
+ * Measure and report (via pr_info) the latency of switching between @ce and
+ * the engine's kernel context.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. On every error path the reference held on the
+ * previous request is dropped and the plug semaphore is set, so that the
+ * requests queued so far are not left blocked behind the plug.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[TF_COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 * A: read CS_TIMESTAMP on GPU
+	 * switch context
+	 * B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				err = PTR_ERR(rq);
+				goto err_fence;
+			}
+
+			/* Chain each request behind the previous one */
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					goto err_fence;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				err = PTR_ERR(cs);
+				goto err_fence;
+			}
+
+			cs = emit_timestamp_store(cs, ce, addr);
+			addr += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	/* Pull the plug; here i == ARRAY_SIZE(elapsed) + 1 */
+	semaphore_set(sema, 1);
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	/* kernel context timestamp of pair i -> @ce timestamp of pair i + 1 */
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+	i915_request_put(fence);
+	semaphore_set(sema, 1);
+	return err;
+}
+
+/*
+ * Measure and report (via pr_info) two preemption latencies on @ce's engine;
+ * see the comment in the body. Engines without preemption are skipped.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. If the GPU does not respond within a timeout the GT
+ * is declared wedged and -EIO is returned.
+ */
+static int measure_preemption(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * We measure two latencies while triggering preemption. The first
+ * latency is how long it takes for us to submit a preempting request.
+ * The second latency is how long it takes for us to return from the
+ * preemption back to the original context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit preemption
+ * B: read CS_TIMESTAMP on GPU (in preempting context)
+ * context switch
+ * C: read CS_TIMESTAMP on GPU (in original context)
+ *
+ * Preemption dispatch latency: B - A
+ * Preemption switch latency: C - B
+ */
+
+ if (!intel_engine_has_preemption(ce->engine))
+ return 0;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ /* Sample i uses sema[2 * i] for B and sema[2 * i + 1] for C */
+ u32 addr = offset + 2 * i * sizeof(u32);
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* Mark the slot with -1 on start, poll, then record C */
+ cs = emit_store_dw(cs, addr, -1);
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ /* Wait for the request on @ce to be running before preempting */
+ if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ rq = i915_request_create(ce->engine->kernel_context);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 8);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* Record B over the -1 marker, then release the poll on @ce */
+ cs = emit_timestamp_store(cs, ce, addr);
+ cs = emit_store_dw(cs, offset, i);
+
+ intel_ring_advance(rq, cs);
+ rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+ /* elapsed[i - 1] temporarily holds A */
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ }
+
+ /* Here i == TF_COUNT + 1: wait for B of the last sample */
+ if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= TF_COUNT; i++)
+ elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ for (i = 1; i <= TF_COUNT; i++)
+ elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/* dma-fence callback node plus a flag recording that the callback ran */
+struct signal_cb {
+ struct dma_fence_cb base;
+ bool seen; /* set to true by signal_cb() */
+};
+
+/*
+ * dma-fence callback: recover the enclosing struct signal_cb from @cb and
+ * mark it as seen. @fence is unused.
+ */
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+ struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+ smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+/*
+ * Measure and report (via pr_info) how long it takes for the CPU to observe
+ * the completion of a request on @ce.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. If a request fails to start within the timeout the
+ * GT is declared wedged and -EIO is returned.
+ */
+static int measure_completion(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for the signal (interrupt) to be
+ * sent from the GPU to be processed by the CPU.
+ *
+ * A: read CS_TIMESTAMP on GPU
+ * signal
+ * B: read CS_TIMESTAMP from CPU
+ *
+ * Completion latency: B - A
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct signal_cb cb = { .seen = false };
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /*
+ * Both stores target sema[i]: the first makes it non-zero so the
+ * CPU can see the request has started, the second (after the
+ * poll) overwrites it with A.
+ */
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* Release the poll and spin until the fence callback has run */
+ preempt_disable();
+ semaphore_set(sema, i);
+ while (!READ_ONCE(cb.seen))
+ cpu_relax();
+
+ /* elapsed[i - 1] temporarily holds B, the CPU-side timestamp */
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ preempt_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+ elapsed[i] = elapsed[i] - sema[i + 1];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: completion latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Request the maximum GPU frequency for @gt: register as an RPS waiter,
+ * take all forcewake domains and set the frequency to rps.max_freq under
+ * the rps lock. Undone by rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+/*
+ * Undo rps_pin(): release the forcewake domains and drop the RPS waiter
+ * count, in the reverse order of acquisition.
+ */
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+/*
+ * Turn off the heartbeat on @engine for the duration of a measurement: zero
+ * the heartbeat interval, take an engine-pm reference and park the
+ * heartbeat. Paired with engine_heartbeat_enable().
+ */
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+ engine->props.heartbeat_interval_ms = 0;
+
+ intel_engine_pm_get(engine);
+ intel_engine_park_heartbeat(engine);
+}
+
+/*
+ * Undo engine_heartbeat_disable(): drop the engine-pm reference and restore
+ * the heartbeat interval to the engine's default value.
+ */
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
+{
+ intel_engine_pm_put(engine);
+
+ engine->props.heartbeat_interval_ms =
+ engine->defaults.heartbeat_interval_ms;
+}
+
+/*
+ * Selftest entry point: run every latency measurement on each uabi engine.
+ *
+ * @arg is the struct drm_i915_private under test. Platforms before gen8
+ * are skipped. For each engine a fresh context is created and pinned, the
+ * heartbeat is disabled and the GPU frequency pinned while the measurements
+ * run; the first failure stops the test.
+ *
+ * Returns 0 on success, the first negative error code encountered, or -EIO
+ * if the final igt_flush_test() fails.
+ */
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			/* Report the failure instead of returning success */
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2808,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
2020-05-18 14:39 [Intel-gfx] [PATCH v2] " Chris Wilson
2020-05-18 16:22 ` [Intel-gfx] [PATCH] " Chris Wilson
@ 2020-05-19 11:41 ` Chris Wilson
2020-05-19 12:47 ` Mika Kuoppala
1 sibling, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2020-05-19 11:41 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
v2: Refactor all the instruction building into emitters.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 779 ++++++++++++++++++
1 file changed, 779 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..db09e9cb54b8 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
+#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
@@ -1524,6 +1528,780 @@ struct perf_series {
struct intel_context *ce[];
};
+/*
+ * sort() comparator for u32 samples: returns a negative value, zero or a
+ * positive value when *A is less than, equal to or greater than *B.
+ *
+ * Compare explicitly instead of returning *a - *b: that u32 difference is
+ * converted to int, so it reports the wrong sign whenever the two samples
+ * are 2^31 or more apart.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+
+	return *a > *b;
+}
+
+/*
+ * Reduce TF_COUNT samples to a single filtered value.
+ *
+ * The samples are sorted in place (the caller's array is reordered), a[0]
+ * and a[4] are ignored, and the middle three are combined as
+ * a[1] + 2 * a[2] + a[3]. The weights sum to 4, i.e. 1 << TF_BIAS, so the
+ * return value is the filtered sample scaled by that factor; callers undo
+ * the scaling with ">> TF_BIAS" or inside cycles_to_ns().
+ */
+static u32 trifilter(u32 *a)
+{
+ u64 sum;
+
+#define TF_COUNT 5
+ sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
+
+ sum = mul_u32_u32(a[2], 2);
+ sum += a[1];
+ sum += a[3];
+
+ /* The scaled sum is returned as a u32, so it must still fit in one */
+ GEM_BUG_ON(sum > U32_MAX);
+ return sum;
+#define TF_BIAS 2
+}
+
+/*
+ * Convert a trifilter() result into nanoseconds.
+ *
+ * @cycles is expected to still carry the 1 << TF_BIAS scaling applied by
+ * trifilter(); the scaling is removed here, after the conversion to ns,
+ * with rounding to the closest value.
+ */
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+ u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+ return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
+}
+
+/*
+ * Write a four-dword MI_STORE_REGISTER_MEM command at @cs that copies the
+ * RING_TIMESTAMP register of @ce's engine to the GGTT address @offset.
+ *
+ * Returns the position in the ring immediately after the command.
+ */
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
+{
+	const u32 reg =
+		i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
+
+	cs[0] = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	cs[1] = reg;
+	cs[2] = offset;	/* destination address, low dword */
+	cs[3] = 0;	/* destination address, high dword */
+
+	return cs + 4;
+}
+
+/*
+ * Write a four-dword MI_STORE_DWORD_IMM command at @cs that stores @value
+ * to the GGTT address @offset.
+ *
+ * Returns the position in the ring immediately after the command.
+ */
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
+{
+	cs[0] = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	cs[1] = offset;	/* destination address, low dword */
+	cs[2] = 0;	/* destination address, high dword */
+	cs[3] = value;
+
+	return cs + 4;
+}
+
+/*
+ * Write a four-dword MI_SEMAPHORE_WAIT command at @cs, in polling mode, that
+ * compares the dword at GGTT address @offset against @value using the
+ * comparison selected by @mode (one of the MI_SEMAPHORE_SAD_* flags).
+ *
+ * Returns the position in the ring immediately after the command.
+ */
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
+{
+	const u32 cmd = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			mode;
+
+	cs[0] = cmd;
+	cs[1] = value;
+	cs[2] = offset;	/* semaphore address, low dword */
+	cs[3] = 0;	/* semaphore address, high dword */
+
+	return cs + 4;
+}
+
+/*
+ * Emit a semaphore poll using the MI_SEMAPHORE_SAD_EQ_SDD comparison between
+ * the dword at GGTT address @offset and @value. Note the argument order
+ * differs from emit_semaphore_poll().
+ */
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
+{
+ return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
+}
+
+/*
+ * Update the semaphore dword from the CPU: a single WRITE_ONCE() store
+ * followed by a write barrier.
+ */
+static void semaphore_set(u32 *sema, u32 value)
+{
+ WRITE_ONCE(*sema, value);
+ wmb(); /* flush the update to the cache, and beyond */
+}
+
+/*
+ * Zero and return the 21-dword scratch area inside the engine's status page
+ * that the latency measurements use for their semaphore (index 0) and
+ * timestamp slots.
+ *
+ * NOTE(review): the "+ 1000" is presumably in dwords (status_page.addr looks
+ * like a u32 pointer) - confirm against the status page layout.
+ */
+static u32 *hwsp_scratch(const struct intel_context *ce)
+{
+ return memset32(ce->engine->status_page.addr + 1000, 0, 21);
+}
+
+/*
+ * Translate @dw, a CPU pointer into the engine's status page, into the GGTT
+ * address of the same dword: the GGTT offset of the status page vma plus
+ * the offset of @dw within its page.
+ */
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
+{
+	const u32 page = i915_ggtt_offset(ce->engine->status_page.vma);
+
+	return page + offset_in_page(dw);
+}
+
+/*
+ * Measure and report (via pr_info) the semaphore response latency of @ce's
+ * engine.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. If the GPU does not respond within a timeout the GT
+ * is declared wedged and -EIO is returned.
+ */
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how many cycles it takes for the HW to detect the change
+ * in a semaphore value.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * poke semaphore
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Semaphore latency: B - A
+ */
+
+ semaphore_set(sema, -1);
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ /* 4 dwords for the initial store, 12 per (poll + timestamp + ack) */
+ cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* Overwriting the -1 with 0 tells the CPU the request is executing */
+ cs = emit_store_dw(cs, offset, 0);
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+ /* Reset the semaphore to 0 to acknowledge sample i */
+ cs = emit_store_dw(cs, offset, 0);
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ /* Keep the timestamp read and the poke back to back */
+ preempt_disable();
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ semaphore_set(sema, i);
+ preempt_enable();
+
+ /* The GPU writes 0 back only after it has stored B in sema[i] */
+ if (wait_for(READ_ONCE(*sema) == 0, 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i - 1] = sema[i] - cycles;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure and report (via pr_info) the latency of submitting a request to
+ * @ce while the GT is idle.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. If the GT fails to idle after the last submission
+ * it is declared wedged and that error is returned.
+ */
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int err;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is idle, but is resting in our context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ */
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ /* Each sample must start from an idle GT */
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ /* elapsed[i] temporarily holds A, the CPU-side timestamp */
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ preempt_enable();
+ }
+
+ /* Wait for the GT to idle before reading the timestamps back */
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+ }
+
+ /* Replace A with the latency B - A */
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+ elapsed[i] = sema[i] - elapsed[i];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure and report (via pr_info) the latency of submitting a request to
+ * @ce while the previous request on @ce is still polling a semaphore.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. If a request fails to start within the timeout the
+ * GT is declared wedged and -EIO is returned.
+ */
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT + 1], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * Measure how long it takes for us to submit a request while the
+ * engine is busy, polling on a semaphore in our context. With
+ * direct submission, this will include the cost of a lite restore.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit request
+ * B: read CS_TIMESTAMP on GPU
+ *
+ * Submission latency: B - A
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /*
+ * Request i marks sema[i] with -1 when it starts, waits for the
+ * semaphore to reach i, then overwrites sema[i] with B.
+ */
+ cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+
+ /* Wait until the previous request has started (and is polling) */
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* Sample A, queue request i, then release request i - 1 */
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ semaphore_set(sema, i - 1);
+ preempt_enable();
+ }
+
+ /* Here i == ARRAY_SIZE(elapsed) + 1: release the final request too */
+ wait_for(READ_ONCE(sema[i - 1]), 500);
+ semaphore_set(sema, i - 1);
+
+ /*
+ * The semaphore is set to i right after the CPU timestamp is stored in
+ * elapsed[i], so sema[i] (B) pairs with elapsed[i] (A).
+ */
+ for (i = 1; i <= TF_COUNT; i++) {
+ GEM_BUG_ON(sema[i] == -1);
+ elapsed[i - 1] = sema[i] - elapsed[i];
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Queue a request on @engine's kernel context that consists solely of a
+ * semaphore poll on @sema (a dword inside the engine's status page) using
+ * comparison @mode against @value.
+ *
+ * Returns 0 once the request has been queued, or a negative error code.
+ */
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+	const u32 ggtt_addr =
+		i915_ggtt_offset(engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *rq;
+	int err = 0;
+	u32 *cs;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		err = PTR_ERR(cs);
+	} else {
+		cs = emit_semaphore_poll(cs, mode, value, ggtt_addr);
+		intel_ring_advance(rq, cs);
+	}
+
+	/* A created request must always be added, even on failure */
+	i915_request_add(rq);
+
+	return err;
+}
+
+/*
+ * Measure and report (via pr_info) the latency of advancing from one request
+ * to the next within @ce.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. On every error path inside the submission loop the
+ * submit fence is committed and released and the plug semaphore is set, so
+ * that nothing queued so far is left waiting on either of them.
+ */
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	u32 elapsed[TF_COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	/*
+	 * Measure how long it takes to advance from one request into the
+	 * next. Between each request we flush the GPU caches to memory,
+	 * update the breadcrumbs, and then invalidate those caches.
+	 * We queue up all the requests to be submitted in one batch so
+	 * it should be one set of contiguous measurements.
+	 *
+	 * A: read CS_TIMESTAMP on GPU
+	 * advance request
+	 * B: read CS_TIMESTAMP on GPU
+	 *
+	 * Request latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		err = -ENOMEM;
+		goto err_unplug;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			/* Must still commit and release the submit fence */
+			err = PTR_ERR(rq);
+			goto err_submit;
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_request_add(rq);
+			goto err_submit;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err_submit;
+		}
+
+		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	/* Release every queued request at once */
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	/* Pull the plug, then wait for the GT to idle before reading back */
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[i + 1] - sema[i];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: inter-request latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_submit:
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+err_unplug:
+	semaphore_set(sema, 1);
+	return err;
+}
+
+/*
+ * Measure and report (via pr_info) the latency of switching between @ce and
+ * the engine's kernel context.
+ *
+ * Returns the result of the final intel_gt_wait_for_idle(), or an earlier
+ * negative error code. On every error path inside the submission loop the
+ * reference held on the previous request is dropped and the plug semaphore
+ * is set, so that the requests queued so far are not left blocked behind
+ * the plug.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = hwsp_scratch(ce);
+	const u32 offset = hwsp_offset(ce, sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[TF_COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	/*
+	 * Measure how long it takes to advance from one request in one
+	 * context to a request in another context. This allows us to
+	 * measure how long the context save/restore take, along with all
+	 * the inter-context setup we require.
+	 *
+	 * A: read CS_TIMESTAMP on GPU
+	 * switch context
+	 * B: read CS_TIMESTAMP on GPU
+	 *
+	 * Context switch latency: B - A
+	 */
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				err = PTR_ERR(rq);
+				goto err_fence;
+			}
+
+			/* Chain each request behind the previous one */
+			if (fence) {
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					goto err_fence;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				err = PTR_ERR(cs);
+				goto err_fence;
+			}
+
+			cs = emit_timestamp_store(cs, ce, addr);
+			addr += sizeof(u32);
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	/* Pull the plug, then wait for the GT to idle before reading back */
+	semaphore_set(sema, 1);
+	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+	if (err) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return err;
+	}
+
+	/* kernel context timestamp of pair i -> @ce timestamp of pair i + 1 */
+	for (i = 1; i <= TF_COUNT; i++)
+		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
+
+	cycles = trifilter(elapsed);
+	pr_info("%s: context switch latency %d cycles, %lluns\n",
+		ce->engine->name, cycles >> TF_BIAS,
+		cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+	i915_request_put(fence);
+	semaphore_set(sema, 1);
+	return err;
+}
+
+static int measure_preemption(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ /*
+ * We measure two latencies while triggering preemption. The first
+ * latency is how long it takes for us to submit a preempting request.
+ * The second latency is how long it takes for us to return from the
+ * preemption back to the original context.
+ *
+ * A: read CS_TIMESTAMP from CPU
+ * submit preemption
+ * B: read CS_TIMESTAMP on GPU (in preempting context)
+ * context switch
+ * C: read CS_TIMESTAMP on GPU (in original context)
+ *
+ * Preemption dispatch latency: B - A
+ * Preemption switch latency: C - B
+ */
+
+ if (!intel_engine_has_preemption(ce->engine))
+ return 0;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ u32 addr = offset + 2 * i * sizeof(u32);
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ cs = emit_store_dw(cs, addr, -1);
+ cs = emit_semaphore_poll_until(cs, offset, i);
+ cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ rq = i915_request_create(ce->engine->kernel_context);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 8);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ cs = emit_timestamp_store(cs, ce, addr);
+ cs = emit_store_dw(cs, offset, i);
+
+ intel_ring_advance(rq, cs);
+ rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ }
+
+ if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= TF_COUNT; i++)
+ elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ for (i = 1; i <= TF_COUNT; i++)
+ elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: preemption switch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+struct signal_cb { /* dma-fence callback node plus a flag recording that it ran */
+ struct dma_fence_cb base; /* node passed to dma_fence_add_callback() */
+ bool seen; /* set to true by signal_cb(), polled by measure_completion() */
+};
+
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) /* fence callback; fence itself is unused */
+{
+ struct signal_cb *s = container_of(cb, typeof(*s), base); /* recover the wrapper from its embedded base */
+
+ smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+static int measure_completion(struct intel_context *ce)
+{
+ u32 *sema = hwsp_scratch(ce);
+ const u32 offset = hwsp_offset(ce, sema);
+ u32 elapsed[TF_COUNT], cycles;
+ u32 *cs;
+ int err;
+ int i;
+
+ /*
+ * Measure how long it takes for the signal (interrupt) to be
+ * sent from the GPU to be processed by the CPU.
+ *
+ * A: read CS_TIMESTAMP on GPU
+ * signal
+ * B: read CS_TIMESTAMP from CPU
+ *
+ * Completion latency: B - A
+ *
+ * Scratch usage for iteration i (dword indices into sema[]):
+ * sema[0] semaphore the request polls; set to i by the CPU
+ * sema[i] set to -1 by the request, then overwritten with A
+ *
+ * The fence callback (signal_cb) sets cb.seen; the CPU spins on it
+ * with preemption disabled and then samples the engine timestamp
+ * as B.
+ *
+ * Returns -EIO (after wedging the GT) if the request does not start
+ * within the wait_for() timeout, an error from building the request
+ * or from waiting for idle, and otherwise the result of the final
+ * intel_gt_wait_for_idle().
+ */
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct signal_cb cb = { .seen = false };
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12); /* room for the three emitters below */
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); /* mark "request is running" */
+ cs = emit_semaphore_poll_until(cs, offset, i); /* stall until sema[0] == i */
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); /* A, replaces the -1 marker */
+
+ intel_ring_advance(rq, cs);
+
+ dma_fence_add_callback(&rq->fence, &cb.base, signal_cb); /* cb.seen becomes true when the callback runs */
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+
+ if (wait_for(READ_ONCE(sema[i]) == -1, 50)) { /* request has started executing */
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ preempt_disable();
+ semaphore_set(sema, i); /* let the request proceed to its timestamp store */
+ while (!READ_ONCE(cb.seen)) /* spin until signal_cb() has run; no timeout */
+ cpu_relax();
+
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); /* B */
+ preempt_enable();
+ }
+
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return err;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ GEM_BUG_ON(sema[i + 1] == -1); /* the -1 marker must have been replaced by A */
+ elapsed[i] = elapsed[i] - sema[i + 1]; /* B - A */
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: completion latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> TF_BIAS,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Hold the GPU at its maximum frequency for the duration of a measurement:
+ * bump the RPS waiter count, take every forcewake domain, and request
+ * rps.max_freq under rps.lock. Balanced by rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+/*
+ * Undo rps_pin(): release the forcewake domains and drop the RPS waiter
+ * count, in the reverse order of acquisition.
+ */
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static void engine_heartbeat_disable(struct intel_engine_cs *engine) /* undone by engine_heartbeat_enable() */
+{
+ engine->props.heartbeat_interval_ms = 0; /* zero the interval before parking the heartbeat */
+
+ intel_engine_pm_get(engine); /* reference is dropped in engine_heartbeat_enable() */
+ intel_engine_park_heartbeat(engine);
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine) /* counterpart of engine_heartbeat_disable() */
+{
+ intel_engine_pm_put(engine); /* drop the reference taken in engine_heartbeat_disable() */
+
+ engine->props.heartbeat_interval_ms = /* restore the interval from the engine defaults */
+ engine->defaults.heartbeat_interval_ms;
+}
+
+/*
+ * Selftest entry point: run every latency measurement on each uabi engine.
+ *
+ * For each engine a fresh context is created and pinned, the heartbeat is
+ * disabled and the frequency pinned to max, then the measure_*() helpers
+ * run in order until one fails. Per-engine state is restored before moving
+ * on. The first error stops the walk.
+ *
+ * @arg: the struct drm_i915_private under test.
+ *
+ * Returns 0 on success (or when the device is older than gen8), a negative
+ * error code from context setup or a measurement, or -EIO if the final
+ * igt_flush_test() fails.
+ */
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			/* Report the failure; err would otherwise still be 0 */
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		/* Each step only runs if every previous one succeeded */
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+		if (err == 0)
+			err = measure_preemption(ce);
+		if (err == 0)
+			err = measure_completion(ce);
+
+		/* Unwind in reverse order of setup, even on failure */
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2820,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
2020-05-19 11:41 ` Chris Wilson
@ 2020-05-19 12:47 ` Mika Kuoppala
2020-05-19 12:56 ` Chris Wilson
0 siblings, 1 reply; 13+ messages in thread
From: Mika Kuoppala @ 2020-05-19 12:47 UTC (permalink / raw)
To: Chris Wilson, intel-gfx; +Cc: Chris Wilson
Chris Wilson <chris@chris-wilson.co.uk> writes:
> A useful metric of the system's health is how fast we can tell the GPU
> to do various actions, so measure our latency.
>
> v2: Refactor all the instruction building into emitters.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Not much nitpicking left. Could have used one goto in the fence
using tests on error paths but meh.
Lots of tests poking hw from different angles.
With the clear comments, it is like a guided tour of our
submission/scheduling front.
Analyzing the differences between different sets will
be interesting.
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
> drivers/gpu/drm/i915/selftests/i915_request.c | 779 ++++++++++++++++++
> 1 file changed, 779 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
> index 6014e8dfcbb1..db09e9cb54b8 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_request.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_request.c
> @@ -24,16 +24,20 @@
>
> #include <linux/prime_numbers.h>
> #include <linux/pm_qos.h>
> +#include <linux/sort.h>
>
> #include "gem/i915_gem_pm.h"
> #include "gem/selftests/mock_context.h"
>
> +#include "gt/intel_engine_heartbeat.h"
> #include "gt/intel_engine_pm.h"
> #include "gt/intel_engine_user.h"
> #include "gt/intel_gt.h"
> +#include "gt/intel_gt_requests.h"
>
> #include "i915_random.h"
> #include "i915_selftest.h"
> +#include "igt_flush_test.h"
> #include "igt_live_test.h"
> #include "igt_spinner.h"
> #include "lib_sw_fence.h"
> @@ -1524,6 +1528,780 @@ struct perf_series {
> struct intel_context *ce[];
> };
>
> +static int cmp_u32(const void *A, const void *B)
> +{
> + const u32 *a = A, *b = B;
> +
> + return *a - *b;
> +}
> +
> +static u32 trifilter(u32 *a)
> +{
> + u64 sum;
> +
> +#define TF_COUNT 5
> + sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
> +
> + sum = mul_u32_u32(a[2], 2);
> + sum += a[1];
> + sum += a[3];
> +
> + GEM_BUG_ON(sum > U32_MAX);
> + return sum;
> +#define TF_BIAS 2
> +}
> +
> +static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
> +{
> + u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
> +
> + return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
> +}
> +
> +static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
> +{
> + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
> + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
> + *cs++ = offset;
> + *cs++ = 0;
> +
> + return cs;
> +}
> +
> +static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
> +{
> + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> + *cs++ = offset;
> + *cs++ = 0;
> + *cs++ = value;
> +
> + return cs;
> +}
> +
> +static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
> +{
> + *cs++ = MI_SEMAPHORE_WAIT |
> + MI_SEMAPHORE_GLOBAL_GTT |
> + MI_SEMAPHORE_POLL |
> + mode;
> + *cs++ = value;
> + *cs++ = offset;
> + *cs++ = 0;
> +
> + return cs;
> +}
> +
> +static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
> +{
> + return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
> +}
> +
> +static void semaphore_set(u32 *sema, u32 value)
> +{
> + WRITE_ONCE(*sema, value);
> + wmb(); /* flush the update to the cache, and beyond */
> +}
> +
> +static u32 *hwsp_scratch(const struct intel_context *ce)
> +{
> + return memset32(ce->engine->status_page.addr + 1000, 0, 21);
> +}
> +
> +static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
> +{
> + return (i915_ggtt_offset(ce->engine->status_page.vma) +
> + offset_in_page(dw));
> +}
> +
> +static int measure_semaphore_response(struct intel_context *ce)
> +{
> + u32 *sema = hwsp_scratch(ce);
> + const u32 offset = hwsp_offset(ce, sema);
> + u32 elapsed[TF_COUNT], cycles;
> + struct i915_request *rq;
> + u32 *cs;
> + int i;
> +
> + /*
> + * Measure how many cycles it takes for the HW to detect the change
> + * in a semaphore value.
> + *
> + * A: read CS_TIMESTAMP from CPU
> + * poke semaphore
> + * B: read CS_TIMESTAMP on GPU
> + *
> + * Semaphore latency: B - A
> + */
> +
> + semaphore_set(sema, -1);
> +
> + rq = i915_request_create(ce);
> + if (IS_ERR(rq))
> + return PTR_ERR(rq);
> +
> + cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_store_dw(cs, offset, 0);
> + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> + cs = emit_semaphore_poll_until(cs, offset, i);
> + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> + cs = emit_store_dw(cs, offset, 0);
> + }
> +
> + intel_ring_advance(rq, cs);
> + i915_request_add(rq);
> +
> + if (wait_for(READ_ONCE(*sema) == 0, 50)) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return -EIO;
> + }
> +
> + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> + preempt_disable();
> + cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> + semaphore_set(sema, i);
> + preempt_enable();
> +
> + if (wait_for(READ_ONCE(*sema) == 0, 50)) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return -EIO;
> + }
> +
> + elapsed[i - 1] = sema[i] - cycles;
> + }
> +
> + cycles = trifilter(elapsed);
> + pr_info("%s: semaphore response %d cycles, %lluns\n",
> + ce->engine->name, cycles >> TF_BIAS,
> + cycles_to_ns(ce->engine, cycles));
> +
> + return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_idle_dispatch(struct intel_context *ce)
> +{
> + u32 *sema = hwsp_scratch(ce);
> + const u32 offset = hwsp_offset(ce, sema);
> + u32 elapsed[TF_COUNT], cycles;
> + u32 *cs;
> + int err;
> + int i;
> +
> + /*
> + * Measure how long it takes for us to submit a request while the
> + * engine is idle, but is resting in our context.
> + *
> + * A: read CS_TIMESTAMP from CPU
> + * submit request
> + * B: read CS_TIMESTAMP on GPU
> + *
> + * Submission latency: B - A
> + */
> +
> + for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
> + struct i915_request *rq;
> +
> + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> + if (err)
> + return err;
> +
> + rq = i915_request_create(ce);
> + if (IS_ERR(rq))
> + return PTR_ERR(rq);
> +
> + cs = intel_ring_begin(rq, 4);
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> + intel_ring_advance(rq, cs);
> +
> + preempt_disable();
> + local_bh_disable();
> + elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> + i915_request_add(rq);
> + local_bh_enable();
> + preempt_enable();
> + }
> +
> + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> + if (err) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return err;
> + }
> +
> + for (i = 0; i < ARRAY_SIZE(elapsed); i++)
> + elapsed[i] = sema[i] - elapsed[i];
> +
> + cycles = trifilter(elapsed);
> + pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
> + ce->engine->name, cycles >> TF_BIAS,
> + cycles_to_ns(ce->engine, cycles));
> +
> + return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_busy_dispatch(struct intel_context *ce)
> +{
> + u32 *sema = hwsp_scratch(ce);
> + const u32 offset = hwsp_offset(ce, sema);
> + u32 elapsed[TF_COUNT + 1], cycles;
> + u32 *cs;
> + int i;
> +
> + /*
> + * Measure how long it takes for us to submit a request while the
> + * engine is busy, polling on a semaphore in our context. With
> + * direct submission, this will include the cost of a lite restore.
> + *
> + * A: read CS_TIMESTAMP from CPU
> + * submit request
> + * B: read CS_TIMESTAMP on GPU
> + *
> + * Submission latency: B - A
> + */
> +
> + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> + struct i915_request *rq;
> +
> + rq = i915_request_create(ce);
> + if (IS_ERR(rq))
> + return PTR_ERR(rq);
> +
> + cs = intel_ring_begin(rq, 12);
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
> + cs = emit_semaphore_poll_until(cs, offset, i);
> + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> + intel_ring_advance(rq, cs);
> +
> + if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return -EIO;
> + }
> +
> + preempt_disable();
> + local_bh_disable();
> + elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> + i915_request_add(rq);
> + local_bh_enable();
> + semaphore_set(sema, i - 1);
> + preempt_enable();
> + }
> +
> + wait_for(READ_ONCE(sema[i - 1]), 500);
> + semaphore_set(sema, i - 1);
> +
> + for (i = 1; i <= TF_COUNT; i++) {
> + GEM_BUG_ON(sema[i] == -1);
> + elapsed[i - 1] = sema[i] - elapsed[i];
> + }
> +
> + cycles = trifilter(elapsed);
> + pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
> + ce->engine->name, cycles >> TF_BIAS,
> + cycles_to_ns(ce->engine, cycles));
> +
> + return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
> +{
> + const u32 offset =
> + i915_ggtt_offset(engine->status_page.vma) +
> + offset_in_page(sema);
> + struct i915_request *rq;
> + u32 *cs;
> +
> + rq = i915_request_create(engine->kernel_context);
> + if (IS_ERR(rq))
> + return PTR_ERR(rq);
> +
> + cs = intel_ring_begin(rq, 4);
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_semaphore_poll(cs, mode, value, offset);
> +
> + intel_ring_advance(rq, cs);
> + i915_request_add(rq);
> +
> + return 0;
> +}
> +
> +static int measure_inter_request(struct intel_context *ce)
> +{
> + u32 *sema = hwsp_scratch(ce);
> + const u32 offset = hwsp_offset(ce, sema);
> + u32 elapsed[TF_COUNT + 1], cycles;
> + struct i915_sw_fence *submit;
> + int i, err;
> +
> + /*
> + * Measure how long it takes to advance from one request into the
> + * next. Between each request we flush the GPU caches to memory,
> + * update the breadcrumbs, and then invalidate those caches.
> + * We queue up all the requests to be submitted in one batch so
> + * it should be one set of contiguous measurements.
> + *
> + * A: read CS_TIMESTAMP on GPU
> + * advance request
> + * B: read CS_TIMESTAMP on GPU
> + *
> + * Request latency: B - A
> + */
> +
> + err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
> + if (err)
> + return err;
> +
> + submit = heap_fence_create(GFP_KERNEL);
> + if (!submit) {
> + semaphore_set(sema, 1);
> + return -ENOMEM;
> + }
> +
> + intel_engine_flush_submission(ce->engine);
> + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> + struct i915_request *rq;
> + u32 *cs;
> +
> + rq = i915_request_create(ce);
> + if (IS_ERR(rq)) {
> + semaphore_set(sema, 1);
> + return PTR_ERR(rq);
> + }
> +
> + err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> + submit,
> + GFP_KERNEL);
> + if (err < 0) {
> + i915_sw_fence_commit(submit);
> + heap_fence_put(submit);
> + i915_request_add(rq);
> + semaphore_set(sema, 1);
> + return err;
> + }
> +
> + cs = intel_ring_begin(rq, 4);
> + if (IS_ERR(cs)) {
> + i915_sw_fence_commit(submit);
> + heap_fence_put(submit);
> + i915_request_add(rq);
> + semaphore_set(sema, 1);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> + intel_ring_advance(rq, cs);
> + i915_request_add(rq);
> + }
> + local_bh_disable();
> + i915_sw_fence_commit(submit);
> + local_bh_enable();
> + intel_engine_flush_submission(ce->engine);
> + heap_fence_put(submit);
> +
> + semaphore_set(sema, 1);
> + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> + if (err) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return err;
> + }
> +
> + for (i = 1; i <= TF_COUNT; i++)
> + elapsed[i - 1] = sema[i + 1] - sema[i];
> +
> + cycles = trifilter(elapsed);
> + pr_info("%s: inter-request latency %d cycles, %lluns\n",
> + ce->engine->name, cycles >> TF_BIAS,
> + cycles_to_ns(ce->engine, cycles));
> +
> + return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_context_switch(struct intel_context *ce)
> +{
> + u32 *sema = hwsp_scratch(ce);
> + const u32 offset = hwsp_offset(ce, sema);
> + struct i915_request *fence = NULL;
> + u32 elapsed[TF_COUNT + 1], cycles;
> + int i, j, err;
> + u32 *cs;
> +
> + /*
> + * Measure how long it takes to advance from one request in one
> + * context to a request in another context. This allows us to
> + * measure how long the context save/restore take, along with all
> + * the inter-context setup we require.
> + *
> + * A: read CS_TIMESTAMP on GPU
> + * switch context
> + * B: read CS_TIMESTAMP on GPU
> + *
> + * Context switch latency: B - A
> + */
> +
> + err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
> + if (err)
> + return err;
> +
> + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> + struct intel_context *arr[] = {
> + ce, ce->engine->kernel_context
> + };
> + u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
> +
> + for (j = 0; j < ARRAY_SIZE(arr); j++) {
> + struct i915_request *rq;
> +
> + rq = i915_request_create(arr[j]);
> + if (IS_ERR(rq))
> + return PTR_ERR(rq);
> +
> + if (fence) {
> + err = i915_request_await_dma_fence(rq,
> + &fence->fence);
> + if (err) {
> + i915_request_add(rq);
> + return err;
> + }
> + }
> +
> + cs = intel_ring_begin(rq, 4);
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_timestamp_store(cs, ce, addr);
> + addr += sizeof(u32);
> +
> + intel_ring_advance(rq, cs);
> +
> + i915_request_put(fence);
> + fence = i915_request_get(rq);
> +
> + i915_request_add(rq);
> + }
> + }
> + i915_request_put(fence);
> + intel_engine_flush_submission(ce->engine);
> +
> + semaphore_set(sema, 1);
> + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> + if (err) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return err;
> + }
> +
> + for (i = 1; i <= TF_COUNT; i++)
> + elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
> +
> + cycles = trifilter(elapsed);
> + pr_info("%s: context switch latency %d cycles, %lluns\n",
> + ce->engine->name, cycles >> TF_BIAS,
> + cycles_to_ns(ce->engine, cycles));
> +
> + return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static int measure_preemption(struct intel_context *ce)
> +{
> + u32 *sema = hwsp_scratch(ce);
> + const u32 offset = hwsp_offset(ce, sema);
> + u32 elapsed[TF_COUNT], cycles;
> + u32 *cs;
> + int i;
> +
> + /*
> + * We measure two latencies while triggering preemption. The first
> + * latency is how long it takes for us to submit a preempting request.
> + * The second latency is how it takes for us to return from the
> + * preemption back to the original context.
> + *
> + * A: read CS_TIMESTAMP from CPU
> + * submit preemption
> + * B: read CS_TIMESTAMP on GPU (in preempting context)
> + * context switch
> + * C: read CS_TIMESTAMP on GPU (in original context)
> + *
> + * Preemption dispatch latency: B - A
> + * Preemption switch latency: C - B
> + */
> +
> + if (!intel_engine_has_preemption(ce->engine))
> + return 0;
> +
> + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> + u32 addr = offset + 2 * i * sizeof(u32);
> + struct i915_request *rq;
> +
> + rq = i915_request_create(ce);
> + if (IS_ERR(rq))
> + return PTR_ERR(rq);
> +
> + cs = intel_ring_begin(rq, 12);
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_store_dw(cs, addr, -1);
> + cs = emit_semaphore_poll_until(cs, offset, i);
> + cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
> +
> + intel_ring_advance(rq, cs);
> + i915_request_add(rq);
> +
> + if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return -EIO;
> + }
> +
> + rq = i915_request_create(ce->engine->kernel_context);
> + if (IS_ERR(rq))
> + return PTR_ERR(rq);
> +
> + cs = intel_ring_begin(rq, 8);
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_timestamp_store(cs, ce, addr);
> + cs = emit_store_dw(cs, offset, i);
> +
> + intel_ring_advance(rq, cs);
> + rq->sched.attr.priority = I915_PRIORITY_BARRIER;
> +
> + elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> + i915_request_add(rq);
> + }
> +
> + if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return -EIO;
> + }
> +
> + for (i = 1; i <= TF_COUNT; i++)
> + elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
> +
> + cycles = trifilter(elapsed);
> + pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
> + ce->engine->name, cycles >> TF_BIAS,
> + cycles_to_ns(ce->engine, cycles));
> +
> + for (i = 1; i <= TF_COUNT; i++)
> + elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
> +
> + cycles = trifilter(elapsed);
> + pr_info("%s: preemption switch latency %d cycles, %lluns\n",
> + ce->engine->name, cycles >> TF_BIAS,
> + cycles_to_ns(ce->engine, cycles));
> +
> + return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +struct signal_cb {
> + struct dma_fence_cb base;
> + bool seen;
> +};
> +
> +static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> +{
> + struct signal_cb *s = container_of(cb, typeof(*s), base);
> +
> + smp_store_mb(s->seen, true); /* be safe, be strong */
> +}
> +
> +static int measure_completion(struct intel_context *ce)
> +{
> + u32 *sema = hwsp_scratch(ce);
> + const u32 offset = hwsp_offset(ce, sema);
> + u32 elapsed[TF_COUNT], cycles;
> + u32 *cs;
> + int err;
> + int i;
> +
> + /*
> + * Measure how long it takes for the signal (interrupt) to be
> + * sent from the GPU to be processed by the CPU.
> + *
> + * A: read CS_TIMESTAMP on GPU
> + * signal
> + * B: read CS_TIMESTAMP from CPU
> + *
> + * Completion latency: B - A
> + */
> +
> + for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
> + struct signal_cb cb = { .seen = false };
> + struct i915_request *rq;
> +
> + rq = i915_request_create(ce);
> + if (IS_ERR(rq))
> + return PTR_ERR(rq);
> +
> + cs = intel_ring_begin(rq, 12);
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + return PTR_ERR(cs);
> + }
> +
> + cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
> + cs = emit_semaphore_poll_until(cs, offset, i);
> + cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
> +
> + intel_ring_advance(rq, cs);
> +
> + dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
> +
> + local_bh_disable();
> + i915_request_add(rq);
> + local_bh_enable();
> +
> + if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return -EIO;
> + }
> +
> + preempt_disable();
> + semaphore_set(sema, i);
> + while (!READ_ONCE(cb.seen))
> + cpu_relax();
> +
> + elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
> + preempt_enable();
> + }
> +
> + err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
> + if (err) {
> + intel_gt_set_wedged(ce->engine->gt);
> + return err;
> + }
> +
> + for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
> + GEM_BUG_ON(sema[i + 1] == -1);
> + elapsed[i] = elapsed[i] - sema[i + 1];
> + }
> +
> + cycles = trifilter(elapsed);
> + pr_info("%s: completion latency %d cycles, %lluns\n",
> + ce->engine->name, cycles >> TF_BIAS,
> + cycles_to_ns(ce->engine, cycles));
> +
> + return intel_gt_wait_for_idle(ce->engine->gt, HZ);
> +}
> +
> +static void rps_pin(struct intel_gt *gt)
> +{
> + /* Pin the frequency to max */
> + atomic_inc(>->rps.num_waiters);
> + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
> +
> + mutex_lock(>->rps.lock);
> + intel_rps_set(>->rps, gt->rps.max_freq);
> + mutex_unlock(>->rps.lock);
> +}
> +
> +static void rps_unpin(struct intel_gt *gt)
> +{
> + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
> + atomic_dec(>->rps.num_waiters);
> +}
> +
> +static void engine_heartbeat_disable(struct intel_engine_cs *engine)
> +{
> + engine->props.heartbeat_interval_ms = 0;
> +
> + intel_engine_pm_get(engine);
> + intel_engine_park_heartbeat(engine);
> +}
> +
> +static void engine_heartbeat_enable(struct intel_engine_cs *engine)
> +{
> + intel_engine_pm_put(engine);
> +
> + engine->props.heartbeat_interval_ms =
> + engine->defaults.heartbeat_interval_ms;
> +}
> +
> +static int perf_request_latency(void *arg)
> +{
> + struct drm_i915_private *i915 = arg;
> + struct intel_engine_cs *engine;
> + struct pm_qos_request qos;
> + int err = 0;
> +
> + if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
> + return 0;
> +
> + cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
> +
> + for_each_uabi_engine(engine, i915) {
> + struct intel_context *ce;
> +
> + ce = intel_context_create(engine);
> + if (IS_ERR(ce))
> + goto out;
> +
> + err = intel_context_pin(ce);
> + if (err) {
> + intel_context_put(ce);
> + goto out;
> + }
> +
> + engine_heartbeat_disable(engine);
> + rps_pin(engine->gt);
> +
> + if (err == 0)
> + err = measure_semaphore_response(ce);
> + if (err == 0)
> + err = measure_idle_dispatch(ce);
> + if (err == 0)
> + err = measure_busy_dispatch(ce);
> + if (err == 0)
> + err = measure_inter_request(ce);
> + if (err == 0)
> + err = measure_context_switch(ce);
> + if (err == 0)
> + err = measure_preemption(ce);
> + if (err == 0)
> + err = measure_completion(ce);
> +
> + rps_unpin(engine->gt);
> + engine_heartbeat_enable(engine);
> +
> + intel_context_unpin(ce);
> + intel_context_put(ce);
> + if (err)
> + goto out;
> + }
> +
> +out:
> + if (igt_flush_test(i915))
> + err = -EIO;
> +
> + cpu_latency_qos_remove_request(&qos);
> + return err;
> +}
> +
> static int s_sync0(void *arg)
> {
> struct perf_series *ps = arg;
> @@ -2042,6 +2820,7 @@ static int perf_parallel_engines(void *arg)
> int i915_request_perf_selftests(struct drm_i915_private *i915)
> {
> static const struct i915_subtest tests[] = {
> + SUBTEST(perf_request_latency),
> SUBTEST(perf_series_engines),
> SUBTEST(perf_parallel_engines),
> };
> --
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
2020-05-19 12:47 ` Mika Kuoppala
@ 2020-05-19 12:56 ` Chris Wilson
0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-19 12:56 UTC (permalink / raw)
To: Mika Kuoppala, intel-gfx
Quoting Mika Kuoppala (2020-05-19 13:47:31)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
>
> > A useful metric of the system's health is how fast we can tell the GPU
> > to do various actions, so measure our latency.
> >
> > v2: Refactor all the instruction building into emitters.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>
> Not much nitpicking left. Could have used one goto in the fence
> using tests on error paths but meh.
Error handling is not great here, I agree.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
@ 2020-05-15 15:10 Chris Wilson
2020-05-15 17:32 ` Chris Wilson
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-15 15:10 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 505 ++++++++++++++++++
1 file changed, 505 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..94c79066f82a 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,13 +24,16 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
@@ -1524,6 +1527,507 @@ struct perf_series {
struct intel_context *ce[];
};
+#define COUNT 5
+
+/*
+ * Comparator passed to sort() for u32 samples; orders them ascending.
+ *
+ * Compare explicitly rather than returning *a - *b: that unsigned difference
+ * is converted to int, so two samples more than INT_MAX apart would be
+ * reported with the wrong sign and end up mis-ordered.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+
+	return *a > *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+ u64 sum;
+
+ sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+ sum = mul_u32_u32(a[2], 2);
+ sum += a[1];
+ sum += a[3];
+
+ return sum >> 2;
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int i;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+ *cs++ = offset;
+ *cs++ = 0;
+ *cs++ = 0xffffffff;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ WRITE_ONCE(sema[0], i);
+ wmb();
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_err("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+ cycles) >> COUNT);
+
+ return 0;
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ int err;
+
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_err("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+ cycles) >> COUNT);
+
+ return 0;
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+ const u32 offset =
+ i915_ggtt_offset(engine->status_page.vma) +
+ offset_in_page(sema);
+ struct i915_request *rq;
+ u32 *cs;
+
+ rq = i915_request_create(engine->kernel_context);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ mode;
+ *cs++ = value;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ return 0;
+}
+
+/*
+ * Measure the gap between back-to-back requests on one context.
+ *
+ * The engine is first plugged with a kernel-context semaphore wait on
+ * sema[0], then COUNT + 1 requests are queued behind a single submit fence.
+ * Each request stores the register at mmio_base + 0x358 (presumably the CS
+ * timestamp) into its own sema[] slot. Once the fence is committed and the
+ * plug released, the difference between consecutive slots is filtered and
+ * reported.
+ *
+ * Returns 0 on success or a negative error code. On every failure after the
+ * plug has been queued, the submit fence (if created) is committed and
+ * released and the plug is signalled, so nothing is left blocked.
+ */
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		err = -ENOMEM;
+		goto err_unplug;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			/* Earlier requests still wait on the submit fence */
+			err = PTR_ERR(rq);
+			goto err_submit;
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_request_add(rq);
+			goto err_submit;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err_submit;
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	i915_sw_fence_commit(submit);
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	/* Release the plug and wait for the final store to land */
+	WRITE_ONCE(sema[0], 1);
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: inter-request latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+
+err_submit:
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+err_unplug:
+	WRITE_ONCE(sema[0], 1);
+	return err;
+}
+
+static int measure_lite_restore(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT + 1], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * (2 * i + 0);
+ *cs++ = 0;
+
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_GTE_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * (2 * i + 1);
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+ intel_engine_flush_submission(ce->engine);
+
+ WRITE_ONCE(sema[0], i - 1);
+ if (i > 1 && wait_for(READ_ONCE(sema[2 * (i - 1)]), 100)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+ }
+
+ WRITE_ONCE(sema[0], i - 1);
+ if (wait_for(READ_ONCE(sema[2 *i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_err("%s: lite restore latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+ cycles) >> COUNT);
+
+ return 0;
+}
+
+/*
+ * Measure the time taken to switch between two contexts.
+ *
+ * With the engine plugged by a kernel-context semaphore wait on sema[0],
+ * requests are queued alternately on @ce and on the engine's kernel context.
+ * Each request awaits the previous one and stores the register at
+ * mmio_base + 0x358 (presumably the CS timestamp) into slot 2 * i + j.
+ * After the plug is released, the reported latency is the filtered
+ * difference between a kernel-context store (slot 2 * i + 1) and the
+ * following @ce store (slot 2 * i + 2).
+ *
+ * Returns 0 on success or a negative error code. Failures while building
+ * the chain drop the held request reference and release the plug.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				err = PTR_ERR(rq);
+				goto err_fence;
+			}
+
+			if (fence) {
+				/* Serialise behind the previous request */
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					goto err_fence;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				err = PTR_ERR(cs);
+				goto err_fence;
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	/* Release the plug and wait for the last slot to be written */
+	WRITE_ONCE(sema[0], 1);
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: context switch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       i915_cs_timestamp_ticks_to_ns(ce->engine->i915,
+					     cycles) >> COUNT);
+
+	return 0;
+
+err_fence:
+	/* Drop the reference on the last queued request, then unplug */
+	i915_request_put(fence);
+	WRITE_ONCE(sema[0], 1);
+	return err;
+}
+
+/*
+ * Increment gt->rps.num_waiters and, under rps.lock, request
+ * gt->rps.max_freq. The waiter count is dropped again by rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+/* Drop the waiter count taken by rps_pin(). */
+static void rps_unpin(struct intel_gt *gt)
+{
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+ unsigned long old;
+
+ old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+ intel_engine_pm_get(engine);
+ intel_engine_park_heartbeat(engine);
+
+ return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+ unsigned long saved)
+{
+ intel_engine_pm_put(engine);
+
+ engine->props.heartbeat_interval_ms = saved;
+}
+
+/*
+ * Selftest entry point: for each uabi engine, create and pin a fresh
+ * context, then run the latency measurements in turn, stopping at the
+ * first failure.
+ *
+ * Skipped (returns 0) before gen8. Returns 0 on success or a negative
+ * error code.
+ */
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			/* Propagate the failure; err would otherwise stay 0 */
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_lite_restore(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+
+		rps_unpin(engine->gt);
+		intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2546,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
2020-05-15 15:10 Chris Wilson
@ 2020-05-15 17:32 ` Chris Wilson
2020-05-15 17:58 ` Chris Wilson
2020-05-15 18:02 ` Chris Wilson
2 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-15 17:32 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 521 ++++++++++++++++++
1 file changed, 521 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..a47ac68955ec 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
+#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
@@ -1524,6 +1528,522 @@ struct perf_series {
struct intel_context *ce[];
};
+#define COUNT 5
+
+/*
+ * Comparator passed to sort() for u32 samples; orders them ascending.
+ *
+ * Compare explicitly rather than returning *a - *b: that unsigned difference
+ * is converted to int, so two samples more than INT_MAX apart would be
+ * reported with the wrong sign and end up mis-ordered.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+	const u32 *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+
+	return *a > *b;
+}
+
+static u32 trifilter(u32 *a)
+{
+ u64 sum;
+
+ sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+ sum = mul_u32_u32(a[2], 2);
+ sum += a[1];
+ sum += a[3];
+
+ return (sum + 2) >> 2;
+}
+
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+ u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+ return DIV_ROUND_CLOSEST(ns, 1 << COUNT);
+}
+
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int i;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+ *cs++ = offset;
+ *cs++ = 0;
+ *cs++ = 0xffffffff;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ WRITE_ONCE(sema[0], i);
+ wmb(); /* flush the update to the cache, and beyond */
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_err("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ int err;
+
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_err("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT + 1], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_GTE_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+
+ WRITE_ONCE(sema[0], i - 1);
+ wmb(); /* flush the update to the cache, and beyond */
+ }
+ WRITE_ONCE(sema[0], i - 1);
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_err("%s: busy dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+ const u32 offset =
+ i915_ggtt_offset(engine->status_page.vma) +
+ offset_in_page(sema);
+ struct i915_request *rq;
+ u32 *cs;
+
+ rq = i915_request_create(engine->kernel_context);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ mode;
+ *cs++ = value;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ return 0;
+}
+
+/*
+ * Measure the gap between back-to-back requests on one context.
+ *
+ * The engine is first plugged with a kernel-context semaphore wait on
+ * sema[0], then COUNT + 1 requests are queued behind a single submit fence.
+ * Each request stores the register at mmio_base + 0x358 (presumably the CS
+ * timestamp) into its own sema[] slot. Once the fence is committed and the
+ * plug released, the difference between consecutive slots is filtered and
+ * reported.
+ *
+ * Returns a negative error code on failure, otherwise the result of waiting
+ * for the GT to idle. On every failure after the plug has been queued, the
+ * submit fence (if created) is committed and released and the plug is
+ * signalled, so nothing is left blocked.
+ */
+static int measure_inter_request(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	u32 elapsed[COUNT + 1], cycles;
+	struct i915_sw_fence *submit;
+	int i, err;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	submit = heap_fence_create(GFP_KERNEL);
+	if (!submit) {
+		err = -ENOMEM;
+		goto err_unplug;
+	}
+
+	intel_engine_flush_submission(ce->engine);
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct i915_request *rq;
+		u32 *cs;
+
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			/* Earlier requests still wait on the submit fence */
+			err = PTR_ERR(rq);
+			goto err_submit;
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			i915_request_add(rq);
+			goto err_submit;
+		}
+
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs)) {
+			i915_request_add(rq);
+			err = PTR_ERR(cs);
+			goto err_submit;
+		}
+
+		*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+		*cs++ = ce->engine->mmio_base + 0x358;
+		*cs++ = offset + sizeof(*sema) * i;
+		*cs++ = 0;
+
+		intel_ring_advance(rq, cs);
+		i915_request_add(rq);
+	}
+	local_bh_disable();
+	i915_sw_fence_commit(submit);
+	local_bh_enable();
+	intel_engine_flush_submission(ce->engine);
+	heap_fence_put(submit);
+
+	/* Release the plug and wait for the final store to land */
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: inter-request latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_submit:
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+err_unplug:
+	WRITE_ONCE(sema[0], 1);
+	return err;
+}
+
+/*
+ * Measure the time taken to switch between two contexts.
+ *
+ * With the engine plugged by a kernel-context semaphore wait on sema[0],
+ * requests are queued alternately on @ce and on the engine's kernel context.
+ * Each request awaits the previous one and stores the register at
+ * mmio_base + 0x358 (presumably the CS timestamp) into slot 2 * i + j.
+ * After the plug is released, the reported latency is the filtered
+ * difference between a kernel-context store (slot 2 * i + 1) and the
+ * following @ce store (slot 2 * i + 2).
+ *
+ * Returns a negative error code on failure, otherwise the result of waiting
+ * for the GT to idle. Failures while building the chain drop the held
+ * request reference and release the plug.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+	u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+	const u32 offset =
+		i915_ggtt_offset(ce->engine->status_page.vma) +
+		offset_in_page(sema);
+	struct i915_request *fence = NULL;
+	u32 elapsed[COUNT + 1], cycles;
+	int i, j, err;
+	u32 *cs;
+
+	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+	if (err)
+		return err;
+
+	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+		struct intel_context *arr[] = {
+			ce, ce->engine->kernel_context
+		};
+
+		for (j = 0; j < ARRAY_SIZE(arr); j++) {
+			struct i915_request *rq;
+
+			rq = i915_request_create(arr[j]);
+			if (IS_ERR(rq)) {
+				err = PTR_ERR(rq);
+				goto err_fence;
+			}
+
+			if (fence) {
+				/* Serialise behind the previous request */
+				err = i915_request_await_dma_fence(rq,
+								   &fence->fence);
+				if (err) {
+					i915_request_add(rq);
+					goto err_fence;
+				}
+			}
+
+			cs = intel_ring_begin(rq, 4);
+			if (IS_ERR(cs)) {
+				i915_request_add(rq);
+				err = PTR_ERR(cs);
+				goto err_fence;
+			}
+
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+			*cs++ = ce->engine->mmio_base + 0x358;
+			*cs++ = offset +
+				sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+			*cs++ = 0;
+
+			intel_ring_advance(rq, cs);
+
+			i915_request_put(fence);
+			fence = i915_request_get(rq);
+
+			i915_request_add(rq);
+		}
+	}
+	i915_request_put(fence);
+	intel_engine_flush_submission(ce->engine);
+
+	/* Release the plug and wait for the last slot to be written */
+	WRITE_ONCE(sema[0], 1);
+	wmb(); /* flush the update to the cache, and beyond */
+
+	if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+		intel_gt_set_wedged(ce->engine->gt);
+		return -EIO;
+	}
+
+	for (i = 1; i <= COUNT; i++)
+		elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+	cycles = trifilter(elapsed);
+	pr_err("%s: context switch latency %d cycles, %lluns\n",
+	       ce->engine->name, cycles >> COUNT,
+	       cycles_to_ns(ce->engine, cycles));
+
+	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+	/* Drop the reference on the last queued request, then unplug */
+	i915_request_put(fence);
+	WRITE_ONCE(sema[0], 1);
+	return err;
+}
+
+/*
+ * Increment gt->rps.num_waiters, take all forcewake domains and, under
+ * rps.lock, request gt->rps.max_freq. Undone by rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+	/* Pin the frequency to max */
+	atomic_inc(&gt->rps.num_waiters);
+	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+	mutex_lock(&gt->rps.lock);
+	intel_rps_set(&gt->rps, gt->rps.max_freq);
+	mutex_unlock(&gt->rps.lock);
+}
+
+/* Release the forcewake and waiter count taken by rps_pin(). */
+static void rps_unpin(struct intel_gt *gt)
+{
+	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+	atomic_dec(&gt->rps.num_waiters);
+}
+
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+ unsigned long old;
+
+ old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+ intel_engine_pm_get(engine);
+ intel_engine_park_heartbeat(engine);
+
+ return old;
+}
+
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+ unsigned long saved)
+{
+ intel_engine_pm_put(engine);
+
+ engine->props.heartbeat_interval_ms = saved;
+}
+
+/*
+ * Selftest entry point: for each uabi engine, create and pin a fresh
+ * context, then run the latency measurements in turn, stopping at the
+ * first failure. igt_flush_test() is run before returning, and its failure
+ * overrides the result with -EIO.
+ *
+ * Skipped (returns 0) before gen8. Returns 0 on success or a negative
+ * error code.
+ */
+static int perf_request_latency(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct pm_qos_request qos;
+	int err = 0;
+
+	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+		return 0;
+
+	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+	for_each_uabi_engine(engine, i915) {
+		unsigned long saved_heartbeat;
+		struct intel_context *ce;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			/* Propagate the failure; err would otherwise stay 0 */
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
+		err = intel_context_pin(ce);
+		if (err) {
+			intel_context_put(ce);
+			goto out;
+		}
+
+		saved_heartbeat = engine_heartbeat_disable(engine);
+		rps_pin(engine->gt);
+
+		if (err == 0)
+			err = measure_semaphore_response(ce);
+		if (err == 0)
+			err = measure_idle_dispatch(ce);
+		if (err == 0)
+			err = measure_busy_dispatch(ce);
+		if (err == 0)
+			err = measure_inter_request(ce);
+		if (err == 0)
+			err = measure_context_switch(ce);
+
+		rps_unpin(engine->gt);
+		engine_heartbeat_enable(engine, saved_heartbeat);
+
+		intel_context_unpin(ce);
+		intel_context_put(ce);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (igt_flush_test(i915))
+		err = -EIO;
+
+	cpu_latency_qos_remove_request(&qos);
+	return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2562,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
2020-05-15 15:10 Chris Wilson
2020-05-15 17:32 ` Chris Wilson
@ 2020-05-15 17:58 ` Chris Wilson
2020-05-15 18:02 ` Chris Wilson
2 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-15 17:58 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 612 ++++++++++++++++++
1 file changed, 612 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..c265501c26a8 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
+#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
@@ -1524,6 +1528,613 @@ struct perf_series {
struct intel_context *ce[];
};
+/*
+ * Number of samples handed to trifilter() for each measurement. The same
+ * value doubles as a fixed-point shift: samples are stored as
+ * (delta << COUNT) and scaled back with >> COUNT or / (1 << COUNT).
+ */
+#define COUNT 5
+
+/*
+ * sort() comparator ordering u32 samples in ascending order.
+ *
+ * The values are compared explicitly instead of returning *a - *b: that
+ * u32 difference is converted to int on return, so two samples more than
+ * INT_MAX apart would be reported with the wrong sign and be misordered.
+ *
+ * Returns a negative value, zero or a positive value if *A is less than,
+ * equal to or greater than *B.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+ const u32 *a = A, *b = B;
+
+ if (*a < *b)
+ return -1;
+
+ return *a > *b;
+}
+
+/*
+ * Reduce COUNT samples to one value: sort the array in place and return
+ * the rounded 1:2:1 weighted mean of elements 1, 2 and 3 (with COUNT == 5
+ * that is the median and its neighbours, discarding both extremes).
+ *
+ * Note that the caller's array is reordered by the sort.
+ */
+static u32 trifilter(u32 *a)
+{
+ u64 acc;
+
+ sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+ /* accumulate in 64b so that the weighted sum cannot wrap */
+ acc = mul_u32_u32(a[2], 2);
+ acc += (u64)a[1] + a[3];
+
+ return (acc + 2) / 4; /* divide by the total weight, rounding */
+}
+
+/*
+ * Convert a filtered sample from CS timestamp ticks to nanoseconds. The
+ * callers in this file pass the trifilter() result, which is still scaled
+ * by (1 << COUNT), so that scaling is divided out again here.
+ */
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+ u64 scaled_ns;
+
+ scaled_ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+ /* drop the fixed-point scaling, rounding to nearest */
+ return DIV_ROUND_CLOSEST(scaled_ns, 1 << COUNT);
+}
+
+/*
+ * Measure how quickly the engine reacts to the CPU updating a semaphore
+ * that the command streamer is polling.
+ *
+ * A single request is queued which first writes 0xffffffff to sema[0] and
+ * then, for each sample i, waits for sema[0] == i and stores the register
+ * at mmio_base + 0x358 into sema[i]. The CPU reads RING_TIMESTAMP, writes
+ * i to sema[0] and waits for sema[i] to become non-zero; the difference
+ * between the stored value and the CPU read is one sample.
+ * NOTE(review): 0x358 is presumably the same per-engine timestamp that
+ * RING_TIMESTAMP reads - confirm.
+ *
+ * Returns 0 on success or a negative error code. If the engine does not
+ * respond before wait_for() times out, the GT is wedged and -EIO returned.
+ */
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int i;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* tell the CPU that the request has started executing */
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+ *cs++ = offset;
+ *cs++ = 0;
+ *cs++ = 0xffffffff;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ /* poll until the CPU writes i into sema[0] ... */
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ /* ... then record the register value in sema[i] */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ /* wait for the initial store before starting to take samples */
+ if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ WRITE_ONCE(sema[0], i);
+ wmb(); /* flush the update to the cache, and beyond */
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* keep COUNT fractional bits for the filter */
+ elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure the latency from submitting a request while the GT is idle until
+ * the engine executes it.
+ *
+ * For each sample the GT is first waited upon to idle, then RING_TIMESTAMP
+ * is read immediately before i915_request_add(); the request itself stores
+ * the register at mmio_base + 0x358 into sema[i]. The sample is the
+ * difference between the stored value and the CPU-side read.
+ *
+ * Returns 0 on success or a negative error code. If a result does not
+ * appear before wait_for() times out, the GT is wedged and -EIO returned.
+ */
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ int err;
+
+ /* every sample has to start from an idle GT */
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ /* take the start stamp and submit without being interrupted */
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ preempt_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* replace the start stamp with the scaled delta */
+ elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure the latency of dispatching a request while the engine is still
+ * occupied by the previous one.
+ *
+ * COUNT + 1 requests are queued. Request i stores the register at
+ * mmio_base + 0x358 into sema[i], polls sema[0] against i using
+ * MI_SEMAPHORE_SAD_GTE_SDD (presumably "sema[0] >= i" - confirm), and then
+ * stores the register into sema[i] a second time. The CPU waits for the
+ * previous request's first store, reads RING_TIMESTAMP into
+ * elapsed[i - 1], adds request i and then writes i - 1 to sema[0].
+ *
+ * NOTE(review): nothing after the loop waits for the final stores to land
+ * before sema[] is read for the results - confirm this is intended.
+ *
+ * Returns 0 on success or a negative error code. If a request does not
+ * start before wait_for() times out, the GT is wedged and -EIO returned.
+ */
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT + 1], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* first store: lets the CPU see that this request is running */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ /* keep the engine busy until the CPU updates sema[0] */
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_GTE_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ /* second store: overwrites sema[i] once the wait is over */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ /* the previous request must be executing before we submit */
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ WRITE_ONCE(sema[0], i - 1);
+ wmb(); /* flush the update to the cache, and beyond */
+ preempt_enable();
+ }
+ /* i is one past the last request here: write the final release value */
+ WRITE_ONCE(sema[0], i - 1);
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Queue a request on the engine's kernel context that does nothing but
+ * poll a semaphore at @sema (which must lie within the engine's status
+ * page), comparing it against @value with the MI_SEMAPHORE_SAD_*
+ * comparison given in @mode. The callers in this file use it before
+ * queueing their own requests and later write to sema[0] to release it.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+ const u32 offset =
+ i915_ggtt_offset(engine->status_page.vma) +
+ offset_in_page(sema);
+ struct i915_request *request;
+ u32 *cmd;
+
+ request = i915_request_create(engine->kernel_context);
+ if (IS_ERR(request))
+ return PTR_ERR(request);
+
+ cmd = intel_ring_begin(request, 4);
+ if (IS_ERR(cmd)) {
+ i915_request_add(request);
+ return PTR_ERR(cmd);
+ }
+
+ /* a single four-dword MI_SEMAPHORE_WAIT on the GGTT address */
+ cmd[0] = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ mode;
+ cmd[1] = value;
+ cmd[2] = offset;
+ cmd[3] = 0;
+
+ intel_ring_advance(request, cmd + 4);
+ i915_request_add(request);
+
+ return 0;
+}
+
+/*
+ * Measure the gap between the execution of consecutive requests.
+ *
+ * The engine is first blocked with plug(). COUNT + 1 requests, each
+ * storing the register at mmio_base + 0x358 into sema[i], are then queued
+ * behind a shared software fence so that they are all released by a single
+ * i915_sw_fence_commit(). Writing 1 to sema[0] removes the plug and each
+ * sample is the difference between two consecutive stores.
+ *
+ * Every error path after the fence has been created must commit and drop
+ * it, and every error path after plug() must write sema[0] to release the
+ * plug.
+ *
+ * Returns 0 on success or a negative error code. If the last store does
+ * not appear before wait_for() times out, the GT is wedged and -EIO
+ * returned.
+ */
+static int measure_inter_request(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT + 1], cycles;
+ struct i915_sw_fence *submit;
+ int i, err;
+
+ err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+ if (err)
+ return err;
+
+ submit = heap_fence_create(GFP_KERNEL);
+ if (!submit) {
+ WRITE_ONCE(sema[0], 1);
+ return -ENOMEM;
+ }
+
+ intel_engine_flush_submission(ce->engine);
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ u32 *cs;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ /* do not leak the fence (or block earlier requests) */
+ i915_sw_fence_commit(submit);
+ heap_fence_put(submit);
+ WRITE_ONCE(sema[0], 1);
+ return PTR_ERR(rq);
+ }
+
+ /* hold the request back until the shared fence is committed */
+ err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+ submit,
+ GFP_KERNEL);
+ if (err < 0) {
+ i915_sw_fence_commit(submit);
+ heap_fence_put(submit);
+ i915_request_add(rq);
+ WRITE_ONCE(sema[0], 1);
+ return err;
+ }
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_sw_fence_commit(submit);
+ heap_fence_put(submit);
+ i915_request_add(rq);
+ WRITE_ONCE(sema[0], 1);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+ }
+ /* release all of the requests in one go */
+ local_bh_disable();
+ i915_sw_fence_commit(submit);
+ local_bh_enable();
+ intel_engine_flush_submission(ce->engine);
+ heap_fence_put(submit);
+
+ /* remove the plug */
+ WRITE_ONCE(sema[0], 1);
+ wmb(); /* flush the update to the cache, and beyond */
+
+ if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: inter-request latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure the time taken to switch between two contexts.
+ *
+ * The engine is first blocked with plug(). Requests are then queued
+ * alternately on @ce and on the engine's kernel context, each one awaiting
+ * the fence of the request before it and storing the register at
+ * mmio_base + 0x358 into its own slot of sema[]. Writing 1 to sema[0]
+ * removes the plug; each sample is the difference between the store of a
+ * kernel-context request and the store of the @ce request that follows it.
+ *
+ * On failure while building the chain, the reference held on the previous
+ * request is dropped and the plug is released so that whatever has already
+ * been queued is not left blocked.
+ *
+ * Returns 0 on success or a negative error code. If the last store does
+ * not appear before wait_for() times out, the GT is wedged and -EIO
+ * returned.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ struct i915_request *fence = NULL;
+ u32 elapsed[COUNT + 1], cycles;
+ int i, j, err;
+ u32 *cs;
+
+ err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+ if (err)
+ return err;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct intel_context *arr[] = {
+ ce, ce->engine->kernel_context
+ };
+
+ for (j = 0; j < ARRAY_SIZE(arr); j++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(arr[j]);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err_fence;
+ }
+
+ /* order this request after the previous one */
+ if (fence) {
+ err = i915_request_await_dma_fence(rq,
+ &fence->fence);
+ if (err) {
+ i915_request_add(rq);
+ goto err_fence;
+ }
+ }
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ err = PTR_ERR(cs);
+ i915_request_add(rq);
+ goto err_fence;
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset +
+ sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ /* keep a reference on the newest request only */
+ i915_request_put(fence);
+ fence = i915_request_get(rq);
+
+ i915_request_add(rq);
+ }
+ }
+ i915_request_put(fence);
+ intel_engine_flush_submission(ce->engine);
+
+ /* remove the plug */
+ WRITE_ONCE(sema[0], 1);
+ wmb(); /* flush the update to the cache, and beyond */
+
+ /* i is one past the last pair, so 2 * i - 1 is the final slot written */
+ if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: context switch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+ i915_request_put(fence); /* may still be NULL, as in the loop above */
+ WRITE_ONCE(sema[0], 1); /* release the plug */
+ return err;
+}
+
+/* dma-fence callback wrapper that records whether the callback has run */
+struct signal_cb {
+ struct dma_fence_cb base; /* node handed to dma_fence_add_callback() */
+ bool seen; /* set to true by signal_cb() */
+};
+
+/*
+ * dma-fence callback: mark the enclosing struct signal_cb as seen so that
+ * the CPU spinning on ->seen in measure_completion() can proceed. @fence
+ * is unused.
+ */
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+ struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+ smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+/*
+ * Measure the latency between the engine finishing a request and the CPU
+ * seeing the request's fence callback.
+ *
+ * Each request stores the register at mmio_base + 0x358 into sema[i],
+ * polls until sema[0] == i and then stores the register into sema[i]
+ * again. Once the first store is visible the CPU, with preemption
+ * disabled, writes i to sema[0], spins until signal_cb() has run and then
+ * reads RING_TIMESTAMP. The sample is that CPU read minus the value the
+ * request stored last: the CPU read is taken after the store, so
+ * subtracting in the other order would wrap around.
+ *
+ * NOTE(review): the return value of dma_fence_add_callback() is ignored
+ * and cb lives on the stack; confirm that the callback cannot run after
+ * the -EIO return below.
+ *
+ * Returns 0 on success or a negative error code. If a request does not
+ * start before wait_for() times out, the GT is wedged and -EIO returned.
+ */
+static int measure_completion(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct signal_cb cb = { .seen = false };
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* first store: lets the CPU see that this request is running */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ /* poll until the CPU writes i into sema[0] */
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ /* second store: the stamp taken just before the request ends */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ preempt_disable();
+ WRITE_ONCE(sema[0], i);
+ wmb(); /* flush the update to the cache, and beyond */
+ while (!READ_ONCE(cb.seen))
+ cpu_relax();
+
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ preempt_enable();
+ }
+
+ /* CPU stamp (later) minus the stamp stored by the request (earlier) */
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+ elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: completion latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Hold the GT at its maximum frequency while measuring: register as an RPS
+ * waiter, take all forcewake domains and request rps.max_freq under
+ * rps.lock. Undone by rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+ /* Pin the frequency to max */
+ atomic_inc(&gt->rps.num_waiters);
+ intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+ mutex_lock(&gt->rps.lock);
+ intel_rps_set(&gt->rps, gt->rps.max_freq);
+ mutex_unlock(&gt->rps.lock);
+}
+
+/*
+ * Undo rps_pin(): release the forcewake domains and drop the RPS waiter
+ * count again.
+ */
+static void rps_unpin(struct intel_gt *gt)
+{
+ intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+ atomic_dec(&gt->rps.num_waiters);
+}
+
+/*
+ * Switch off the engine heartbeat for the duration of the measurements:
+ * zero the heartbeat interval, take an engine-pm reference and park the
+ * heartbeat.
+ *
+ * Returns the previous heartbeat_interval_ms, to be handed back to
+ * engine_heartbeat_enable().
+ */
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+ /* read out the old interval, leaving zero in its place */
+ const unsigned long interval_ms =
+ fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+ intel_engine_pm_get(engine);
+ intel_engine_park_heartbeat(engine);
+
+ return interval_ms;
+}
+
+/*
+ * Undo engine_heartbeat_disable(): drop the engine-pm reference and put
+ * back the heartbeat interval in @saved.
+ */
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+ unsigned long saved)
+{
+ intel_engine_pm_put(engine);
+
+ engine->props.heartbeat_interval_ms = saved;
+}
+
+/*
+ * Selftest entry point: run each of the latency measurements in turn on
+ * every uabi engine, using a freshly created and pinned context per
+ * engine. The engine heartbeat is disabled and the GT frequency pinned
+ * while an engine is being measured. Skipped (returns 0) before gen8.
+ *
+ * Returns 0 on success or a negative error code from the first failure;
+ * -EIO if igt_flush_test() fails at the end.
+ */
+static int perf_request_latency(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct intel_engine_cs *engine;
+ struct pm_qos_request qos;
+ int err = 0;
+
+ if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+ return 0;
+
+ cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+ for_each_uabi_engine(engine, i915) {
+ unsigned long saved_heartbeat;
+ struct intel_context *ce;
+
+ ce = intel_context_create(engine);
+ if (IS_ERR(ce)) {
+ /* report the failure instead of returning success */
+ err = PTR_ERR(ce);
+ goto out;
+ }
+
+ err = intel_context_pin(ce);
+ if (err) {
+ intel_context_put(ce);
+ goto out;
+ }
+
+ saved_heartbeat = engine_heartbeat_disable(engine);
+ rps_pin(engine->gt);
+
+ /* stop at the first measurement that fails */
+ if (err == 0)
+ err = measure_semaphore_response(ce);
+ if (err == 0)
+ err = measure_idle_dispatch(ce);
+ if (err == 0)
+ err = measure_busy_dispatch(ce);
+ if (err == 0)
+ err = measure_inter_request(ce);
+ if (err == 0)
+ err = measure_context_switch(ce);
+ if (err == 0)
+ err = measure_completion(ce);
+
+ rps_unpin(engine->gt);
+ engine_heartbeat_enable(engine, saved_heartbeat);
+
+ intel_context_unpin(ce);
+ intel_context_put(ce);
+ if (err)
+ goto out;
+ }
+
+out:
+ if (igt_flush_test(i915))
+ err = -EIO;
+
+ cpu_latency_qos_remove_request(&qos);
+ return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2653,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency
2020-05-15 15:10 Chris Wilson
2020-05-15 17:32 ` Chris Wilson
2020-05-15 17:58 ` Chris Wilson
@ 2020-05-15 18:02 ` Chris Wilson
2 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-05-15 18:02 UTC (permalink / raw)
To: intel-gfx; +Cc: Chris Wilson
A useful metric of the system's health is how fast we can tell the GPU
to do various actions, so measure our latency.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/i915_request.c | 612 ++++++++++++++++++
1 file changed, 612 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6014e8dfcbb1..bfb601214c4a 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -24,16 +24,20 @@
#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
+#include <linux/sort.h>
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_gt_requests.h"
#include "i915_random.h"
#include "i915_selftest.h"
+#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
@@ -1524,6 +1528,613 @@ struct perf_series {
struct intel_context *ce[];
};
+/*
+ * Number of samples handed to trifilter() for each measurement. The same
+ * value doubles as a fixed-point shift: samples are stored as
+ * (delta << COUNT) and scaled back with >> COUNT or / (1 << COUNT).
+ */
+#define COUNT 5
+
+/*
+ * sort() comparator ordering u32 samples in ascending order.
+ *
+ * The values are compared explicitly instead of returning *a - *b: that
+ * u32 difference is converted to int on return, so two samples more than
+ * INT_MAX apart would be reported with the wrong sign and be misordered.
+ *
+ * Returns a negative value, zero or a positive value if *A is less than,
+ * equal to or greater than *B.
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+ const u32 *a = A, *b = B;
+
+ if (*a < *b)
+ return -1;
+
+ return *a > *b;
+}
+
+/*
+ * Reduce COUNT samples to one value: sort the array in place and return
+ * the rounded 1:2:1 weighted mean of elements 1, 2 and 3 (with COUNT == 5
+ * that is the median and its neighbours, discarding both extremes).
+ *
+ * Note that the caller's array is reordered by the sort.
+ */
+static u32 trifilter(u32 *a)
+{
+ u64 acc;
+
+ sort(a, COUNT, sizeof(*a), cmp_u32, NULL);
+
+ /* accumulate in 64b so that the weighted sum cannot wrap */
+ acc = mul_u32_u32(a[2], 2);
+ acc += (u64)a[1] + a[3];
+
+ return (acc + 2) / 4; /* divide by the total weight, rounding */
+}
+
+/*
+ * Convert a filtered sample from CS timestamp ticks to nanoseconds. The
+ * callers in this file pass the trifilter() result, which is still scaled
+ * by (1 << COUNT), so that scaling is divided out again here.
+ */
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
+{
+ u64 scaled_ns;
+
+ scaled_ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
+
+ /* drop the fixed-point scaling, rounding to nearest */
+ return DIV_ROUND_CLOSEST(scaled_ns, 1 << COUNT);
+}
+
+/*
+ * Measure how quickly the engine reacts to the CPU updating a semaphore
+ * that the command streamer is polling.
+ *
+ * A single request is queued which first writes 0xffffffff to sema[0] and
+ * then, for each sample i, waits for sema[0] == i and stores the register
+ * at mmio_base + 0x358 into sema[i]. The CPU reads RING_TIMESTAMP, writes
+ * i to sema[0] and waits for sema[i] to become non-zero; the difference
+ * between the stored value and the CPU read is one sample.
+ * NOTE(review): 0x358 is presumably the same per-engine timestamp that
+ * RING_TIMESTAMP reads - confirm.
+ *
+ * Returns 0 on success or a negative error code. If the engine does not
+ * respond before wait_for() times out, the GT is wedged and -EIO returned.
+ */
+static int measure_semaphore_response(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ struct i915_request *rq;
+ u32 *cs;
+ int i;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4 + 8 * ARRAY_SIZE(elapsed));
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* tell the CPU that the request has started executing */
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+ *cs++ = offset;
+ *cs++ = 0;
+ *cs++ = 0xffffffff;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ /* poll until the CPU writes i into sema[0] ... */
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ /* ... then record the register value in sema[i] */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+ }
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ /* wait for the initial store before starting to take samples */
+ if (wait_for(READ_ONCE(*sema) == 0xffffffff, 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ WRITE_ONCE(sema[0], i);
+ wmb(); /* flush the update to the cache, and beyond */
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* keep COUNT fractional bits for the filter */
+ elapsed[i - 1] = (sema[i] - cycles) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: semaphore response %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure the latency from submitting a request while the GT is idle until
+ * the engine executes it.
+ *
+ * For each sample the GT is first waited upon to idle, then RING_TIMESTAMP
+ * is read immediately before i915_request_add(); the request itself stores
+ * the register at mmio_base + 0x358 into sema[i]. The sample is the
+ * difference between the stored value and the CPU-side read.
+ *
+ * Returns 0 on success or a negative error code. If a result does not
+ * appear before wait_for() times out, the GT is wedged and -EIO returned.
+ */
+static int measure_idle_dispatch(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ int err;
+
+ /* every sample has to start from an idle GT */
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
+ if (err)
+ return err;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ /* take the start stamp and submit without being interrupted */
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ preempt_enable();
+ }
+
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ /* replace the start stamp with the scaled delta */
+ elapsed[i] = (sema[i] - elapsed[i]) << COUNT;
+ }
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure the latency of dispatching a request while the engine is still
+ * occupied by the previous one.
+ *
+ * COUNT + 1 requests are queued. Request i stores the register at
+ * mmio_base + 0x358 into sema[i], polls sema[0] against i using
+ * MI_SEMAPHORE_SAD_GTE_SDD (presumably "sema[0] >= i" - confirm), and then
+ * stores the register into sema[i] a second time. The CPU waits for the
+ * previous request's first store, reads RING_TIMESTAMP into
+ * elapsed[i - 1], adds request i and then writes i - 1 to sema[0].
+ *
+ * NOTE(review): nothing after the loop waits for the final stores to land
+ * before sema[] is read for the results - confirm this is intended.
+ *
+ * Returns 0 on success or a negative error code. If a request does not
+ * start before wait_for() times out, the GT is wedged and -EIO returned.
+ */
+static int measure_busy_dispatch(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT + 1], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* first store: lets the CPU see that this request is running */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ /* keep the engine busy until the CPU updates sema[0] */
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_GTE_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ /* second store: overwrites sema[i] once the wait is over */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ /* the previous request must be executing before we submit */
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ preempt_disable();
+ local_bh_disable();
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ i915_request_add(rq);
+ local_bh_enable();
+ WRITE_ONCE(sema[0], i - 1);
+ wmb(); /* flush the update to the cache, and beyond */
+ preempt_enable();
+ }
+ /* i is one past the last request here: write the final release value */
+ WRITE_ONCE(sema[0], i - 1);
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[i] - elapsed[i]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Queue a request on the engine's kernel context that does nothing but
+ * poll a semaphore at @sema (which must lie within the engine's status
+ * page), comparing it against @value with the MI_SEMAPHORE_SAD_*
+ * comparison given in @mode. The callers in this file use it before
+ * queueing their own requests and later write to sema[0] to release it.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
+{
+ const u32 offset =
+ i915_ggtt_offset(engine->status_page.vma) +
+ offset_in_page(sema);
+ struct i915_request *request;
+ u32 *cmd;
+
+ request = i915_request_create(engine->kernel_context);
+ if (IS_ERR(request))
+ return PTR_ERR(request);
+
+ cmd = intel_ring_begin(request, 4);
+ if (IS_ERR(cmd)) {
+ i915_request_add(request);
+ return PTR_ERR(cmd);
+ }
+
+ /* a single four-dword MI_SEMAPHORE_WAIT on the GGTT address */
+ cmd[0] = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ mode;
+ cmd[1] = value;
+ cmd[2] = offset;
+ cmd[3] = 0;
+
+ intel_ring_advance(request, cmd + 4);
+ i915_request_add(request);
+
+ return 0;
+}
+
+/*
+ * Measure the gap between the execution of consecutive requests.
+ *
+ * The engine is first blocked with plug(). COUNT + 1 requests, each
+ * storing the register at mmio_base + 0x358 into sema[i], are then queued
+ * behind a shared software fence so that they are all released by a single
+ * i915_sw_fence_commit(). Writing 1 to sema[0] removes the plug and each
+ * sample is the difference between two consecutive stores.
+ *
+ * Every error path after the fence has been created must commit and drop
+ * it, and every error path after plug() must write sema[0] to release the
+ * plug.
+ *
+ * Returns 0 on success or a negative error code. If the last store does
+ * not appear before wait_for() times out, the GT is wedged and -EIO
+ * returned.
+ */
+static int measure_inter_request(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT + 1], cycles;
+ struct i915_sw_fence *submit;
+ int i, err;
+
+ err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+ if (err)
+ return err;
+
+ submit = heap_fence_create(GFP_KERNEL);
+ if (!submit) {
+ WRITE_ONCE(sema[0], 1);
+ return -ENOMEM;
+ }
+
+ intel_engine_flush_submission(ce->engine);
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct i915_request *rq;
+ u32 *cs;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ /* do not leak the fence (or block earlier requests) */
+ i915_sw_fence_commit(submit);
+ heap_fence_put(submit);
+ WRITE_ONCE(sema[0], 1);
+ return PTR_ERR(rq);
+ }
+
+ /* hold the request back until the shared fence is committed */
+ err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+ submit,
+ GFP_KERNEL);
+ if (err < 0) {
+ i915_sw_fence_commit(submit);
+ heap_fence_put(submit);
+ i915_request_add(rq);
+ WRITE_ONCE(sema[0], 1);
+ return err;
+ }
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ i915_sw_fence_commit(submit);
+ heap_fence_put(submit);
+ i915_request_add(rq);
+ WRITE_ONCE(sema[0], 1);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+ }
+ /* release all of the requests in one go */
+ local_bh_disable();
+ i915_sw_fence_commit(submit);
+ local_bh_enable();
+ intel_engine_flush_submission(ce->engine);
+ heap_fence_put(submit);
+
+ /* remove the plug */
+ WRITE_ONCE(sema[0], 1);
+ wmb(); /* flush the update to the cache, and beyond */
+
+ if (wait_for(READ_ONCE(sema[COUNT + 1]), 100)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[i + 1] - sema[i]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: inter-request latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Measure the time taken to switch between two contexts.
+ *
+ * The engine is first blocked with plug(). Requests are then queued
+ * alternately on @ce and on the engine's kernel context, each one awaiting
+ * the fence of the request before it and storing the register at
+ * mmio_base + 0x358 into its own slot of sema[]. Writing 1 to sema[0]
+ * removes the plug; each sample is the difference between the store of a
+ * kernel-context request and the store of the @ce request that follows it.
+ *
+ * On failure while building the chain, the reference held on the previous
+ * request is dropped and the plug is released so that whatever has already
+ * been queued is not left blocked.
+ *
+ * Returns 0 on success or a negative error code. If the last store does
+ * not appear before wait_for() times out, the GT is wedged and -EIO
+ * returned.
+ */
+static int measure_context_switch(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ struct i915_request *fence = NULL;
+ u32 elapsed[COUNT + 1], cycles;
+ int i, j, err;
+ u32 *cs;
+
+ err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
+ if (err)
+ return err;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct intel_context *arr[] = {
+ ce, ce->engine->kernel_context
+ };
+
+ for (j = 0; j < ARRAY_SIZE(arr); j++) {
+ struct i915_request *rq;
+
+ rq = i915_request_create(arr[j]);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err_fence;
+ }
+
+ /* order this request after the previous one */
+ if (fence) {
+ err = i915_request_await_dma_fence(rq,
+ &fence->fence);
+ if (err) {
+ i915_request_add(rq);
+ goto err_fence;
+ }
+ }
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs)) {
+ err = PTR_ERR(cs);
+ i915_request_add(rq);
+ goto err_fence;
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset +
+ sizeof(*sema) * (ARRAY_SIZE(arr) * i + j);
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ /* keep a reference on the newest request only */
+ i915_request_put(fence);
+ fence = i915_request_get(rq);
+
+ i915_request_add(rq);
+ }
+ }
+ i915_request_put(fence);
+ intel_engine_flush_submission(ce->engine);
+
+ /* remove the plug */
+ WRITE_ONCE(sema[0], 1);
+ wmb(); /* flush the update to the cache, and beyond */
+
+ /* i is one past the last pair, so 2 * i - 1 is the final slot written */
+ if (wait_for(READ_ONCE(sema[2 * i - 1]), 500)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ for (i = 1; i <= COUNT; i++)
+ elapsed[i - 1] = (sema[2 * i + 2] - sema[2 * i + 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: context switch latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+
+err_fence:
+ i915_request_put(fence); /* may still be NULL, as in the loop above */
+ WRITE_ONCE(sema[0], 1); /* release the plug */
+ return err;
+}
+
+/* dma-fence callback wrapper that records whether the callback has run */
+struct signal_cb {
+ struct dma_fence_cb base; /* node handed to dma_fence_add_callback() */
+ bool seen; /* set to true by signal_cb() */
+};
+
+/*
+ * dma-fence callback: mark the enclosing struct signal_cb as seen so that
+ * the CPU spinning on ->seen in measure_completion() can proceed. @fence
+ * is unused.
+ */
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+ struct signal_cb *s = container_of(cb, typeof(*s), base);
+
+ smp_store_mb(s->seen, true); /* be safe, be strong */
+}
+
+/*
+ * Measure the latency between the engine finishing a request and the CPU
+ * seeing the request's fence callback.
+ *
+ * Each request stores the register at mmio_base + 0x358 into sema[i],
+ * polls until sema[0] == i and then stores the register into sema[i]
+ * again. Once the first store is visible the CPU, with preemption
+ * disabled, writes i to sema[0], spins until signal_cb() has run and then
+ * reads RING_TIMESTAMP. The sample is that CPU read minus the value the
+ * request stored last.
+ *
+ * NOTE(review): the return value of dma_fence_add_callback() is ignored
+ * and cb lives on the stack; confirm that the callback cannot run after
+ * the -EIO return below.
+ *
+ * Returns 0 on success or a negative error code. If a request does not
+ * start before wait_for() times out, the GT is wedged and -EIO returned.
+ */
+static int measure_completion(struct intel_context *ce)
+{
+ /* scratch dwords in the engine status page, cleared before use */
+ u32 *sema = memset32(ce->engine->status_page.addr + 1000, 0, 21);
+ const u32 offset =
+ i915_ggtt_offset(ce->engine->status_page.vma) +
+ offset_in_page(sema);
+ u32 elapsed[COUNT], cycles;
+ u32 *cs;
+ int i;
+
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
+ struct signal_cb cb = { .seen = false };
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 12);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ /* first store: lets the CPU see that this request is running */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ /* poll until the CPU writes i into sema[0] */
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = i;
+ *cs++ = offset;
+ *cs++ = 0;
+
+ /* second store: the stamp taken just before the request ends */
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = ce->engine->mmio_base + 0x358;
+ *cs++ = offset + sizeof(*sema) * i;
+ *cs++ = 0;
+
+ intel_ring_advance(rq, cs);
+
+ dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
+
+ local_bh_disable();
+ i915_request_add(rq);
+ local_bh_enable();
+
+ if (wait_for(READ_ONCE(sema[i]), 50)) {
+ intel_gt_set_wedged(ce->engine->gt);
+ return -EIO;
+ }
+
+ preempt_disable();
+ WRITE_ONCE(sema[0], i);
+ wmb(); /* make the release visible before spinning on the callback */
+ while (!READ_ONCE(cb.seen))
+ cpu_relax();
+
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
+ preempt_enable();
+ }
+
+ /* CPU stamp (later) minus the stamp stored by the request (earlier) */
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++)
+ elapsed[i] = (elapsed[i] - sema[i + 1]) << COUNT;
+
+ cycles = trifilter(elapsed);
+ pr_info("%s: completion latency %d cycles, %lluns\n",
+ ce->engine->name, cycles >> COUNT,
+ cycles_to_ns(ce->engine, cycles));
+
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
+}
+
+/*
+ * Hold the GT at its maximum frequency while measuring: register as an RPS
+ * waiter, take all forcewake domains and request rps.max_freq under
+ * rps.lock. Undone by rps_unpin().
+ */
+static void rps_pin(struct intel_gt *gt)
+{
+ /* Pin the frequency to max */
+ atomic_inc(&gt->rps.num_waiters);
+ intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
+
+ mutex_lock(&gt->rps.lock);
+ intel_rps_set(&gt->rps, gt->rps.max_freq);
+ mutex_unlock(&gt->rps.lock);
+}
+
+/*
+ * Undo rps_pin(): release the forcewake domains and drop the RPS waiter
+ * count again.
+ */
+static void rps_unpin(struct intel_gt *gt)
+{
+ intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
+ atomic_dec(&gt->rps.num_waiters);
+}
+
+/*
+ * Switch off the engine heartbeat for the duration of the measurements:
+ * zero the heartbeat interval, take an engine-pm reference and park the
+ * heartbeat.
+ *
+ * Returns the previous heartbeat_interval_ms, to be handed back to
+ * engine_heartbeat_enable().
+ */
+static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+{
+ /* read out the old interval, leaving zero in its place */
+ const unsigned long interval_ms =
+ fetch_and_zero(&engine->props.heartbeat_interval_ms);
+
+ intel_engine_pm_get(engine);
+ intel_engine_park_heartbeat(engine);
+
+ return interval_ms;
+}
+
+/*
+ * Undo engine_heartbeat_disable(): drop the engine-pm reference and put
+ * back the heartbeat interval in @saved.
+ */
+static void engine_heartbeat_enable(struct intel_engine_cs *engine,
+ unsigned long saved)
+{
+ intel_engine_pm_put(engine);
+
+ engine->props.heartbeat_interval_ms = saved;
+}
+
+/*
+ * Selftest entry point: run each of the latency measurements in turn on
+ * every uabi engine, using a freshly created and pinned context per
+ * engine. The engine heartbeat is disabled and the GT frequency pinned
+ * while an engine is being measured. Skipped (returns 0) before gen8.
+ *
+ * Returns 0 on success or a negative error code from the first failure;
+ * -EIO if igt_flush_test() fails at the end.
+ */
+static int perf_request_latency(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct intel_engine_cs *engine;
+ struct pm_qos_request qos;
+ int err = 0;
+
+ if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
+ return 0;
+
+ cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
+
+ for_each_uabi_engine(engine, i915) {
+ unsigned long saved_heartbeat;
+ struct intel_context *ce;
+
+ ce = intel_context_create(engine);
+ if (IS_ERR(ce)) {
+ /* report the failure instead of returning success */
+ err = PTR_ERR(ce);
+ goto out;
+ }
+
+ err = intel_context_pin(ce);
+ if (err) {
+ intel_context_put(ce);
+ goto out;
+ }
+
+ saved_heartbeat = engine_heartbeat_disable(engine);
+ rps_pin(engine->gt);
+
+ /* stop at the first measurement that fails */
+ if (err == 0)
+ err = measure_semaphore_response(ce);
+ if (err == 0)
+ err = measure_idle_dispatch(ce);
+ if (err == 0)
+ err = measure_busy_dispatch(ce);
+ if (err == 0)
+ err = measure_inter_request(ce);
+ if (err == 0)
+ err = measure_context_switch(ce);
+ if (err == 0)
+ err = measure_completion(ce);
+
+ rps_unpin(engine->gt);
+ engine_heartbeat_enable(engine, saved_heartbeat);
+
+ intel_context_unpin(ce);
+ intel_context_put(ce);
+ if (err)
+ goto out;
+ }
+
+out:
+ if (igt_flush_test(i915))
+ err = -EIO;
+
+ cpu_latency_qos_remove_request(&qos);
+ return err;
+}
+
static int s_sync0(void *arg)
{
struct perf_series *ps = arg;
@@ -2042,6 +2653,7 @@ static int perf_parallel_engines(void *arg)
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
+ SUBTEST(perf_request_latency),
SUBTEST(perf_series_engines),
SUBTEST(perf_parallel_engines),
};
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 13+ messages in thread
end of thread, other threads:[~2020-05-19 13:08 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-18 8:57 [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
2020-05-18 12:46 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915/selftests: Measure dispatch latency (rev5) Patchwork
2020-05-18 13:20 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2020-05-18 15:13 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
-- strict thread matches above, loose matches on Subject: below --
2020-05-19 13:08 [Intel-gfx] [PATCH] drm/i915/selftests: Measure dispatch latency Chris Wilson
2020-05-18 14:39 [Intel-gfx] [PATCH v2] " Chris Wilson
2020-05-18 16:22 ` [Intel-gfx] [PATCH] " Chris Wilson
2020-05-19 11:41 ` Chris Wilson
2020-05-19 12:47 ` Mika Kuoppala
2020-05-19 12:56 ` Chris Wilson
2020-05-15 15:10 Chris Wilson
2020-05-15 17:32 ` Chris Wilson
2020-05-15 17:58 ` Chris Wilson
2020-05-15 18:02 ` Chris Wilson
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.