* [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-15 12:56 ` Tvrtko Ursulin
  0 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-15 12:56 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and the spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via an MI_STORE_DWORD_IMM from the spin batch, writing
into a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test suite and platforms.
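
For illustration, a minimal usage sketch from a test's point of view follows.
The example() function and the explicit busy-wait loop are illustrative only
and not part of this patch; they just show the intended use of the new
igt_spin_batch_new_poll() constructor and the spin->running pointer:

  #include "igt.h"

  static void example(int i915)
  {
  	igt_spin_t *spin;
  	volatile bool *running;

  	/* Submit the spinner and grab the pointer we can poll on. */
  	spin = igt_spin_batch_new_poll(i915, 0, I915_EXEC_RENDER);
  	running = spin->running;

  	/* Wait until the batch has actually started on the hardware. */
  	while (!*running)
  		;

  	/* The engine is now provably busy; sample the PMU etc. here. */

  	igt_spin_batch_free(i915, spin);
  }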

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c |  99 +++++++++++++++++++++++++++++++----
 lib/igt_dummyload.h |   9 ++++
 tests/perf_pmu.c    | 145 +++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 196 insertions(+), 57 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..0447d2f14d57 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
@@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	execbuf.buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
@@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
 		execbuf.buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		spin->poll_handle = gem_create(fd, 4096);
+		spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+					       0, 4096, PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2 * sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = sizeof(uint32_t);
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset,
+			   I915_GEM_DOMAIN_INSTRUCTION,
+			   I915_GEM_DOMAIN_INSTRUCTION);
+		execbuf.buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -170,14 +210,14 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
 	execbuf.rsvd1 = ctx;
 
-	if (out_fence)
+	if (flags & OUT_FENCE)
 		execbuf.flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
 		execbuf.flags &= ~ENGINE_MASK;
 		execbuf.flags |= engines[i];
 		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
+		if (flags & OUT_FENCE) {
 			int _fd = execbuf.rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
@@ -199,7 +239,7 @@ static int emit_recursive_batch(igt_spin_t *spin,
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +247,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +259,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +293,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +326,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -367,6 +443,11 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 	igt_spin_batch_end(spin);
 	gem_munmap(spin->batch, BATCH_SIZE);
 
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
+
 	gem_close(fd, spin->handle);
 
 	if (spin->out_fence >= 0)
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..7ed93a3884b9 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,8 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..d1b7b23bc646 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 		usleep(batch_duration_ns / 5000);
 }
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	return __igt_spin_batch_new_poll(fd, ctx, flags);
+}
+
+static unsigned long __spin_wait(igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	while (!spin->running);
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void
 single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 {
@@ -195,7 +227,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +283,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +326,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +412,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -413,15 +439,25 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 static void
 __submit_spin_batch(int gem_fd,
+		    igt_spin_t *spin,
 		    struct drm_i915_gem_exec_object2 *obj,
 		    const struct intel_execution_engine2 *e)
 {
 	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
 		.buffers_ptr = to_user_pointer(obj),
 		.flags = e2ring(gem_fd, e),
 	};
 
+	if (spin->running) {
+		obj[0].handle = spin->poll_handle;
+		obj[0].flags = EXEC_OBJECT_ASYNC;
+		obj[1].handle = spin->handle;
+		eb.buffer_count = 2;
+	} else {
+		obj[0].handle = spin->handle;
+		eb.buffer_count = 1;
+	}
+
 	gem_execbuf(gem_fd, &eb);
 }
 
@@ -429,7 +465,7 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_exec_object2 obj[2];
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -438,20 +474,19 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	igt_spin_t *spin = NULL;
 	unsigned int idle_idx, i;
 
+	memset(obj, 0, sizeof(obj));
+
 	i = 0;
 	for_each_engine_class_instance(fd, e_) {
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, obj, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +496,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +527,7 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_exec_object2 obj[2];
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -498,18 +536,17 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	igt_spin_t *spin = NULL;
 	unsigned int i;
 
+	memset(obj, 0, sizeof(obj));
+
 	i = 0;
 	for_each_engine_class_instance(fd, e) {
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, obj, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +556,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +590,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +924,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1288,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1314,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1495,8 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running)
+		*spin->running = 0;
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1559,7 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
+		struct drm_i915_gem_exec_object2 obj[2];
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1572,13 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
+		memset(obj, 0, sizeof(obj));
+		__submit_spin_batch(gem_fd, spin, obj, e); /* record its location */
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
+		obj[0].flags |= EXEC_OBJECT_PINNED;
+		obj[1].flags |= EXEC_OBJECT_PINNED;
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1588,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, obj, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-15 12:56 ` Tvrtko Ursulin
  0 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-15 12:56 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and the spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via an MI_STORE_DWORD_IMM from the spin batch, writing
into a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test suite and platforms.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c |  99 +++++++++++++++++++++++++++++++----
 lib/igt_dummyload.h |   9 ++++
 tests/perf_pmu.c    | 145 +++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 196 insertions(+), 57 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..0447d2f14d57 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
@@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	execbuf.buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
@@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
 		execbuf.buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		spin->poll_handle = gem_create(fd, 4096);
+		spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+					       0, 4096, PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2 * sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = sizeof(uint32_t);
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset,
+			   I915_GEM_DOMAIN_INSTRUCTION,
+			   I915_GEM_DOMAIN_INSTRUCTION);
+		execbuf.buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -170,14 +210,14 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
 	execbuf.rsvd1 = ctx;
 
-	if (out_fence)
+	if (flags & OUT_FENCE)
 		execbuf.flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
 		execbuf.flags &= ~ENGINE_MASK;
 		execbuf.flags |= engines[i];
 		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
+		if (flags & OUT_FENCE) {
 			int _fd = execbuf.rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
@@ -199,7 +239,7 @@ static int emit_recursive_batch(igt_spin_t *spin,
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +247,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +259,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +293,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +326,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -367,6 +443,11 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 	igt_spin_batch_end(spin);
 	gem_munmap(spin->batch, BATCH_SIZE);
 
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
+
 	gem_close(fd, spin->handle);
 
 	if (spin->out_fence >= 0)
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..7ed93a3884b9 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,8 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..d1b7b23bc646 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 		usleep(batch_duration_ns / 5000);
 }
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	return __igt_spin_batch_new_poll(fd, ctx, flags);
+}
+
+static unsigned long __spin_wait(igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	while (!spin->running);
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void
 single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 {
@@ -195,7 +227,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +283,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +326,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +412,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -413,15 +439,25 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 static void
 __submit_spin_batch(int gem_fd,
+		    igt_spin_t *spin,
 		    struct drm_i915_gem_exec_object2 *obj,
 		    const struct intel_execution_engine2 *e)
 {
 	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
 		.buffers_ptr = to_user_pointer(obj),
 		.flags = e2ring(gem_fd, e),
 	};
 
+	if (spin->running) {
+		obj[0].handle = spin->poll_handle;
+		obj[0].flags = EXEC_OBJECT_ASYNC;
+		obj[1].handle = spin->handle;
+		eb.buffer_count = 2;
+	} else {
+		obj[0].handle = spin->handle;
+		eb.buffer_count = 1;
+	}
+
 	gem_execbuf(gem_fd, &eb);
 }
 
@@ -429,7 +465,7 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_exec_object2 obj[2];
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -438,20 +474,19 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	igt_spin_t *spin = NULL;
 	unsigned int idle_idx, i;
 
+	memset(obj, 0, sizeof(obj));
+
 	i = 0;
 	for_each_engine_class_instance(fd, e_) {
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, obj, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +496,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +527,7 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_exec_object2 obj[2];
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -498,18 +536,17 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	igt_spin_t *spin = NULL;
 	unsigned int i;
 
+	memset(obj, 0, sizeof(obj));
+
 	i = 0;
 	for_each_engine_class_instance(fd, e) {
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, obj, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +556,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +590,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +924,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1288,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1314,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1495,8 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running)
+		*spin->running = 0;
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1559,7 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
+		struct drm_i915_gem_exec_object2 obj[2];
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1572,13 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
+		memset(obj, 0, sizeof(obj));
+		__submit_spin_batch(gem_fd, spin, obj, e); /* record its location */
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
+		obj[0].flags |= EXEC_OBJECT_PINNED;
+		obj[1].flags |= EXEC_OBJECT_PINNED;
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1588,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, obj, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-15 13:14   ` Chris Wilson
  -1 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-15 13:14 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Tvrtko Ursulin (2018-03-15 12:56:17)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> More than one test assumes that the spinner is running pretty much
> immediately after we have create or submitted it.
> 
> In actuality there is a variable delay, especially on execlists platforms,
> between submission and spin batch starting to run on the hardware.
> 
> To enable tests which care about this level of timing to account for this,
> we add a new spin batch constructor which provides an output field which
> can be polled to determine when the batch actually started running.
> 
> This is implemented via MI_STOREDW_IMM from the spin batch, writing into
> memory mapped page shared with userspace.
> 
> Using this facility from perf_pmu, where applicable, should improve very
> occasional test fails across the set and platforms.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  lib/igt_dummyload.c |  99 +++++++++++++++++++++++++++++++----
>  lib/igt_dummyload.h |   9 ++++
>  tests/perf_pmu.c    | 145 +++++++++++++++++++++++++++++++++++-----------------
>  3 files changed, 196 insertions(+), 57 deletions(-)
> 
> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
> index 4b20f23dfe26..0447d2f14d57 100644
> --- a/lib/igt_dummyload.c
> +++ b/lib/igt_dummyload.c
> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
>         reloc->write_domain = write_domains;
>  }
>  
> -static int emit_recursive_batch(igt_spin_t *spin,
> -                               int fd, uint32_t ctx, unsigned engine,
> -                               uint32_t dep, bool out_fence)
> +#define OUT_FENCE      (1 << 0)
> +#define POLL_RUN       (1 << 1)
> +
> +static int
> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
> +                    uint32_t dep, unsigned int flags)
>  {
>  #define SCRATCH 0
>  #define BATCH 1
> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin,
>         execbuf.buffer_count++;
>  
>         if (dep) {
> +               igt_assert(!(flags & POLL_RUN));
> +

Challenge left to the reader :)

>                 /* dummy write to dependency */
>                 obj[SCRATCH].handle = dep;
>                 fill_reloc(&relocs[obj[BATCH].relocation_count++],
> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin,
>                            I915_GEM_DOMAIN_RENDER,
>                            I915_GEM_DOMAIN_RENDER);
>                 execbuf.buffer_count++;
> +       } else if (flags & POLL_RUN) {
> +               unsigned int offset;
> +
> +               igt_assert(!dep);
> +
> +               spin->poll_handle = gem_create(fd, 4096);
> +               spin->running = __gem_mmap__wc(fd, spin->poll_handle,
> +                                              0, 4096, PROT_READ | PROT_WRITE);

Use mmap_cpu and gem_set_caching().
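
(For reference, a rough sketch of that suggestion, assuming the existing IGT
wrappers gem_set_caching() and gem_mmap__cpu(); whether this stays coherent
on all platforms is exactly the concern raised later in the thread:)

  	spin->poll_handle = gem_create(fd, 4096);
  	gem_set_caching(fd, spin->poll_handle, I915_CACHING_CACHED);
  	spin->running = gem_mmap__cpu(fd, spin->poll_handle, 0, 4096,
  				      PROT_READ | PROT_WRITE);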

> +               igt_assert(spin->running);
> +               igt_assert_eq(*spin->running, 0);
> +
> +               *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);

Hmm, have we forgotten the (len-2) or is this an unusual command that knows
its own length?

> +
> +               if (gen >= 8) {
> +                       offset = sizeof(uint32_t);
> +                       *batch++ = 0;
> +                       *batch++ = 0;
> +               } else if (gen >= 4) {
> +                       offset = 2 * sizeof(uint32_t);
> +                       *batch++ = 0;
> +                       *batch++ = 0;
> +               } else {
> +                       offset = sizeof(uint32_t);
> +                       batch[-1]--;
> +                       *batch++ = 0;
> +               }
> +
> +               *batch++ = 1;
> +
> +               obj[SCRATCH].handle = spin->poll_handle;
> +               fill_reloc(&relocs[obj[BATCH].relocation_count++],
> +                          spin->poll_handle, offset,
> +                          I915_GEM_DOMAIN_INSTRUCTION,
> +                          I915_GEM_DOMAIN_INSTRUCTION);

DOMAIN_RENDER preferably. You don't need the w/a. Could we not lie about
the write-hazard? Removes the need for EXEC_OBJECT_ASYNC and opens up
the possibility for using different dwords for different engines and then
waiting for all-engines.
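
(Roughly, that all-engines idea could look like the hypothetical sketch below,
where each engine's spinner writes its own dword slot in the shared page; the
per-engine slot layout is an assumption here, not something this patch
implements:)

  	/* Hypothetical: slot i is written by the spinner on engine i. */
  	static void wait_for_all_spinners(volatile uint32_t *running,
  					  unsigned int nengine)
  	{
  		for (unsigned int i = 0; i < nengine; i++)
  			while (!running[i])
  				;
  	}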

> +               execbuf.buffer_count++;

gen4 and gen5 require I915_EXEC_SECURE and a DRM_MASTER fd.

We can just do something like
	if (gen == 4 || gen == 5)
		igt_require(igt_device_set_master(fd) == 0);

> +/**
> + * igt_spin_batch_new_poll:
> + * @fd: open i915 drm file descriptor
> + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
> + *          than 0, execute on all available rings.
> + *
> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
> + * contains the batch's handle that can be waited upon. The returned structure
> + * must be passed to igt_spin_batch_free() for post-processing.
> + *
> + * igt_spin_t->running will containt a pointer which target will change from
> + * zero to one once the spinner actually starts executing on the GPU.
> + *
> + * Returns:
> + * Structure with helper internal state for igt_spin_batch_free().
> + */
> +igt_spin_t *
> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
> +{
> +       igt_spin_t *spin;
> +
> +       igt_require_gem(fd);
> +       igt_require(gem_mmap__has_wc(fd));

igt_require(gem_can_store_dword(fd, engine));

Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses
at least) and some platforms will die (*cough* snb *cough*).

> +
> +       spin = __igt_spin_batch_new_poll(fd, ctx, engine);
> +       igt_assert(gem_bo_busy(fd, spin->handle));
> +
> +       return spin;
> +}

>  igt_spin_t *__igt_spin_batch_new(int fd,
> @@ -55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
>                                      uint32_t ctx,
>                                      unsigned engine);
>  
> +igt_spin_t *__igt_spin_batch_new_poll(int fd,
> +                                      uint32_t ctx,
> +                                      unsigned engine);
> +igt_spin_t *igt_spin_batch_new_poll(int fd,
> +                                   uint32_t ctx,
> +                                   unsigned engine);
> +
>  void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
>  void igt_spin_batch_end(igt_spin_t *spin);
>  void igt_spin_batch_free(int fd, igt_spin_t *spin);
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 19fcc95ffc7f..d1b7b23bc646 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
>                 usleep(batch_duration_ns / 5000);
>  }
>  
> +static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
> +{
> +       return __igt_spin_batch_new_poll(fd, ctx, flags);
> +}
> +
> +static unsigned long __spin_wait(igt_spin_t *spin)
> +{
> +       struct timespec start = { };
> +
> +       igt_nsec_elapsed(&start);
> +
> +       while (!spin->running);

Put ';' on a new line so it's clearly visible.

> +
> +       return igt_nsec_elapsed(&start);
> +}
> +
> +static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
> +{
> +       igt_spin_t *spin = __spin_poll(fd, ctx, flags);
> +
> +       __spin_wait(spin);
> +
> +       return spin;
> +}
> +
> +static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)

spin_sync() has connotations with gem_sync(). gem_sync is wait for end,
but spin_sync is wait_for_start. Maybe spin_wait_for_execute? Nah.

> +{
> +       igt_require_gem(fd);
> +
> +       return __spin_sync(fd, ctx, flags);
> +}

>  static void
>  __submit_spin_batch(int gem_fd,
> +                   igt_spin_t *spin,
>                     struct drm_i915_gem_exec_object2 *obj,
>                     const struct intel_execution_engine2 *e)
>  {
>         struct drm_i915_gem_execbuffer2 eb = {
> -               .buffer_count = 1,
>                 .buffers_ptr = to_user_pointer(obj),
>                 .flags = e2ring(gem_fd, e),
>         };
>  
> +       if (spin->running) {
> +               obj[0].handle = spin->poll_handle;
> +               obj[0].flags = EXEC_OBJECT_ASYNC;
> +               obj[1].handle = spin->handle;
> +               eb.buffer_count = 2;
> +       } else {
> +               obj[0].handle = spin->handle;
> +               eb.buffer_count = 1;
> +       }

obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are
essential. Or else the kernel *will* move spin->poll_handle and then it
is fubar.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-15 13:14   ` Chris Wilson
  0 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-15 13:14 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin

Quoting Tvrtko Ursulin (2018-03-15 12:56:17)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> More than one test assumes that the spinner is running pretty much
> immediately after we have create or submitted it.
> 
> In actuality there is a variable delay, especially on execlists platforms,
> between submission and spin batch starting to run on the hardware.
> 
> To enable tests which care about this level of timing to account for this,
> we add a new spin batch constructor which provides an output field which
> can be polled to determine when the batch actually started running.
> 
> This is implemented via MI_STOREDW_IMM from the spin batch, writing into
> memory mapped page shared with userspace.
> 
> Using this facility from perf_pmu, where applicable, should improve very
> occasional test fails across the set and platforms.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  lib/igt_dummyload.c |  99 +++++++++++++++++++++++++++++++----
>  lib/igt_dummyload.h |   9 ++++
>  tests/perf_pmu.c    | 145 +++++++++++++++++++++++++++++++++++-----------------
>  3 files changed, 196 insertions(+), 57 deletions(-)
> 
> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
> index 4b20f23dfe26..0447d2f14d57 100644
> --- a/lib/igt_dummyload.c
> +++ b/lib/igt_dummyload.c
> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
>         reloc->write_domain = write_domains;
>  }
>  
> -static int emit_recursive_batch(igt_spin_t *spin,
> -                               int fd, uint32_t ctx, unsigned engine,
> -                               uint32_t dep, bool out_fence)
> +#define OUT_FENCE      (1 << 0)
> +#define POLL_RUN       (1 << 1)
> +
> +static int
> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
> +                    uint32_t dep, unsigned int flags)
>  {
>  #define SCRATCH 0
>  #define BATCH 1
> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin,
>         execbuf.buffer_count++;
>  
>         if (dep) {
> +               igt_assert(!(flags & POLL_RUN));
> +

Challenge left to the reader :)

>                 /* dummy write to dependency */
>                 obj[SCRATCH].handle = dep;
>                 fill_reloc(&relocs[obj[BATCH].relocation_count++],
> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin,
>                            I915_GEM_DOMAIN_RENDER,
>                            I915_GEM_DOMAIN_RENDER);
>                 execbuf.buffer_count++;
> +       } else if (flags & POLL_RUN) {
> +               unsigned int offset;
> +
> +               igt_assert(!dep);
> +
> +               spin->poll_handle = gem_create(fd, 4096);
> +               spin->running = __gem_mmap__wc(fd, spin->poll_handle,
> +                                              0, 4096, PROT_READ | PROT_WRITE);

Use mmap_cpu and gem_set_caching().

> +               igt_assert(spin->running);
> +               igt_assert_eq(*spin->running, 0);
> +
> +               *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);

Hmm, have we forgotten the (len-2) or is this an unusual command that knows
its own length?

> +
> +               if (gen >= 8) {
> +                       offset = sizeof(uint32_t);
> +                       *batch++ = 0;
> +                       *batch++ = 0;
> +               } else if (gen >= 4) {
> +                       offset = 2 * sizeof(uint32_t);
> +                       *batch++ = 0;
> +                       *batch++ = 0;
> +               } else {
> +                       offset = sizeof(uint32_t);
> +                       batch[-1]--;
> +                       *batch++ = 0;
> +               }
> +
> +               *batch++ = 1;
> +
> +               obj[SCRATCH].handle = spin->poll_handle;
> +               fill_reloc(&relocs[obj[BATCH].relocation_count++],
> +                          spin->poll_handle, offset,
> +                          I915_GEM_DOMAIN_INSTRUCTION,
> +                          I915_GEM_DOMAIN_INSTRUCTION);

DOMAIN_RENDER preferably. You don't need the w/a. Could we not lie about
the write-hazard? Removes the need for EXEC_OBJECT_ASYNC and opens up
the possibility for using different dwords for different engines and then
waiting for all-engines.

> +               execbuf.buffer_count++;

gen4 and gen5 require I915_EXEC_SECURE and a DRM_MASTER fd.

We can just do something like
	if (gen == 4 || gen == 5)
		igt_require(igt_device_set_master(fd) == 0);

> +/**
> + * igt_spin_batch_new_poll:
> + * @fd: open i915 drm file descriptor
> + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
> + *          than 0, execute on all available rings.
> + *
> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
> + * contains the batch's handle that can be waited upon. The returned structure
> + * must be passed to igt_spin_batch_free() for post-processing.
> + *
> + * igt_spin_t->running will containt a pointer which target will change from
> + * zero to one once the spinner actually starts executing on the GPU.
> + *
> + * Returns:
> + * Structure with helper internal state for igt_spin_batch_free().
> + */
> +igt_spin_t *
> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
> +{
> +       igt_spin_t *spin;
> +
> +       igt_require_gem(fd);
> +       igt_require(gem_mmap__has_wc(fd));

igt_require(gem_can_store_dword(fd, engine));

Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses
at least) and some platforms will die (*cough* snb *cough*).

> +
> +       spin = __igt_spin_batch_new_poll(fd, ctx, engine);
> +       igt_assert(gem_bo_busy(fd, spin->handle));
> +
> +       return spin;
> +}

>  igt_spin_t *__igt_spin_batch_new(int fd,
> @@ -55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
>                                      uint32_t ctx,
>                                      unsigned engine);
>  
> +igt_spin_t *__igt_spin_batch_new_poll(int fd,
> +                                      uint32_t ctx,
> +                                      unsigned engine);
> +igt_spin_t *igt_spin_batch_new_poll(int fd,
> +                                   uint32_t ctx,
> +                                   unsigned engine);
> +
>  void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
>  void igt_spin_batch_end(igt_spin_t *spin);
>  void igt_spin_batch_free(int fd, igt_spin_t *spin);
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 19fcc95ffc7f..d1b7b23bc646 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
>                 usleep(batch_duration_ns / 5000);
>  }
>  
> +static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
> +{
> +       return __igt_spin_batch_new_poll(fd, ctx, flags);
> +}
> +
> +static unsigned long __spin_wait(igt_spin_t *spin)
> +{
> +       struct timespec start = { };
> +
> +       igt_nsec_elapsed(&start);
> +
> +       while (!spin->running);

Put ';' on a new line so it's clearly visible.

> +
> +       return igt_nsec_elapsed(&start);
> +}
> +
> +static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
> +{
> +       igt_spin_t *spin = __spin_poll(fd, ctx, flags);
> +
> +       __spin_wait(spin);
> +
> +       return spin;
> +}
> +
> +static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)

spin_sync() has connotations with gem_sync(). gem_sync is wait for end,
but spin_sync is wait_for_start. Maybe spin_wait_for_execute? Nah.

> +{
> +       igt_require_gem(fd);
> +
> +       return __spin_sync(fd, ctx, flags);
> +}

>  static void
>  __submit_spin_batch(int gem_fd,
> +                   igt_spin_t *spin,
>                     struct drm_i915_gem_exec_object2 *obj,
>                     const struct intel_execution_engine2 *e)
>  {
>         struct drm_i915_gem_execbuffer2 eb = {
> -               .buffer_count = 1,
>                 .buffers_ptr = to_user_pointer(obj),
>                 .flags = e2ring(gem_fd, e),
>         };
>  
> +       if (spin->running) {
> +               obj[0].handle = spin->poll_handle;
> +               obj[0].flags = EXEC_OBJECT_ASYNC;
> +               obj[1].handle = spin->handle;
> +               eb.buffer_count = 2;
> +       } else {
> +               obj[0].handle = spin->handle;
> +               eb.buffer_count = 1;
> +       }

obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are
essential. Or else the kernel *will* move spin->poll_handle and then it
is fubar.
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 13:14   ` Chris Wilson
@ 2018-03-15 13:36     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-15 13:36 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


On 15/03/2018 13:14, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-03-15 12:56:17)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> More than one test assumes that the spinner is running pretty much
>> immediately after we have create or submitted it.
>>
>> In actuality there is a variable delay, especially on execlists platforms,
>> between submission and spin batch starting to run on the hardware.
>>
>> To enable tests which care about this level of timing to account for this,
>> we add a new spin batch constructor which provides an output field which
>> can be polled to determine when the batch actually started running.
>>
>> This is implemented via MI_STOREDW_IMM from the spin batch, writing into
>> memory mapped page shared with userspace.
>>
>> Using this facility from perf_pmu, where applicable, should improve very
>> occasional test fails across the set and platforms.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>>   lib/igt_dummyload.c |  99 +++++++++++++++++++++++++++++++----
>>   lib/igt_dummyload.h |   9 ++++
>>   tests/perf_pmu.c    | 145 +++++++++++++++++++++++++++++++++++-----------------
>>   3 files changed, 196 insertions(+), 57 deletions(-)
>>
>> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
>> index 4b20f23dfe26..0447d2f14d57 100644
>> --- a/lib/igt_dummyload.c
>> +++ b/lib/igt_dummyload.c
>> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
>>          reloc->write_domain = write_domains;
>>   }
>>   
>> -static int emit_recursive_batch(igt_spin_t *spin,
>> -                               int fd, uint32_t ctx, unsigned engine,
>> -                               uint32_t dep, bool out_fence)
>> +#define OUT_FENCE      (1 << 0)
>> +#define POLL_RUN       (1 << 1)
>> +
>> +static int
>> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
>> +                    uint32_t dep, unsigned int flags)
>>   {
>>   #define SCRATCH 0
>>   #define BATCH 1
>> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin,
>>          execbuf.buffer_count++;
>>   
>>          if (dep) {
>> +               igt_assert(!(flags & POLL_RUN));
>> +
> 
> Challenge left to the reader :)

Well, not the reader; whoever gets to need both. :)

>>                  /* dummy write to dependency */
>>                  obj[SCRATCH].handle = dep;
>>                  fill_reloc(&relocs[obj[BATCH].relocation_count++],
>> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin,
>>                             I915_GEM_DOMAIN_RENDER,
>>                             I915_GEM_DOMAIN_RENDER);
>>                  execbuf.buffer_count++;
>> +       } else if (flags & POLL_RUN) {
>> +               unsigned int offset;
>> +
>> +               igt_assert(!dep);
>> +
>> +               spin->poll_handle = gem_create(fd, 4096);
>> +               spin->running = __gem_mmap__wc(fd, spin->poll_handle,
>> +                                              0, 4096, PROT_READ | PROT_WRITE);
> 
> Use mmap_cpu and gem_set_caching().

Wouldn't that get us into coherency issues on some platforms? I keep the 
page mapped for API users to poll on.

> 
>> +               igt_assert(spin->running);
>> +               igt_assert_eq(*spin->running, 0);
>> +
>> +               *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
> 
> Hmm, have we forgot the (len-2) or is this an unusual command that knows
> its own length?

I lifted the code from elsewhere.

> 
>> +
>> +               if (gen >= 8) {
>> +                       offset = sizeof(uint32_t);
>> +                       *batch++ = 0;
>> +                       *batch++ = 0;
>> +               } else if (gen >= 4) {
>> +                       offset = 2 * sizeof(uint32_t);
>> +                       *batch++ = 0;
>> +                       *batch++ = 0;
>> +               } else {
>> +                       offset = sizeof(uint32_t);
>> +                       batch[-1]--;
>> +                       *batch++ = 0;
>> +               }
>> +
>> +               *batch++ = 1;
>> +
>> +               obj[SCRATCH].handle = spin->poll_handle;
>> +               fill_reloc(&relocs[obj[BATCH].relocation_count++],
>> +                          spin->poll_handle, offset,
>> +                          I915_GEM_DOMAIN_INSTRUCTION,
>> +                          I915_GEM_DOMAIN_INSTRUCTION);
> 
> DOMAIN_RENDER preferably. You don't need the w/a. Could we not lie about
> the write-hazard? Removes the need for EXEC_OBJECT_ASYNC and opens up

Can do.

> the possibility for using different dwords for different engines and then
> waiting for all-engines.

Yes, I could even use this, so it's good not to let me be lazy. :)


>> +               execbuf.buffer_count++;
> 
> gen4 and gen5 require I915_EXEC_SECURE and a DRM_MASTER fd.
> 
> We can just do something like
> 	if (gen == 4 || gen == 5)
> 		igt_require(igt_device_set_master(fd) == 0));

Okay.

> 
>> +/**
>> + * igt_spin_batch_new_poll:
>> + * @fd: open i915 drm file descriptor
>> + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
>> + *          than 0, execute on all available rings.
>> + *
>> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
>> + * contains the batch's handle that can be waited upon. The returned structure
>> + * must be passed to igt_spin_batch_free() for post-processing.
>> + *
>> + * igt_spin_t->running will containt a pointer which target will change from
>> + * zero to one once the spinner actually starts executing on the GPU.
>> + *
>> + * Returns:
>> + * Structure with helper internal state for igt_spin_batch_free().
>> + */
>> +igt_spin_t *
>> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
>> +{
>> +       igt_spin_t *spin;
>> +
>> +       igt_require_gem(fd);
>> +       igt_require(gem_mmap__has_wc(fd));
> 
> igt_require(gem_can_store_dword(fd, engine));
> 
> Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses
> at least) and some platforms will die (*cough* snb *cough*).

Grr, that makes it all problematic. Well, maybe not completely; I can 
just fall back to a less accurate method on those platforms.
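
Something along these lines, say (gem_can_store_dword() as you suggest;
the __igt_spin_batch_new() signature is assumed here to be
fd/ctx/engine/dep):

static igt_spin_t *
spin_sync_or_fallback(int fd, uint32_t ctx, unsigned engine)
{
	/* Where MI_STORE_DWORD_IMM is usable, wait for the spinner to
	 * actually start; otherwise keep the old, less accurate behaviour
	 * of assuming it starts immediately after submission. */
	if (gem_can_store_dword(fd, engine))
		return __spin_sync(fd, ctx, engine);

	return __igt_spin_batch_new(fd, ctx, engine, 0);
}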

> 
>> +
>> +       spin = __igt_spin_batch_new_poll(fd, ctx, engine);
>> +       igt_assert(gem_bo_busy(fd, spin->handle));
>> +
>> +       return spin;
>> +}
> 
>>   igt_spin_t *__igt_spin_batch_new(int fd,
>> @@ -55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
>>                                       uint32_t ctx,
>>                                       unsigned engine);
>>   
>> +igt_spin_t *__igt_spin_batch_new_poll(int fd,
>> +                                      uint32_t ctx,
>> +                                      unsigned engine);
>> +igt_spin_t *igt_spin_batch_new_poll(int fd,
>> +                                   uint32_t ctx,
>> +                                   unsigned engine);
>> +
>>   void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
>>   void igt_spin_batch_end(igt_spin_t *spin);
>>   void igt_spin_batch_free(int fd, igt_spin_t *spin);
>> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
>> index 19fcc95ffc7f..d1b7b23bc646 100644
>> --- a/tests/perf_pmu.c
>> +++ b/tests/perf_pmu.c
>> @@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
>>                  usleep(batch_duration_ns / 5000);
>>   }
>>   
>> +static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
>> +{
>> +       return __igt_spin_batch_new_poll(fd, ctx, flags);
>> +}
>> +
>> +static unsigned long __spin_wait(igt_spin_t *spin)
>> +{
>> +       struct timespec start = { };
>> +
>> +       igt_nsec_elapsed(&start);
>> +
>> +       while (!spin->running);
> 
> Put ';' on a new line so it's clearly visible.

Okay.

> 
>> +
>> +       return igt_nsec_elapsed(&start);
>> +}
>> +
>> +static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
>> +{
>> +       igt_spin_t *spin = __spin_poll(fd, ctx, flags);
>> +
>> +       __spin_wait(spin);
>> +
>> +       return spin;
>> +}
>> +
>> +static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
> 
> spin_sync() has connotations with gem_sync(). gem_sync is wait for end,
> but spin_sync is wait_for_start. Maybe spin_wait_for_execute? Nah.
> 
>> +{
>> +       igt_require_gem(fd);
>> +
>> +       return __spin_sync(fd, ctx, flags);
>> +}
> 
>>   static void
>>   __submit_spin_batch(int gem_fd,
>> +                   igt_spin_t *spin,
>>                      struct drm_i915_gem_exec_object2 *obj,
>>                      const struct intel_execution_engine2 *e)
>>   {
>>          struct drm_i915_gem_execbuffer2 eb = {
>> -               .buffer_count = 1,
>>                  .buffers_ptr = to_user_pointer(obj),
>>                  .flags = e2ring(gem_fd, e),
>>          };
>>   
>> +       if (spin->running) {
>> +               obj[0].handle = spin->poll_handle;
>> +               obj[0].flags = EXEC_OBJECT_ASYNC;
>> +               obj[1].handle = spin->handle;
>> +               eb.buffer_count = 2;
>> +       } else {
>> +               obj[0].handle = spin->handle;
>> +               eb.buffer_count = 1;
>> +       }
> 
> obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are
> essential. Or else the kernel *will* move spin->poll_handle and then it
> is fubar.

Why does the caller have to do it? It is providing the obj array, which 
gets populated by the helper and by the kernel. If I add 
EXEC_OBJECT_PINNED to the helper, is there a remaining problem?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 13:36     ` Tvrtko Ursulin
@ 2018-03-15 13:45       ` Chris Wilson
  -1 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-15 13:45 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Tvrtko Ursulin (2018-03-15 13:36:26)
> 
> On 15/03/2018 13:14, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-03-15 12:56:17)
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> More than one test assumes that the spinner is running pretty much
> >> immediately after we have create or submitted it.
> >>
> >> In actuality there is a variable delay, especially on execlists platforms,
> >> between submission and spin batch starting to run on the hardware.
> >>
> >> To enable tests which care about this level of timing to account for this,
> >> we add a new spin batch constructor which provides an output field which
> >> can be polled to determine when the batch actually started running.
> >>
> >> This is implemented via MI_STOREDW_IMM from the spin batch, writing into
> >> memory mapped page shared with userspace.
> >>
> >> Using this facility from perf_pmu, where applicable, should improve very
> >> occasional test fails across the set and platforms.
> >>
> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> >> ---
> >>   lib/igt_dummyload.c |  99 +++++++++++++++++++++++++++++++----
> >>   lib/igt_dummyload.h |   9 ++++
> >>   tests/perf_pmu.c    | 145 +++++++++++++++++++++++++++++++++++-----------------
> >>   3 files changed, 196 insertions(+), 57 deletions(-)
> >>
> >> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
> >> index 4b20f23dfe26..0447d2f14d57 100644
> >> --- a/lib/igt_dummyload.c
> >> +++ b/lib/igt_dummyload.c
> >> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
> >>          reloc->write_domain = write_domains;
> >>   }
> >>   
> >> -static int emit_recursive_batch(igt_spin_t *spin,
> >> -                               int fd, uint32_t ctx, unsigned engine,
> >> -                               uint32_t dep, bool out_fence)
> >> +#define OUT_FENCE      (1 << 0)
> >> +#define POLL_RUN       (1 << 1)
> >> +
> >> +static int
> >> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
> >> +                    uint32_t dep, unsigned int flags)
> >>   {
> >>   #define SCRATCH 0
> >>   #define BATCH 1
> >> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin,
> >>          execbuf.buffer_count++;
> >>   
> >>          if (dep) {
> >> +               igt_assert(!(flags & POLL_RUN));
> >> +
> > 
> > Challenge left to the reader :)
> 
> Well not the reader, whoever gets to need both. :)
> 
> >>                  /* dummy write to dependency */
> >>                  obj[SCRATCH].handle = dep;
> >>                  fill_reloc(&relocs[obj[BATCH].relocation_count++],
> >> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin,
> >>                             I915_GEM_DOMAIN_RENDER,
> >>                             I915_GEM_DOMAIN_RENDER);
> >>                  execbuf.buffer_count++;
> >> +       } else if (flags & POLL_RUN) {
> >> +               unsigned int offset;
> >> +
> >> +               igt_assert(!dep);
> >> +
> >> +               spin->poll_handle = gem_create(fd, 4096);
> >> +               spin->running = __gem_mmap__wc(fd, spin->poll_handle,
> >> +                                              0, 4096, PROT_READ | PROT_WRITE);
> > 
> > Use mmap_cpu and gem_set_caching().
> 
> Wouldn't that get us into coherency issues on some platforms? I keep the 
> page mapped for API users to poll on.

bxt-a? The point of using gem_set_caching() is that it is coherent with
the CPU cache even on !llc via snooping. It's then essentially the same
as how we handle breadcrumbs.

Now admittedly, we should do

if (__gem_set_caching() == 0)
	running = __gem_mmap__wb();
else
	running = __gem_mmap__wc();

The caller need be none the wiser; except that they have to assume the
worst and __sync_synchronize() if they do *running = x themselves.
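
With the arguments filled in, that would be along the lines of (assuming
the usual fd/handle/size pattern for those wrappers):

	spin->poll_handle = gem_create(fd, 4096);

	/* Prefer a snooped, write-back CPU mapping where set_caching is
	 * supported; otherwise fall back to the WC mapping from the patch. */
	if (__gem_set_caching(fd, spin->poll_handle, I915_CACHING_CACHED) == 0)
		spin->running = __gem_mmap__wb(fd, spin->poll_handle, 0, 4096,
					       PROT_READ | PROT_WRITE);
	else
		spin->running = __gem_mmap__wc(fd, spin->poll_handle, 0, 4096,
					       PROT_READ | PROT_WRITE);

	igt_assert(spin->running);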

> >> +               igt_assert(spin->running);
> >> +               igt_assert_eq(*spin->running, 0);
> >> +
> >> +               *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
> > 
> > Hmm, have we forgot the (len-2) or is this an unusual command that knows
> > its own length?
> 
> I lifted the code from elsewhere.

I checked, we have the same bug everywhere or nowhere. :|

> >> +/**
> >> + * igt_spin_batch_new_poll:
> >> + * @fd: open i915 drm file descriptor
> >> + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
> >> + *          than 0, execute on all available rings.
> >> + *
> >> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
> >> + * contains the batch's handle that can be waited upon. The returned structure
> >> + * must be passed to igt_spin_batch_free() for post-processing.
> >> + *
> >> + * igt_spin_t->running will containt a pointer which target will change from
> >> + * zero to one once the spinner actually starts executing on the GPU.
> >> + *
> >> + * Returns:
> >> + * Structure with helper internal state for igt_spin_batch_free().
> >> + */
> >> +igt_spin_t *
> >> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
> >> +{
> >> +       igt_spin_t *spin;
> >> +
> >> +       igt_require_gem(fd);
> >> +       igt_require(gem_mmap__has_wc(fd));
> > 
> > igt_require(gem_can_store_dword(fd, engine));
> > 
> > Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses
> > at least) and some platforms will die (*cough* snb *cough*).
> 
> Grr that makes it all problematic. Well, maybe not completely, I can 
> just fall back to less accurate method on those platforms.

It's only a few, I don't think in the grand scheme of things it's enough
to worry about. We should lose just a few pmu tests on snb.

> >>   static void
> >>   __submit_spin_batch(int gem_fd,
> >> +                   igt_spin_t *spin,
> >>                      struct drm_i915_gem_exec_object2 *obj,
> >>                      const struct intel_execution_engine2 *e)
> >>   {
> >>          struct drm_i915_gem_execbuffer2 eb = {
> >> -               .buffer_count = 1,
> >>                  .buffers_ptr = to_user_pointer(obj),
> >>                  .flags = e2ring(gem_fd, e),
> >>          };
> >>   
> >> +       if (spin->running) {
> >> +               obj[0].handle = spin->poll_handle;
> >> +               obj[0].flags = EXEC_OBJECT_ASYNC;
> >> +               obj[1].handle = spin->handle;
> >> +               eb.buffer_count = 2;
> >> +       } else {
> >> +               obj[0].handle = spin->handle;
> >> +               eb.buffer_count = 1;
> >> +       }
> > 
> > obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are
> > essential. Or else the kernel *will* move spin->poll_handle and then it
> > is fubar.
> 
> Why the caller has to do it? It is providing obj array which gets 
> populated by the helper and by the kernel. If I add EXEC_OBJECT_PINNED 
> to the helper is there a remaining problem?

Yes. The caller needs to ensure that flags = PINNED *and* the offset is
correct. We can't just randomly stuff PINNED in there as that pretty
much guarantees the object will be moved, breaking the implicit
relocations.

As we are making changes to igt_spin_t, one of the ideas was that we put
the obj[] array there (with the offsets and flags set up correctly) so
that we could just feed that in again later without having to worry
about the relocations.
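
As a sketch of that idea (the spin->obj[] and spin->execbuf fields below
are hypothetical, recorded by the constructor with EXEC_OBJECT_PINNED and
the kernel-chosen offsets already filled in):

static void spin_resubmit(int fd, igt_spin_t *spin, unsigned int engine_flags)
{
	spin->execbuf.buffers_ptr = to_user_pointer(spin->obj);
	spin->execbuf.flags = engine_flags;

	/* Replays the original objects as-is; since they are pinned at
	 * their original offsets, no relocations need to be rewritten. */
	gem_execbuf(fd, &spin->execbuf);
}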
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
  (?)
  (?)
@ 2018-03-15 14:03 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-15 14:03 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start
URL   : https://patchwork.freedesktop.org/series/40027/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
5c146fcff2d51ea426efc538599013e887fe456b tests/kms_pipe_crc_basic: Remove legacy crc tests

with latest DRM-Tip kernel build CI_DRM_3933
95626f201526 drm-tip: 2018y-03m-15d-11h-01m-14s UTC integration manifest

No testlist changes.

---- Known issues:

Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                pass       -> INCOMPLETE (fi-snb-2520m) fdo#103713

fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:435s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:439s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:383s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:535s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:295s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:511s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:515s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:504s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:410s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:580s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:515s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:518s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:594s
fi-elk-e7500     total:285  pass:226  dwarn:0   dfail:0   fail:0   skip:59  time:425s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:319s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:539s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:403s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:423s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:477s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:427s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:472s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:470s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:514s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:655s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:442s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:525s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:542s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:506s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:494s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:428s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:447s
fi-snb-2520m     total:242  pass:208  dwarn:0   dfail:0   fail:0   skip:33 
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:410s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1135/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 13:45       ` [Intel-gfx] " Chris Wilson
@ 2018-03-15 14:37         ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-15 14:37 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


On 15/03/2018 13:45, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-03-15 13:36:26)
>>
>> On 15/03/2018 13:14, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2018-03-15 12:56:17)
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> More than one test assumes that the spinner is running pretty much
>>>> immediately after we have create or submitted it.
>>>>
>>>> In actuality there is a variable delay, especially on execlists platforms,
>>>> between submission and spin batch starting to run on the hardware.
>>>>
>>>> To enable tests which care about this level of timing to account for this,
>>>> we add a new spin batch constructor which provides an output field which
>>>> can be polled to determine when the batch actually started running.
>>>>
>>>> This is implemented via MI_STOREDW_IMM from the spin batch, writing into
>>>> memory mapped page shared with userspace.
>>>>
>>>> Using this facility from perf_pmu, where applicable, should improve very
>>>> occasional test fails across the set and platforms.
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>> ---
>>>>    lib/igt_dummyload.c |  99 +++++++++++++++++++++++++++++++----
>>>>    lib/igt_dummyload.h |   9 ++++
>>>>    tests/perf_pmu.c    | 145 +++++++++++++++++++++++++++++++++++-----------------
>>>>    3 files changed, 196 insertions(+), 57 deletions(-)
>>>>
>>>> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
>>>> index 4b20f23dfe26..0447d2f14d57 100644
>>>> --- a/lib/igt_dummyload.c
>>>> +++ b/lib/igt_dummyload.c
>>>> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
>>>>           reloc->write_domain = write_domains;
>>>>    }
>>>>    
>>>> -static int emit_recursive_batch(igt_spin_t *spin,
>>>> -                               int fd, uint32_t ctx, unsigned engine,
>>>> -                               uint32_t dep, bool out_fence)
>>>> +#define OUT_FENCE      (1 << 0)
>>>> +#define POLL_RUN       (1 << 1)
>>>> +
>>>> +static int
>>>> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
>>>> +                    uint32_t dep, unsigned int flags)
>>>>    {
>>>>    #define SCRATCH 0
>>>>    #define BATCH 1
>>>> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin,
>>>>           execbuf.buffer_count++;
>>>>    
>>>>           if (dep) {
>>>> +               igt_assert(!(flags & POLL_RUN));
>>>> +
>>>
>>> Challenge left to the reader :)
>>
>> Well not the reader, whoever gets to need both. :)
>>
>>>>                   /* dummy write to dependency */
>>>>                   obj[SCRATCH].handle = dep;
>>>>                   fill_reloc(&relocs[obj[BATCH].relocation_count++],
>>>> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin,
>>>>                              I915_GEM_DOMAIN_RENDER,
>>>>                              I915_GEM_DOMAIN_RENDER);
>>>>                   execbuf.buffer_count++;
>>>> +       } else if (flags & POLL_RUN) {
>>>> +               unsigned int offset;
>>>> +
>>>> +               igt_assert(!dep);
>>>> +
>>>> +               spin->poll_handle = gem_create(fd, 4096);
>>>> +               spin->running = __gem_mmap__wc(fd, spin->poll_handle,
>>>> +                                              0, 4096, PROT_READ | PROT_WRITE);
>>>
>>> Use mmap_cpu and gem_set_caching().
>>
>> Wouldn't that get us into coherency issues on some platforms? I keep the
>> page mapped for API users to poll on.
> 
> bxt-a? The point of using gem_set_caching() is that it is coherent with
> the CPU cache even on !llc via snooping. It's then essentially the same
> as how we handle breadcrumbs.
> 
> Now admittedly, we should do
> 
> if (__gem_set_caching() == 0)
> 	running = __gem_mmap__wb();
> else
> 	running = __gem_mmap__wc();
> 
> The caller need be known the wiser; except having to assume the worst
> and so __sync_synchronize() if they do *running = x themselves.

Ok.

>>>> +               igt_assert(spin->running);
>>>> +               igt_assert_eq(*spin->running, 0);
>>>> +
>>>> +               *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
>>>
>>> Hmm, have we forgot the (len-2) or is this an unusual command that knows
>>> its own length?
>>
>> I lifted the code from elsewhere.
> 
> I checked, we have the same bug everywhere or nowhere. :|
> 
>>>> +/**
>>>> + * igt_spin_batch_new_poll:
>>>> + * @fd: open i915 drm file descriptor
>>>> + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
>>>> + *          than 0, execute on all available rings.
>>>> + *
>>>> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
>>>> + * contains the batch's handle that can be waited upon. The returned structure
>>>> + * must be passed to igt_spin_batch_free() for post-processing.
>>>> + *
>>>> + * igt_spin_t->running will containt a pointer which target will change from
>>>> + * zero to one once the spinner actually starts executing on the GPU.
>>>> + *
>>>> + * Returns:
>>>> + * Structure with helper internal state for igt_spin_batch_free().
>>>> + */
>>>> +igt_spin_t *
>>>> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
>>>> +{
>>>> +       igt_spin_t *spin;
>>>> +
>>>> +       igt_require_gem(fd);
>>>> +       igt_require(gem_mmap__has_wc(fd));
>>>
>>> igt_require(gem_can_store_dword(fd, engine));
>>>
>>> Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses
>>> at least) and some platforms will die (*cough* snb *cough*).
>>
>> Grr that makes it all problematic. Well, maybe not completely, I can
>> just fall back to less accurate method on those platforms.
> 
> It's only a few, I don't think in the grand scheme of things it's enough
> to worry about. We should lose just a few pmu tests on snb.
> 
>>>>    static void
>>>>    __submit_spin_batch(int gem_fd,
>>>> +                   igt_spin_t *spin,
>>>>                       struct drm_i915_gem_exec_object2 *obj,
>>>>                       const struct intel_execution_engine2 *e)
>>>>    {
>>>>           struct drm_i915_gem_execbuffer2 eb = {
>>>> -               .buffer_count = 1,
>>>>                   .buffers_ptr = to_user_pointer(obj),
>>>>                   .flags = e2ring(gem_fd, e),
>>>>           };
>>>>    
>>>> +       if (spin->running) {
>>>> +               obj[0].handle = spin->poll_handle;
>>>> +               obj[0].flags = EXEC_OBJECT_ASYNC;
>>>> +               obj[1].handle = spin->handle;
>>>> +               eb.buffer_count = 2;
>>>> +       } else {
>>>> +               obj[0].handle = spin->handle;
>>>> +               eb.buffer_count = 1;
>>>> +       }
>>>
>>> obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are
>>> essential. Or else the kernel *will* move spin->poll_handle and then it
>>> is fubar.
>>
>> Why the caller has to do it? It is providing obj array which gets
>> populated by the helper and by the kernel. If I add EXEC_OBJECT_PINNED
>> to the helper is there a remaining problem?
> 
> Yes. The caller needs to ensure that flags = PINNED *and* the offset is
> correct. We can't just randomly stuff PINNED in there as that pretty
> much guarantees the object will be moved, breaking the implicit
> relocations.

Never mind, I was confused; I thought I was always populating the obj array.

> As we are making changes to igt_spin_t, one of the ideas was that we put
> the obj[] array there (with the offsets and flags setup correctly) so
> that we could just feed that in again later without having to worry
> about the relocations.

I tried that before but we couldn't agree on resubmit semantics.

My patch had igt_spin_batch_restart(fd, spin) - so emitting the exact 
same batch, including the dependency. That would actually work well for 
this use case.

So if you are happy with that, I can resurrect that patch, add one more 
to implement stuff from this patch, and rebase perf_pmu changes to follow.

Actually I would change it to igt_spin_batch_restart(fd, spin, engine) - 
so the engine can be changed.
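
At the call sites in perf_pmu it would then read roughly like this
("other" standing in for a second engine, and the helper itself still to
be written):

	igt_spin_t *spin = igt_spin_batch_new_poll(gem_fd, 0,
						   e2ring(gem_fd, e));

	/* ... first pass of the test ... */

	/* Resubmit the identical batch, this time on another engine,
	 * reusing the objects pinned by the first submission. */
	igt_spin_batch_restart(gem_fd, spin, e2ring(gem_fd, other));

	/* ... second pass ... */

	igt_spin_batch_free(gem_fd, spin);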

Regards,

Tvrtko




> -Chris
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 14:37         ` Tvrtko Ursulin
@ 2018-03-15 14:46           ` Chris Wilson
  -1 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-15 14:46 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Tvrtko Ursulin (2018-03-15 14:37:59)
> 
> On 15/03/2018 13:45, Chris Wilson wrote:
> > As we are making changes to igt_spin_t, one of the ideas was that we put
> > the obj[] array there (with the offsets and flags setup correctly) so
> > that we could just feed that in again later without having to worry
> > about the relocations.
> 
> I tried that before but we couldn't agree on resubmit semantics.
> 
> My patch had igt_spin_batch_restart(fd, spin) - so emitting the exact 
> same batch, including the dependency. That would actually work well for 
> this use case.
> 
> So if you are happy with that, I can resurrect that patch, add one more 
> to implement stuff from this patch, and rebase perf_pmu changes to follow.

Honestly, it's best to do it here first, as we will probably take forever
to come up with something we both like and which applies to more test cases.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 14:46           ` Chris Wilson
@ 2018-03-15 14:53             ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-15 14:53 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


On 15/03/2018 14:46, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-03-15 14:37:59)
>>
>> On 15/03/2018 13:45, Chris Wilson wrote:
>>> As we are making changes to igt_spin_t, one of the ideas was that we put
>>> the obj[] array there (with the offsets and flags setup correctly) so
>>> that we could just feed that in again later without having to worry
>>> about the relocations.
>>
>> I tried that before but we couldn't agree on resubmit semantics.
>>
>> My patch had igt_spin_batch_restart(fd, spin) - so emitting the exact
>> same batch, including the dependency. That would actually work well for
>> this use case.
>>
>> So if you are happy with that, I can resurrect that patch, add one more
>> to implement stuff from this patch, and rebase perf_pmu changes to follow.
> 
> Honestly, best to do here first, as we will probably take forever to come
> up with something we both like and applies to more test cases.

Don't quite get what you mean by "best to do here first" - where is
"here"? Still locally to perf_pmu, so no generic spin batch resubmit yet?
But I can cache the obj array under spin_batch_t as a shortcut for the
time being?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 14:53             ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-15 14:58               ` Chris Wilson
  -1 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-15 14:58 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Tvrtko Ursulin (2018-03-15 14:53:08)
> 
> On 15/03/2018 14:46, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-03-15 14:37:59)
> >>
> >> On 15/03/2018 13:45, Chris Wilson wrote:
> >>> As we are making changes to igt_spin_t, one of the ideas was that we put
> >>> the obj[] array there (with the offsets and flags setup correctly) so
> >>> that we could just feed that in again later without having to worry
> >>> about the relocations.
> >>
> >> I tried that before but we couldn't agree on resubmit semantics.
> >>
> >> My patch had igt_spin_batch_restart(fd, spin) - so emitting the exact
> >> same batch, including the dependency. That would actually work well for
> >> this use case.
> >>
> >> So if you are happy with that, I can resurrect that patch, add one more
> >> to implement stuff from this patch, and rebase perf_pmu changes to follow.
> > 
> > Honestly, best to do here first, as we will probably take forever to come
> > up with something we both like and applies to more test cases.
> 
> Don't quite get what you mean by "best to do here first" - where is 
> here?

Fix perf_pmu, then worry about the API. We're still waiting for the KASAN
results; we may have more work ahead of us yet.

> Still locally to perf_pmu so no generic spin batch resubmit yet? 
> But I can cache the obj array under spin_batch_t as a shortcut for time 
> being?

I'd take igt_spin_t.obj[] :) But I don't insist on it; I'd like to get
the wait-for-spin working before tackling the resubmit API. There are a
few other places that either have an open-coded wait-for-submit, or need
one.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH i-g-t v2] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-15 15:46   ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-15 15:46 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and the spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for it,
we add a new spin batch constructor which provides an output field that
can be polled to determine when the batch has actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing
into a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the set and platforms.

v2:
 Chris Wilson:
 * Use cached mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 149 +++++++++++++++++++++++++++++++++++++++++++--------
 lib/igt_dummyload.h  |  11 ++++
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 144 ++++++++++++++++++++++++++++++-------------------
 5 files changed, 230 insertions(+), 77 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..83b9727499ff 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2 * sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = sizeof(uint32_t);
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -367,6 +467,11 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 	igt_spin_batch_end(spin);
 	gem_munmap(spin->batch, BATCH_SIZE);
 
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
+
 	gem_close(fd, spin->handle);
 
 	if (spin->out_fence >= 0)
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..ff2554f91d2a 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,46 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		while (!*((volatile bool *)spin->running))
+			;
+	} else {
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -195,7 +235,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +291,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +334,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +420,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +446,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e);
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +461,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +474,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +489,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +520,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +533,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +546,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +580,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +914,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1278,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1304,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1485,10 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1551,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1563,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1575,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 15:46   ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-15 16:01     ` Chris Wilson
  -1 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-15 16:01 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Tvrtko Ursulin (2018-03-15 15:46:00)
>  static void
> -__submit_spin_batch(int gem_fd,
> -                   struct drm_i915_gem_exec_object2 *obj,
> +__submit_spin_batch(int gem_fd, igt_spin_t *spin,
>                     const struct intel_execution_engine2 *e)
>  {
> -       struct drm_i915_gem_execbuffer2 eb = {
> -               .buffer_count = 1,
> -               .buffers_ptr = to_user_pointer(obj),
> -               .flags = e2ring(gem_fd, e),
> -       };
> +       struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
> +
> +       eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
> +       eb.flags |= e2ring(gem_fd, e);

I'm dubious about keeping spin->execbuf for precisely this reason.
Almost all the time I want to specify a different execution, and not use
the same context, random fences, etc.

However, it does give a convenient way to get buffer_count and
buffers_ptr, but that is all that is valid (imo), or at least it should
be copied from judiciously rather than wholesale.

eb = {
	.buffers_ptr = spin->execbuf.buffers_ptr,
	.buffer_count = spin->execbuf.buffer_count,
	.flags = e2ring(gem_fd, e) | I915_EXEC_NO_RELOC,
};
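
A fuller sketch along those lines (assuming the e2ring() helper and the
execbuf/obj caching from v2; illustrative only):

static void
__submit_spin_batch(int gem_fd, igt_spin_t *spin,
		    const struct intel_execution_engine2 *e)
{
	/* Copy only the fields that stay valid for a resubmit. */
	struct drm_i915_gem_execbuffer2 eb = {
		.buffers_ptr = spin->execbuf.buffers_ptr,
		.buffer_count = spin->execbuf.buffer_count,
		.flags = e2ring(gem_fd, e) | I915_EXEC_NO_RELOC,
	};

	gem_execbuf(gem_fd, &eb);
}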

> @@ -1545,24 +1575,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
>  
>                         igt_nsec_elapsed(&test_start);
>                         do {
> -                               unsigned int target_idle_us, t_busy;
> +                               unsigned int target_idle_us;
> +                               struct timespec start = { };
> +                               unsigned long prep_delay_ns;
>  
>                                 /* Restart the spinbatch. */
> +                               igt_nsec_elapsed(&start);
>                                 __rearm_spin_batch(spin);
> -                               __submit_spin_batch(gem_fd, &obj, e);
> +                               __submit_spin_batch(gem_fd, spin, e);
>  
> -                               /*
> -                                * Note that the submission may be delayed to a
> -                                * tasklet (ksoftirqd) which cannot run until we
> -                                * sleep as we hog the cpu (we are RT).
> -                                */
> +                                /* Wait for batch to start executing. */
> +                               __spin_wait(gem_fd, spin);
> +                               prep_delay_ns = igt_nsec_elapsed(&start);
>  
> -                               t_busy = measured_usleep(busy_us);
> +                               /* PWM busy sleep. */
> +                               memset(&start, 0, sizeof(start));
> +                               igt_nsec_elapsed(&start);

Can just keep using start, it already has the current time from
calculating prep_delay_ns.

> +                               measured_usleep(busy_us);
>                                 igt_spin_batch_end(spin);
> -                               gem_sync(gem_fd, obj.handle);
> +                               gem_sync(gem_fd, spin->handle);
>  
> -                               total_busy_ns += t_busy;
> +                               total_busy_ns += igt_nsec_elapsed(&start);
> +                               total_idle_ns += prep_delay_ns;

Ok.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-15 16:01     ` Chris Wilson
  0 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-15 16:01 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin

Quoting Tvrtko Ursulin (2018-03-15 15:46:00)
>  static void
> -__submit_spin_batch(int gem_fd,
> -                   struct drm_i915_gem_exec_object2 *obj,
> +__submit_spin_batch(int gem_fd, igt_spin_t *spin,
>                     const struct intel_execution_engine2 *e)
>  {
> -       struct drm_i915_gem_execbuffer2 eb = {
> -               .buffer_count = 1,
> -               .buffers_ptr = to_user_pointer(obj),
> -               .flags = e2ring(gem_fd, e),
> -       };
> +       struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
> +
> +       eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
> +       eb.flags |= e2ring(gem_fd, e);

I'm dubious about keeping spin->execbuf for precisely this reason.
Almost all the time I want to specify a different execution, and not use
the same context, random fences, etc.

However, it does give a convenient way to get buffers_count and
buffers_ptr, but that is all that is valid (imo) or at least should be
judiciously copied from rather than wholesale.

eb = {
	.buffers_ptr = spin->execbuf.buffers_ptr,
	.buffer_count = spin->execbuf.buffer_count,
	.flags = e2ring(gem_fd, e) | I915_EXEC_NORELOC,
};

> @@ -1545,24 +1575,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
>  
>                         igt_nsec_elapsed(&test_start);
>                         do {
> -                               unsigned int target_idle_us, t_busy;
> +                               unsigned int target_idle_us;
> +                               struct timespec start = { };
> +                               unsigned long prep_delay_ns;
>  
>                                 /* Restart the spinbatch. */
> +                               igt_nsec_elapsed(&start);
>                                 __rearm_spin_batch(spin);
> -                               __submit_spin_batch(gem_fd, &obj, e);
> +                               __submit_spin_batch(gem_fd, spin, e);
>  
> -                               /*
> -                                * Note that the submission may be delayed to a
> -                                * tasklet (ksoftirqd) which cannot run until we
> -                                * sleep as we hog the cpu (we are RT).
> -                                */
> +                                /* Wait for batch to start executing. */
> +                               __spin_wait(gem_fd, spin);
> +                               prep_delay_ns = igt_nsec_elapsed(&start);
>  
> -                               t_busy = measured_usleep(busy_us);
> +                               /* PWM busy sleep. */
> +                               memset(&start, 0, sizeof(start));
> +                               igt_nsec_elapsed(&start);

Can just keep using start; it already has the current time from
calculating prep_delay_ns.
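
Roughly, a sketch of that simplification (just my reading of the
suggestion, assuming 'start' is left counting from before the re-arm
rather than being reset):

	prep_delay_ns = igt_nsec_elapsed(&start);

	/* PWM busy sleep; 'start' still measures from before the re-arm. */
	measured_usleep(busy_us);
	igt_spin_batch_end(spin);
	gem_sync(gem_fd, spin->handle);

	/* Everything since 'start', minus the prep delay, was busy time. */
	total_busy_ns += igt_nsec_elapsed(&start) - prep_delay_ns;
	total_idle_ns += prep_delay_ns;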

> +                               measured_usleep(busy_us);
>                                 igt_spin_batch_end(spin);
> -                               gem_sync(gem_fd, obj.handle);
> +                               gem_sync(gem_fd, spin->handle);
>  
> -                               total_busy_ns += t_busy;
> +                               total_busy_ns += igt_nsec_elapsed(&start);
> +                               total_idle_ns += prep_delay_ns;

Ok.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: failure for tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (3 preceding siblings ...)
  (?)
@ 2018-03-15 16:35 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-15 16:35 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start
URL   : https://patchwork.freedesktop.org/series/40027/
State : failure

== Summary ==

---- Possible new issues:

Test perf_pmu:
        Subgroup all-busy-check-all:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup all-busy-idle-check-all:
                pass       -> FAIL       (shard-hsw)
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-check-all-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-double-start-rcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-idle-bcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-idle-check-all-bcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-idle-check-all-rcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> FAIL       (shard-snb)
        Subgroup busy-idle-check-all-vecs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-idle-rcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> FAIL       (shard-snb)
        Subgroup busy-idle-vcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-rcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup busy-start-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-check-all-rcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-check-all-vcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-idle-check-all-bcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-idle-check-all-rcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-idle-check-all-vcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-idle-check-all-vecs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup render-node-busy-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup render-node-busy-idle-bcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup render-node-busy-idle-vcs0:
                pass       -> FAIL       (shard-hsw)

---- Known issues:

Test gem_eio:
        Subgroup in-flight-external:
                incomplete -> PASS       (shard-apl) fdo#105341
Test kms_flip:
        Subgroup flip-vs-modeset-vs-hang:
                pass       -> DMESG-WARN (shard-snb) fdo#103821
        Subgroup plain-flip-fb-recreate-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#100368
Test kms_frontbuffer_tracking:
        Subgroup fbc-1p-primscrn-pri-indfb-draw-mmap-cpu:
                pass       -> FAIL       (shard-apl) fdo#103167

fdo#105341 https://bugs.freedesktop.org/show_bug.cgi?id=105341
fdo#103821 https://bugs.freedesktop.org/show_bug.cgi?id=103821
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103167 https://bugs.freedesktop.org/show_bug.cgi?id=103167

shard-apl        total:3353 pass:1763 dwarn:1   dfail:0   fail:8   skip:1580 time:12527s
shard-hsw        total:3442 pass:1755 dwarn:1   dfail:0   fail:14  skip:1671 time:11898s
shard-snb        total:2912 pass:1142 dwarn:2   dfail:0   fail:5   skip:1751 time:4579s
Blacklisted hosts:
shard-kbl        total:2012 pass:1131 dwarn:0   dfail:0   fail:2   skip:877 time:5495s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1135/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [igt-dev] ✗ Fi.CI.BAT: failure for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev2)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (4 preceding siblings ...)
  (?)
@ 2018-03-15 16:49 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-15 16:49 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev2)
URL   : https://patchwork.freedesktop.org/series/40027/
State : failure

== Summary ==

IGT patchset tested on top of latest successful build
1cacca207a71695bc46e994c05a691f89e152492 lib/igt_kms: Fix enum type in igt_pipe_has_prop.

with latest DRM-Tip kernel build CI_DRM_3934
3b4800f0237f drm-tip: 2018y-03m-15d-15h-48m-56s UTC integration manifest

No testlist changes.

---- Possible new issues:

Test gem_exec_suspend:
        Subgroup basic-s4-devices:
                pass       -> INCOMPLETE (fi-skl-6700k2)
Test kms_pipe_crc_basic:
        Subgroup read-crc-pipe-a:
                pass       -> INCOMPLETE (fi-elk-e7500)

---- Known issues:

Test debugfs_test:
        Subgroup read_all_entries:
                incomplete -> PASS       (fi-snb-2520m) fdo#103713
Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                pass       -> FAIL       (fi-gdg-551) fdo#102575

fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:435s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:448s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:383s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:545s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:296s
fi-bxt-dsi       total:285  pass:255  dwarn:0   dfail:0   fail:0   skip:30  time:526s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:514s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:515s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:508s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:408s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:584s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:515s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:538s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:589s
fi-elk-e7500     total:235  pass:185  dwarn:0   dfail:0   fail:0   skip:49 
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:321s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:536s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:404s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:420s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:462s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:428s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:475s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:468s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:515s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:656s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:442s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:531s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:542s
fi-skl-6700k2    total:109  pass:97   dwarn:0   dfail:0   fail:0   skip:11 
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:486s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:438s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:447s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:564s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:402s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1139/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH i-g-t v3] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-15 16:01     ` Chris Wilson
@ 2018-03-16  7:36       ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-16  7:36 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and the spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via an MI_STORE_DWORD_IMM from the spin batch, writing
into a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test set and platforms.

v2:
 Chris Wilson:
 * Use caching mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 149 +++++++++++++++++++++++++++++++++++++++++++--------
 lib/igt_dummyload.h  |  11 ++++
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 144 ++++++++++++++++++++++++++++++-------------------
 5 files changed, 230 insertions(+), 77 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..83b9727499ff 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2 * sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = sizeof(uint32_t);
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -367,6 +467,11 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 	igt_spin_batch_end(spin);
 	gem_munmap(spin->batch, BATCH_SIZE);
 
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
+
 	gem_close(fd, spin->handle);
 
 	if (spin->out_fence >= 0)
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..b9166d88263f 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,46 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		while (!*spin->running)
+			;
+	} else {
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -195,7 +235,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +291,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +334,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +420,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +446,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +461,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +474,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +489,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +520,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +533,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +546,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +580,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +914,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1278,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1304,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1485,10 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1551,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1563,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1575,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] [PATCH i-g-t v3] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-16  7:36       ` Tvrtko Ursulin
  0 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-16  7:36 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and the spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via an MI_STORE_DWORD_IMM from the spin batch, writing
into a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test set and platforms.

v2:
 Chris Wilson:
 * Use caching mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 149 +++++++++++++++++++++++++++++++++++++++++++--------
 lib/igt_dummyload.h  |  11 ++++
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 144 ++++++++++++++++++++++++++++++-------------------
 5 files changed, 230 insertions(+), 77 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..83b9727499ff 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2 * sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = sizeof(uint32_t);
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -367,6 +467,11 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 	igt_spin_batch_end(spin);
 	gem_munmap(spin->batch, BATCH_SIZE);
 
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
+
 	gem_close(fd, spin->handle);
 
 	if (spin->out_fence >= 0)
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..b9166d88263f 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,46 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		while (!*spin->running)
+			;
+	} else {
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -195,7 +235,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +291,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +334,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +420,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +446,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +461,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +474,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +489,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +520,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +533,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +546,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +580,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +914,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1278,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1304,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1485,10 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1551,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1563,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1575,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev3)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (5 preceding siblings ...)
  (?)
@ 2018-03-16  8:02 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-16  8:02 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev3)
URL   : https://patchwork.freedesktop.org/series/40027/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
2e92134b4e4f754239f9721b8781ce2fc0aab07c tests/kms_frontbuffer_tracking: Reduce fbc status spam, v2.

with latest DRM-Tip kernel build CI_DRM_3935
9ac56f76a6a2 drm-tip: 2018y-03m-15d-22h-52m-42s UTC integration manifest

No testlist changes.

---- Known issues:

Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                pass       -> FAIL       (fi-gdg-551) fdo#102575

fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:432s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:442s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:380s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:539s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:299s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:517s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:517s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:504s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:410s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:590s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:512s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:538s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:591s
fi-elk-e7500     total:285  pass:226  dwarn:0   dfail:0   fail:0   skip:59  time:427s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:320s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:540s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:404s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:424s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:474s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:428s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:477s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:467s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:514s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:657s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:437s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:534s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:538s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:510s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:496s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:428s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:443s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:600s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:400s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1144/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: failure for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev3)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (6 preceding siblings ...)
  (?)
@ 2018-03-16  8:52 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-16  8:52 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev3)
URL   : https://patchwork.freedesktop.org/series/40027/
State : failure

== Summary ==

---- Possible new issues:

Test gem_tiled_swapping:
        Subgroup non-threaded:
                skip       -> PASS       (shard-snb)
Test kms_cursor_crc:
        Subgroup cursor-64x64-suspend:
                pass       -> SKIP       (shard-snb)
Test kms_cursor_legacy:
        Subgroup flip-vs-cursor-busy-crc-atomic:
                pass       -> DMESG-FAIL (shard-snb)
Test kms_draw_crc:
        Subgroup draw-method-xrgb2101010-blt-xtiled:
                skip       -> PASS       (shard-snb)
Test perf_pmu:
        Subgroup all-busy-check-all:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-idle-check-all-bcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-idle-check-all-rcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> FAIL       (shard-snb)
        Subgroup busy-idle-check-all-vecs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-idle-rcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-start-rcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-check-all-bcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-check-all-vcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-idle-check-all-bcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-idle-check-all-rcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-idle-check-all-vcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup multi-client-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup render-node-busy-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup render-node-busy-idle-vecs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup render-node-busy-rcs0:
                pass       -> FAIL       (shard-snb)

---- Known issues:

Test gem_eio:
        Subgroup in-flight-external:
                incomplete -> PASS       (shard-apl) fdo#105341
Test gem_exec_suspend:
        Subgroup basic-s3:
                skip       -> PASS       (shard-snb) fdo#104311
Test kms_flip:
        Subgroup 2x-flip-vs-expired-vblank-interruptible:
                fail       -> PASS       (shard-hsw) fdo#102887
        Subgroup 2x-plain-flip-ts-check:
                fail       -> PASS       (shard-hsw) fdo#100368
Test kms_frontbuffer_tracking:
        Subgroup fbc-1p-primscrn-cur-indfb-draw-mmap-gtt:
                pass       -> FAIL       (shard-apl) fdo#101623
Test kms_plane_multiple:
        Subgroup atomic-pipe-a-tiling-x:
                fail       -> PASS       (shard-snb) fdo#103166
Test kms_rotation_crc:
        Subgroup primary-rotation-180:
                fail       -> PASS       (shard-snb) fdo#103925
Test kms_sysfs_edid_timing:
                warn       -> PASS       (shard-apl) fdo#100047
Test prime_vgem:
        Subgroup basic-fence-flip:
                pass       -> FAIL       (shard-apl) fdo#104008

fdo#105341 https://bugs.freedesktop.org/show_bug.cgi?id=105341
fdo#104311 https://bugs.freedesktop.org/show_bug.cgi?id=104311
fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#101623 https://bugs.freedesktop.org/show_bug.cgi?id=101623
fdo#103166 https://bugs.freedesktop.org/show_bug.cgi?id=103166
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925
fdo#100047 https://bugs.freedesktop.org/show_bug.cgi?id=100047
fdo#104008 https://bugs.freedesktop.org/show_bug.cgi?id=104008

shard-apl        total:3442 pass:1813 dwarn:1   dfail:0   fail:9   skip:1619 time:12973s
shard-hsw        total:3442 pass:1760 dwarn:1   dfail:0   fail:9   skip:1671 time:11923s
shard-snb        total:3298 pass:1287 dwarn:1   dfail:1   fail:8   skip:1996 time:6491s
Blacklisted hosts:
shard-kbl        total:2136 pass:1185 dwarn:0   dfail:0   fail:5   skip:944 time:5155s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1144/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH i-g-t v4] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-16  7:36       ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-16 10:17         ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-16 10:17 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing into
a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test set and platforms.
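
As an illustration only (this sketch is not part of the patch; it merely
exercises the igt_spin_batch_new_poll() helper and the igt_spin_t->running
field introduced below, assuming the usual fd/test setup of perf_pmu), a
test that needs the engine to be genuinely busy before sampling could do
roughly:

	igt_spin_t *spin;

	spin = igt_spin_batch_new_poll(fd, 0, I915_EXEC_RENDER);

	/* The batch writes 1 into the shared page before entering its loop. */
	while (!*(volatile bool *)spin->running)
		;

	/* Engine is now really executing the spinner - sample the PMU here. */

	igt_spin_batch_end(spin);
	igt_spin_batch_free(fd, spin);

The __spin_poll()/__spin_wait()/spin_sync() helpers added to perf_pmu.c below
wrap this pattern and additionally warn if the spinner takes unusually long
to start.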

v2:
 Chris Wilson:
 * Use caching mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 149 ++++++++++++++++++++++++++++++++++++-------
 lib/igt_dummyload.h  |  11 ++++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 175 ++++++++++++++++++++++++++++++++++-----------------
 6 files changed, 260 insertions(+), 80 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..83b9727499ff 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2 * sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = sizeof(uint32_t);
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -367,6 +467,11 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 	igt_spin_batch_end(spin);
 	gem_munmap(spin->batch, BATCH_SIZE);
 
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
+
 	gem_close(fd, spin->handle);
 
 	if (spin->out_fence >= 0)
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..27ec305079e1 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		};
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +488,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +516,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +547,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +560,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +573,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +607,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +941,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1305,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1331,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1512,10 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1578,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1590,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1602,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] [PATCH i-g-t v4] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-16 10:17         ` Tvrtko Ursulin
  0 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-16 10:17 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing into
a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test set and platforms.
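
One reviewer-facing note, with an illustrative sketch that is not code from
this patch (fd/ctx/flags stand for whatever the calling test already has): on
engines or platforms where gem_can_store_dword() reports that the poll write
cannot be emitted, the perf_pmu helpers below fall back to the previous
behaviour instead of skipping, conceptually:

	igt_spin_t *spin;

	if (gem_can_store_dword(fd, flags))
		spin = __igt_spin_batch_new_poll(fd, ctx, flags); /* pollable */
	else
		spin = __igt_spin_batch_new(fd, ctx, flags, 0);   /* no poll page */

	if (spin->running) {
		while (!*(volatile bool *)spin->running)
			;			/* wait for the GPU-side write */
	} else {
		usleep(500e3);			/* best effort, as before */
	}

This keeps coverage on old platforms while giving accurate start detection
everywhere the store is possible.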

v2:
 Chris Wilson:
 * Use caching mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 149 ++++++++++++++++++++++++++++++++++++-------
 lib/igt_dummyload.h  |  11 ++++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 175 ++++++++++++++++++++++++++++++++++-----------------
 6 files changed, 260 insertions(+), 80 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..83b9727499ff 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2 * sizeof(uint32_t);
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = sizeof(uint32_t);
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -367,6 +467,11 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 	igt_spin_batch_end(spin);
 	gem_munmap(spin->batch, BATCH_SIZE);
 
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
+
 	gem_close(fd, spin->handle);
 
 	if (spin->out_fence >= 0)
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..27ec305079e1 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		};
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +488,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +516,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +547,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +560,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +573,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +607,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +941,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1305,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1331,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1512,10 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1578,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1590,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1602,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev4)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (7 preceding siblings ...)
  (?)
@ 2018-03-16 11:20 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-16 11:20 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev4)
URL   : https://patchwork.freedesktop.org/series/40027/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
2e92134b4e4f754239f9721b8781ce2fc0aab07c tests/kms_frontbuffer_tracking: Reduce fbc status spam, v2.

with latest DRM-Tip kernel build CI_DRM_3937
f4c0fa0314e0 drm-tip: 2018y-03m-16d-10h-16m-52s UTC integration manifest

No testlist changes.

---- Possible new issues:

Test kms_pipe_crc_basic:
        Subgroup hang-read-crc-pipe-b:
                dmesg-warn -> PASS       (fi-elk-e7500)
        Subgroup hang-read-crc-pipe-c:
                incomplete -> SKIP       (fi-elk-e7500)

---- Known issues:

Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-c:
                incomplete -> PASS       (fi-bxt-dsi) fdo#103927

fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:439s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:446s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:383s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:535s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:298s
fi-bxt-dsi       total:285  pass:255  dwarn:0   dfail:0   fail:0   skip:30  time:514s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:514s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:525s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:510s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:408s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:577s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:509s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:531s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:427s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:317s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:539s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:402s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:426s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:467s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:438s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:476s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:470s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:514s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:656s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:438s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:544s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:540s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:504s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:501s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:426s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:444s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:588s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:408s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1146/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH i-g-t v5] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-16 10:17         ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-16 12:18           ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-16 12:18 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing into
a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test set and platforms.
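
To make the accuracy() changes easier to follow, each busy period in the
calibration/test loop now reduces to roughly the sketch below (not literal
patch code; spin, gem_fd, e and total_idle_ns come from the surrounding
accuracy() loop):

	struct timespec start = { };

	igt_nsec_elapsed(&start);
	__rearm_spin_batch(spin);	      /* resets *spin->running, restores MI_ARB_CHK */
	__submit_spin_batch(gem_fd, spin, e); /* resubmits the cached execbuf on engine e */
	__spin_wait(gem_fd, spin);	      /* returns once the GPU has executed the store */

	total_idle_ns += igt_nsec_elapsed(&start); /* submission latency is booked as idle */

Only after this point does the PWM busy sleep start, so the measured busy
time no longer includes the variable submission-to-execution delay.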

v2:
 Chris Wilson:
 * Use caching mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

v5:
 * Fix relocs and batch munmap. (Chris)
 * Added assert that the idle spinner batch looks as expected.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 152 +++++++++++++++++++++++++++++++++++++-------
 lib/igt_dummyload.h  |  11 ++++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 176 +++++++++++++++++++++++++++++++++++----------------
 6 files changed, 263 insertions(+), 81 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..2462771291a6 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = 1;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = 1;
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -365,7 +465,13 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 		timer_delete(spin->timer);
 
 	igt_spin_batch_end(spin);
-	gem_munmap(spin->batch, BATCH_SIZE);
+	gem_munmap((void *)((unsigned long)spin->batch & (~4095UL)),
+		   BATCH_SIZE);
+
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
 
 	gem_close(fd, spin->handle);
 
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..594e38ddcc07 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!*spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		}
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +488,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +516,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +547,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +560,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +573,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +607,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +941,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1305,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1331,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1512,11 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
+	igt_assert_eq(*spin->batch, MI_BATCH_BUFFER_END);
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1579,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1591,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1603,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] [PATCH i-g-t v5] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-16 12:18           ` Tvrtko Ursulin
  0 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-16 12:18 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing into
a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test set and platforms.

v2:
 Chris Wilson:
 * Use a cached mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

v5:
 * Fix relocs and batch munmap. (Chris)
 * Added an assert that the idle spinner batch looks as expected.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 152 +++++++++++++++++++++++++++++++++++++-------
 lib/igt_dummyload.h  |  11 ++++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 176 +++++++++++++++++++++++++++++++++++----------------
 6 files changed, 263 insertions(+), 81 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..2462771291a6 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = 1;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = 1;
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -365,7 +465,13 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 		timer_delete(spin->timer);
 
 	igt_spin_batch_end(spin);
-	gem_munmap(spin->batch, BATCH_SIZE);
+	gem_munmap((void *)((unsigned long)spin->batch & (~4095UL)),
+		   BATCH_SIZE);
+
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
 
 	gem_close(fd, spin->handle);
 
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..594e38ddcc07 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!*spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		}
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +488,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +516,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +547,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +560,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +573,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +607,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +941,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1305,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1331,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1512,11 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
+	igt_assert_eq(*spin->batch, MI_BATCH_BUFFER_END);
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1517,7 +1579,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1591,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1603,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: failure for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev4)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (8 preceding siblings ...)
  (?)
@ 2018-03-16 12:27 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-16 12:27 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev4)
URL   : https://patchwork.freedesktop.org/series/40027/
State : failure

== Summary ==

---- Possible new issues:

Test gem_tiled_swapping:
        Subgroup non-threaded:
                skip       -> PASS       (shard-snb)
Test kms_cursor_legacy:
        Subgroup cursora-vs-flipa-legacy:
                pass       -> SKIP       (shard-snb)
Test perf_pmu:
        Subgroup all-busy-idle-check-all:
                pass       -> FAIL       (shard-hsw)
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-double-start-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-double-start-rcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-idle-check-all-rcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-idle-no-semaphores-bcs0:
                pass       -> WARN       (shard-hsw)
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-idle-no-semaphores-rcs0:
                pass       -> WARN       (shard-hsw)
                pass       -> DMESG-WARN (shard-snb)
        Subgroup busy-idle-rcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> FAIL       (shard-snb)
        Subgroup busy-idle-vcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-idle-vecs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-no-semaphores-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-rcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-check-all-bcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-check-all-vcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-idle-check-all-bcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup most-busy-idle-check-all-rcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-idle-check-all-vcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup most-busy-idle-check-all-vecs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup render-node-busy-idle-rcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup render-node-busy-idle-vcs0:
                pass       -> FAIL       (shard-hsw)

---- Known issues:

Test gem_workarounds:
        Subgroup suspend-resume-context:
                incomplete -> PASS       (shard-apl) fdo#103375
Test kms_flip:
        Subgroup 2x-flip-vs-expired-vblank:
                fail       -> PASS       (shard-hsw) fdo#102887
        Subgroup 2x-plain-flip-ts-check-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#100368 +1
Test kms_frontbuffer_tracking:
        Subgroup fbc-rgb101010-draw-mmap-wc:
                fail       -> PASS       (shard-apl) fdo#101623
Test kms_plane_multiple:
        Subgroup atomic-pipe-a-tiling-x:
                fail       -> PASS       (shard-snb) fdo#103166
Test kms_rotation_crc:
        Subgroup primary-rotation-180:
                fail       -> PASS       (shard-snb) fdo#103925
Test kms_sysfs_edid_timing:
                pass       -> WARN       (shard-apl) fdo#100047
Test kms_vblank:
        Subgroup pipe-b-ts-continuation-suspend:
                pass       -> SKIP       (shard-snb) fdo#105411

fdo#103375 https://bugs.freedesktop.org/show_bug.cgi?id=103375
fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#101623 https://bugs.freedesktop.org/show_bug.cgi?id=101623
fdo#103166 https://bugs.freedesktop.org/show_bug.cgi?id=103166
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925
fdo#100047 https://bugs.freedesktop.org/show_bug.cgi?id=100047
fdo#105411 https://bugs.freedesktop.org/show_bug.cgi?id=105411

shard-apl        total:3442 pass:1814 dwarn:1   dfail:0   fail:7   skip:1619 time:13038s
shard-hsw        total:3442 pass:1755 dwarn:1   dfail:0   fail:12  skip:1671 time:11982s
shard-snb        total:3171 pass:1245 dwarn:2   dfail:0   fail:4   skip:1912 time:5413s
Blacklisted hosts:
shard-kbl        total:2137 pass:1209 dwarn:0   dfail:0   fail:7   skip:921 time:5877s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1146/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH i-g-t v6] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-16 12:18           ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-16 13:31             ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-16 13:31 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing into
a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the test set and platforms.

v2:
 Chris Wilson:
 * Use a cached mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

v5:
 * Fix relocs and batch munmap. (Chris)
 * Added an assert that the idle spinner batch looks as expected.

v6:
 * Skip accuracy tests when !gem_can_store_dword.
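
 In practice this boils down to a guard of roughly the following shape near
 the top of the accuracy subtest (an illustrative sketch; the tests/perf_pmu.c
 diff below is authoritative and the exact placement may differ):

	igt_require(gem_can_store_dword(gem_fd, e2ring(gem_fd, e)));

 Without MI_STORE_DWORD_IMM support the polling spinner cannot report when it
 has started running, so the accuracy test has nothing reliable to calibrate
 against.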

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 152 ++++++++++++++++++++++++++++++++++++-------
 lib/igt_dummyload.h  |  11 ++++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 179 +++++++++++++++++++++++++++++++++++----------------
 6 files changed, 266 insertions(+), 81 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..2462771291a6 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = 1;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = 1;
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -365,7 +465,13 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 		timer_delete(spin->timer);
 
 	igt_spin_batch_end(spin);
-	gem_munmap(spin->batch, BATCH_SIZE);
+	gem_munmap((void *)((unsigned long)spin->batch & (~4095UL)),
+		   BATCH_SIZE);
+
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
 
 	gem_close(fd, spin->handle);
 
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..87875fb9e06a 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!*spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		};
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +488,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +516,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +547,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +560,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +573,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +607,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +941,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1305,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1331,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1512,11 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
+	igt_assert_eq(*spin->batch, MI_BATCH_BUFFER_END);
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1489,6 +1551,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 	/* Sampling platforms cannot reach the high accuracy criteria. */
 	igt_require(gem_has_execlists(gem_fd));
 
+	/* Need store dword for accurate PWM. */
+	igt_require(gem_can_store_dword(gem_fd, e2ring(gem_fd, e)));
+
 	while (idle_us < 2500) {
 		busy_us *= 2;
 		idle_us *= 2;
@@ -1517,7 +1582,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1594,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1606,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [Intel-gfx] [PATCH i-g-t v6] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-16 13:31             ` Tvrtko Ursulin
  0 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-16 13:31 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and the spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing
into a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the
very occasional test failures seen across the set and platforms.
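
For illustration only (a sketch, not part of the diff below), a test that
must not start sampling before the engine is really busy could use the new
constructor roughly like this, assuming an open i915 fd and the default
context:

    igt_spin_t *spin = igt_spin_batch_new_poll(fd, 0, I915_EXEC_RENDER);

    /* Busy-wait until the GPU writes the "running" flag from the batch. */
    while (!*((volatile bool *)spin->running))
        ;

    /* ... do the timing-sensitive PMU sampling here ... */

    igt_spin_batch_end(spin);
    gem_sync(fd, spin->handle);
    igt_spin_batch_free(fd, spin);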

v2:
 Chris Wilson:
 * Use a cached mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword (see the example
   after this changelog).
 * Added some debug logging.

v5:
 * Fix relocs and batch munmap. (Chris)
 * Added an assert that the idle spinner batch looks as expected.

v6:
 * Skip accuracy tests when !gem_can_store_dword.
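
To illustrate the v4 item above with a made-up flags value (an example of
mine, not taken from the thread): gem_can_store_dword() is now also handed
complete execbuf flags, e.g. spin->execbuf.flags, so the gen6 BSD check has
to compare only the engine selector bits:

    unsigned int flags = I915_EXEC_BSD | I915_EXEC_NO_RELOC;

    /* Old check: I915_EXEC_NO_RELOC survives the ~(3 << 13) mask, the
     * comparison with I915_EXEC_BSD fails and gen6 BSD is not rejected. */
    igt_assert((flags & ~(3 << 13)) != I915_EXEC_BSD);

    /* New check: only the engine selector is compared, so gen6 BSD is
     * correctly reported as unable to store a dword. */
    igt_assert((flags & 0x3f) == I915_EXEC_BSD);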

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 152 ++++++++++++++++++++++++++++++++++++-------
 lib/igt_dummyload.h  |  11 ++++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 179 +++++++++++++++++++++++++++++++++++----------------
 6 files changed, 266 insertions(+), 81 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..2462771291a6 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,16 +74,19 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,62 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = 1;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = 1;
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -167,18 +218,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +250,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +271,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +283,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +317,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +350,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -365,7 +465,13 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 		timer_delete(spin->timer);
 
 	igt_spin_batch_end(spin);
-	gem_munmap(spin->batch, BATCH_SIZE);
+	gem_munmap((void *)((unsigned long)spin->batch & (~4095UL)),
+		   BATCH_SIZE);
+
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
 
 	gem_close(fd, spin->handle);
 
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..87875fb9e06a 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!*spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		};
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +488,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +516,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +547,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +560,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +573,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +607,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +941,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1305,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1331,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1512,11 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
+	igt_assert_eq(*spin->batch, MI_BATCH_BUFFER_END);
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1489,6 +1551,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 	/* Sampling platforms cannot reach the high accuracy criteria. */
 	igt_require(gem_has_execlists(gem_fd));
 
+	/* Need store dword for accurate PWM. */
+	igt_require(gem_can_store_dword(gem_fd, e2ring(gem_fd, e)));
+
 	while (idle_us < 2500) {
 		busy_us *= 2;
 		idle_us *= 2;
@@ -1517,7 +1582,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1594,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1606,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev6)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (9 preceding siblings ...)
  (?)
@ 2018-03-16 14:30 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-16 14:30 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev6)
URL   : https://patchwork.freedesktop.org/series/40027/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
2e92134b4e4f754239f9721b8781ce2fc0aab07c tests/kms_frontbuffer_tracking: Reduce fbc status spam, v2.

with latest DRM-Tip kernel build CI_DRM_3939
b8d045232d3b drm-tip: 2018y-03m-16d-12h-56m-11s UTC integration manifest

No testlist changes.

---- Known issues:

Test kms_flip:
        Subgroup basic-flip-vs-wf_vblank:
                pass       -> FAIL       (fi-cfl-s2) fdo#100368
Test prime_vgem:
        Subgroup basic-fence-flip:
                pass       -> FAIL       (fi-ilk-650) fdo#104008

fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#104008 https://bugs.freedesktop.org/show_bug.cgi?id=104008

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:433s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:440s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:387s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:540s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:300s
fi-bxt-dsi       total:285  pass:255  dwarn:0   dfail:0   fail:0   skip:30  time:515s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:518s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:516s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:504s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:410s
fi-cfl-s2        total:285  pass:258  dwarn:0   dfail:0   fail:1   skip:26  time:578s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:512s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:524s
fi-cnl-y3        total:108  pass:96   dwarn:0   dfail:0   fail:0   skip:11 
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:427s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:317s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:538s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:406s
fi-ilk-650       total:285  pass:224  dwarn:0   dfail:0   fail:1   skip:60  time:423s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:472s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:430s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:476s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:471s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:515s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:654s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:449s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:528s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:540s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:501s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:491s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:428s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:445s
fi-snb-2520m     total:242  pass:208  dwarn:0   dfail:0   fail:0   skip:33 
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:399s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1150/issues.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: failure for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev6)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (10 preceding siblings ...)
  (?)
@ 2018-03-19 10:24 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-19 10:24 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev6)
URL   : https://patchwork.freedesktop.org/series/40027/
State : failure

== Summary ==

---- Possible new issues:

Test kms_cursor_legacy:
        Subgroup flip-vs-cursor-busy-crc-legacy:
                pass       -> FAIL       (shard-snb)
Test perf_pmu:
        Subgroup all-busy-check-all:
                pass       -> FAIL       (shard-snb)
        Subgroup all-busy-idle-check-all:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-check-all-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-double-start-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-double-start-rcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-idle-check-all-bcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-idle-check-all-vecs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-idle-no-semaphores-bcs0:
                pass       -> WARN       (shard-hsw)
                pass       -> INCOMPLETE (shard-snb)
        Subgroup busy-idle-rcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup busy-start-rcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-check-all-bcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-check-all-rcs0:
                pass       -> INCOMPLETE (shard-snb)
        Subgroup most-busy-check-all-vcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup most-busy-idle-check-all-bcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup most-busy-idle-check-all-vcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> INCOMPLETE (shard-snb)
        Subgroup multi-client-rcs0:
                pass       -> FAIL       (shard-snb)
        Subgroup render-node-busy-idle-rcs0:
                pass       -> FAIL       (shard-hsw)
                pass       -> FAIL       (shard-snb)
        Subgroup render-node-busy-idle-vcs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup render-node-busy-idle-vecs0:
                pass       -> FAIL       (shard-hsw)
        Subgroup render-node-busy-rcs0:
                pass       -> DMESG-FAIL (shard-snb)

---- Known issues:

Test gem_fenced_exec_thrash:
        Subgroup 2-spare-fences:
                dmesg-warn -> PASS       (shard-hsw) fdo#102614 +1
Test kms_flip:
        Subgroup 2x-plain-flip-fb-recreate-interruptible:
                fail       -> PASS       (shard-hsw) fdo#100368
Test kms_rotation_crc:
        Subgroup primary-rotation-180:
                pass       -> FAIL       (shard-snb) fdo#103925
        Subgroup sprite-rotation-180:
                fail       -> PASS       (shard-hsw) fdo#105185
Test perf:
        Subgroup blocking:
                pass       -> FAIL       (shard-hsw) fdo#102252
Test prime_vgem:
        Subgroup basic-fence-flip:
                fail       -> PASS       (shard-apl) fdo#104008

fdo#102614 https://bugs.freedesktop.org/show_bug.cgi?id=102614
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925
fdo#105185 https://bugs.freedesktop.org/show_bug.cgi?id=105185
fdo#102252 https://bugs.freedesktop.org/show_bug.cgi?id=102252
fdo#104008 https://bugs.freedesktop.org/show_bug.cgi?id=104008

shard-apl        total:3442 pass:1814 dwarn:1   dfail:0   fail:7   skip:1619 time:12974s
shard-hsw        total:3442 pass:1758 dwarn:1   dfail:0   fail:10  skip:1671 time:11946s
shard-snb        total:3130 pass:1223 dwarn:1   dfail:1   fail:12  skip:1885 time:5818s
Blacklisted hosts:
shard-kbl        total:3437 pass:1935 dwarn:1   dfail:0   fail:10  skip:1490 time:9574s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1150/shards.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH i-g-t v7] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-16 13:31             ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-19 13:56               ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-19 13:56 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and the spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing
into a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the
very occasional test failures seen across the set and platforms.
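
As a reading aid, the store emitted by the spinner (see the igt_dummyload.c
hunk below) lays out roughly as follows per generation; the note on the
reloc argument is my reading of how fill_reloc() is used, not a claim from
the patch:

    /*
     * gen8+ : MI_STORE_DWORD_IMM | addr low | addr high | data (1)
     * gen4-7: MI_STORE_DWORD_IMM | 0        | addr      | data (1)
     * gen2-3: MI_STORE_DWORD_IMM (len - 1)  | addr      | data (1)
     *
     * The value handed to fill_reloc() appears to be the dword index of
     * the address field within the batch, hence 1, 2 and 1 respectively,
     * and gen4/gen5 additionally set I915_EXEC_SECURE on the execbuf.
     */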

v2:
 Chris Wilson:
 * Use a cached mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

v5:
 * Fix relocs and batch munmap. (Chris)
 * Added an assert that the idle spinner batch looks as expected.

v6:
 * Skip accuracy tests when !gem_can_store_dword.

v7:
 * Fix batch recursion reloc address. (With POLL_RUN the poll store is
   emitted before spin->batch is recorded, so spin->batch no longer points
   at the start of the buffer and the recursion reloc offset has to be
   measured from batch_start instead.)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 157 +++++++++++++++++++++++++++++++++++++-------
 lib/igt_dummyload.h  |  11 ++++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 179 +++++++++++++++++++++++++++++++++++----------------
 6 files changed, 269 insertions(+), 83 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..44e771be46ee 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,20 +74,23 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
-	uint32_t *batch;
+	uint32_t *batch, *batch_start;
 	int i;
 
 	nengine = 0;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,63 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
+	batch_start = batch;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = 1;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = 1;
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -146,7 +198,7 @@ static int emit_recursive_batch(igt_spin_t *spin,
 
 	/* recurse */
 	fill_reloc(&relocs[obj[BATCH].relocation_count],
-		   obj[BATCH].handle, (batch - spin->batch) + 1,
+		   obj[BATCH].handle, (batch - batch_start) + 1,
 		   I915_GEM_DOMAIN_COMMAND, 0);
 	if (gen >= 8) {
 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
@@ -167,18 +219,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +251,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +272,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +284,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +318,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +351,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -365,7 +466,13 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 		timer_delete(spin->timer);
 
 	igt_spin_batch_end(spin);
-	gem_munmap(spin->batch, BATCH_SIZE);
+	gem_munmap((void *)((unsigned long)spin->batch & (~4095UL)),
+		   BATCH_SIZE);
+
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
 
 	gem_close(fd, spin->handle);
 
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..87875fb9e06a 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!*(volatile bool *)spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		};
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +488,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +516,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +547,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +560,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +573,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +607,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +941,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1305,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1331,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1512,11 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
+	igt_assert_eq(*spin->batch, MI_BATCH_BUFFER_END);
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1489,6 +1551,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 	/* Sampling platforms cannot reach the high accuracy criteria. */
 	igt_require(gem_has_execlists(gem_fd));
 
+	/* Need store dword for accurate PWM. */
+	igt_require(gem_can_store_dword(gem_fd, e2ring(gem_fd, e)));
+
 	while (idle_us < 2500) {
 		busy_us *= 2;
 		idle_us *= 2;
@@ -1517,7 +1582,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1594,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1606,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [Intel-gfx] [PATCH i-g-t v7] tests/perf_pmu: Improve accuracy by waiting on spinner to start
@ 2018-03-19 13:56               ` Tvrtko Ursulin
  0 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-19 13:56 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and the spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing
into a memory mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the
very occasional test failures seen across the test set and platforms.
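
For illustration, a test using the new constructor would look roughly like
this (a sketch only, not part of the patch; error handling and the actual
PMU plumbing are omitted):

	igt_spin_t *spin = igt_spin_batch_new_poll(fd, 0 /* ctx */,
						   I915_EXEC_RENDER);

	/* Wait until the spinner's store has landed in the shared page. */
	while (!*(volatile bool *)spin->running)
		;

	/* The engine is now known to be busy - sample the PMU here. */

	igt_spin_batch_end(spin);
	igt_spin_batch_free(fd, spin);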

v2:
 Chris Wilson:
 * Use caching mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

v5:
 * Fix relocs and batch munmap. (Chris)
 * Added assert idle spinner batch looks as expected.

v6:
 * Skip accuracy tests when !gem_can_store_dword.

v7:
 * Fix batch recursion reloc address.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 157 +++++++++++++++++++++++++++++++++++++-------
 lib/igt_dummyload.h  |  11 ++++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 179 +++++++++++++++++++++++++++++++++++----------------
 6 files changed, 269 insertions(+), 83 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..44e771be46ee 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,20 +74,23 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
 	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
-	uint32_t *batch;
+	uint32_t *batch, *batch_start;
 	int i;
 
 	nengine = 0;
@@ -101,8 +104,10 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,16 +118,63 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
+	batch_start = batch;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = 1;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = 1;
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
 	spin->batch = batch;
@@ -146,7 +198,7 @@ static int emit_recursive_batch(igt_spin_t *spin,
 
 	/* recurse */
 	fill_reloc(&relocs[obj[BATCH].relocation_count],
-		   obj[BATCH].handle, (batch - spin->batch) + 1,
+		   obj[BATCH].handle, (batch - batch_start) + 1,
 		   I915_GEM_DOMAIN_COMMAND, 0);
 	if (gen >= 8) {
 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
@@ -167,18 +219,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		if (flags & POLL_RUN)
+			igt_require(gem_can_store_dword(fd, execbuf->flags));
+
+		gem_execbuf_wr(fd, execbuf);
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +251,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +272,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +284,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +318,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +351,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -365,7 +466,13 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 		timer_delete(spin->timer);
 
 	igt_spin_batch_end(spin);
-	gem_munmap(spin->batch, BATCH_SIZE);
+	gem_munmap((void *)((unsigned long)spin->batch & (~4095UL)),
+		   BATCH_SIZE);
+
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
 
 	gem_close(fd, spin->handle);
 
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..87875fb9e06a 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!*(volatile bool *)spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		};
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,13 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
 		    const struct intel_execution_engine2 *e)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +488,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +516,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +547,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +560,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +573,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +607,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +941,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1305,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1331,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1512,11 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
+	igt_assert_eq(*spin->batch, MI_BATCH_BUFFER_END);
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1489,6 +1551,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 	/* Sampling platforms cannot reach the high accuracy criteria. */
 	igt_require(gem_has_execlists(gem_fd));
 
+	/* Need store dword for accurate PWM. */
+	igt_require(gem_can_store_dword(gem_fd, e2ring(gem_fd, e)));
+
 	while (idle_us < 2500) {
 		busy_us *= 2;
 		idle_us *= 2;
@@ -1517,7 +1582,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1594,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1606,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH i-g-t v7] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-19 13:56               ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-19 14:02                 ` Chris Wilson
  -1 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-19 14:02 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Tvrtko Ursulin (2018-03-19 13:56:05)
> @@ -146,7 +198,7 @@ static int emit_recursive_batch(igt_spin_t *spin,
>  
>         /* recurse */
>         fill_reloc(&relocs[obj[BATCH].relocation_count],
> -                  obj[BATCH].handle, (batch - spin->batch) + 1,
> +                  obj[BATCH].handle, (batch - batch_start) + 1,

D'oh.

>                    I915_GEM_DOMAIN_COMMAND, 0);
>         if (gen >= 8) {
>                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
> @@ -167,18 +219,23 @@ static int emit_recursive_batch(igt_spin_t *spin,
>         obj[BATCH].relocation_count++;
>         obj[BATCH].relocs_ptr = to_user_pointer(relocs);
>  
> -       execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
> -       execbuf.rsvd1 = ctx;
> +       execbuf->buffers_ptr = to_user_pointer(obj +
> +                                              (2 - execbuf->buffer_count));
> +       execbuf->rsvd1 = ctx;
>  
> -       if (out_fence)
> -               execbuf.flags |= I915_EXEC_FENCE_OUT;
> +       if (flags & OUT_FENCE)
> +               execbuf->flags |= I915_EXEC_FENCE_OUT;
>  
>         for (i = 0; i < nengine; i++) {
> -               execbuf.flags &= ~ENGINE_MASK;
> -               execbuf.flags |= engines[i];
> -               gem_execbuf_wr(fd, &execbuf);
> -               if (out_fence) {
> -                       int _fd = execbuf.rsvd2 >> 32;
> +               execbuf->flags &= ~ENGINE_MASK;
> +               execbuf->flags |= engines[i];
> +
> +               if (flags & POLL_RUN)
> +                       igt_require(gem_can_store_dword(fd, execbuf->flags));

Hmm, we need to lift this. If we abort on a secondary engine, we haven't
yet added ourselves to the list walked by igt_terminate_spin_batches() and
so we'll get GPU hangs.
-Chris
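
For illustration, "lifting" the check here means validating every candidate
engine up front, before the first execbuf is issued, roughly along these
lines (a sketch of the idea only, not the exact code v8 ends up with):

	nengine = 0;
	if (engine == -1) {
		for_each_engine(fd, engine) {
			if (!engine)
				continue;
			if (flags & POLL_RUN)
				igt_require(gem_can_store_dword(fd, engine));
			engines[nengine++] = engine;
		}
	} else {
		gem_require_ring(fd, engine);
		if (flags & POLL_RUN)
			igt_require(gem_can_store_dword(fd, engine));
		engines[nengine++] = engine;
	}
	igt_require(nengine);

That way an unsupported engine skips the test before any spinner has been
submitted, instead of leaving one running behind a failed requirement.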
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH i-g-t v7] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-19 13:56               ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-19 15:29                 ` Chris Wilson
  -1 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-19 15:29 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Tvrtko Ursulin (2018-03-19 13:56:05)
> @@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
>                 if (!gem_has_engine(gem_fd, e_->class, e_->instance))
>                         continue;
>  
> -               if (e == e_) {
> +               if (e == e_)
>                         idle_idx = i;
> -               } else if (spin) {
> -                       __submit_spin_batch(gem_fd, &obj, e_);
> -               } else {
> -                       spin = igt_spin_batch_new(gem_fd, 0,
> -                                                 e2ring(gem_fd, e_), 0);
> -                       obj.handle = spin->handle;
> -               }
> +               else if (spin)
> +                       __submit_spin_batch(gem_fd, spin, e_);
> +               else
> +                       spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));

So this is what is killing snb. We resubmit the spin-batch, with its
MI_STORE_DWORD_IMM intact, onto each ring. Instant machine death for snb
when we reach vcs.

If we tweak the spinner to jump to a location 64bytes past the start, we
can opt out of the MI_STORE_DW when not required. Let me go an cook up a
delta.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH i-g-t v7] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-19 15:29                 ` [igt-dev] [Intel-gfx] " Chris Wilson
@ 2018-03-19 15:33                   ` Chris Wilson
  -1 siblings, 0 replies; 53+ messages in thread
From: Chris Wilson @ 2018-03-19 15:33 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Chris Wilson (2018-03-19 15:29:21)
> Quoting Tvrtko Ursulin (2018-03-19 13:56:05)
> > @@ -443,15 +501,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
> >                 if (!gem_has_engine(gem_fd, e_->class, e_->instance))
> >                         continue;
> >  
> > -               if (e == e_) {
> > +               if (e == e_)
> >                         idle_idx = i;
> > -               } else if (spin) {
> > -                       __submit_spin_batch(gem_fd, &obj, e_);
> > -               } else {
> > -                       spin = igt_spin_batch_new(gem_fd, 0,
> > -                                                 e2ring(gem_fd, e_), 0);
> > -                       obj.handle = spin->handle;
> > -               }
> > +               else if (spin)
> > +                       __submit_spin_batch(gem_fd, spin, e_);
> > +               else
> > +                       spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
> 
> So this is what is killing snb. We resubmit the spin-batch, with its
> MI_STORE_DWORD_IMM intact, onto each ring. Instant machine death for snb
> when we reach vcs.
> 
> If we tweak the spinner to jump to a location 64 bytes past the start, we
> can opt out of the MI_STORE_DW when not required. Let me go and cook up a
> delta.

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index b7a89fd..2a3c3b5 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -84,7 +84,7 @@ emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_relocation_entry relocs[2];
+	struct drm_i915_gem_relocation_entry relocs[2], *r;
 	struct drm_i915_gem_execbuffer2 *execbuf;
 	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
@@ -182,7 +182,7 @@ emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
 		execbuf->buffer_count++;
 	}
 
-	spin->batch = batch;
+	spin->batch = batch_start + 64/sizeof(*batch);
 	spin->handle = obj[BATCH].handle;
 
 	/* Allow ourselves to be preempted */
@@ -202,26 +202,25 @@ emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
 	batch += 1000;
 
 	/* recurse */
-	fill_reloc(&relocs[obj[BATCH].relocation_count],
-		   obj[BATCH].handle, (batch - batch_start) + 1,
-		   I915_GEM_DOMAIN_COMMAND, 0);
+	r = &relocs[obj[BATCH].relocation_count++];
+	r->target_handle = obj[BATCH].handle;
+	r->offset = (batch + 1 - batch_start) * sizeof(*batch);
+	r->read_domains = I915_GEM_DOMAIN_COMMAND;
+	r->delta = 64;
 	if (gen >= 8) {
 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
-		*batch++ = 0;
+		*batch++ = r->delta;
 		*batch++ = 0;
 	} else if (gen >= 6) {
 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
-		*batch++ = 0;
+		*batch++ = r->delta;
 	} else {
 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
-		*batch = 0;
-		if (gen < 4) {
-			*batch |= 1;
-			relocs[obj[BATCH].relocation_count].delta = 1;
-		}
+		if (gen < 4)
+			r->delta |= 1;
+		*batch = r->delta;
 		batch++;
 	}
-	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
 	execbuf->buffers_ptr = to_user_pointer(obj +
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 87875fb..469b9be 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -474,12 +474,14 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 static void
 __submit_spin_batch(int gem_fd, igt_spin_t *spin,
-		    const struct intel_execution_engine2 *e)
+		    const struct intel_execution_engine2 *e,
+		    int offset)
 {
 	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
 
 	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
 	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
+	eb.batch_start_offset += offset;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -504,7 +506,7 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (e == e_)
 			idle_idx = i;
 		else if (spin)
-			__submit_spin_batch(gem_fd, spin, e_);
+			__submit_spin_batch(gem_fd, spin, e_, 64);
 		else
 			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
@@ -561,7 +563,7 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 			continue;
 
 		if (spin)
-			__submit_spin_batch(gem_fd, spin, e);
+			__submit_spin_batch(gem_fd, spin, e, 64);
 		else
 			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
@@ -1613,7 +1615,7 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 				/* Restart the spinbatch. */
 				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, spin, e);
+				__submit_spin_batch(gem_fd, spin, e, 0);
 
 				 /* Wait for batch to start executing. */
 				__spin_wait(gem_fd, spin);
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH i-g-t v8] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-19 15:33                   ` [igt-dev] [Intel-gfx] " Chris Wilson
@ 2018-03-19 16:59                     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-19 16:59 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have create or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STOREDW_IMM from the spin batch, writing into
a memory-mapped page shared with userspace.

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the suite and platforms.
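
For illustration, a minimal sketch of how a test can use the new
constructor (standard IGT test includes assumed; the wrapper function and
the choice of the render ring are only for the example, not part of the
patch):

	static void sample_pmu_when_busy(int gem_fd)
	{
		igt_spin_t *spin;

		spin = igt_spin_batch_new_poll(gem_fd, 0, I915_EXEC_RENDER);

		/* Spinner flips the shared page from 0 to 1 once it runs. */
		while (!*((volatile bool *)spin->running))
			;

		/* ... sample the engine busyness counter here ... */

		igt_spin_batch_end(spin);
		igt_spin_batch_free(gem_fd, spin);
	}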

v2:
 Chris Wilson:
 * Use a cacheable mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

v5:
 * Fix relocs and batch munmap. (Chris)
 * Added an assert that the idle spinner batch looks as expected.

v6:
 * Skip accuracy tests when !gem_can_store_dword.

v7:
 * Fix batch recursion reloc address.

v8:
 Chris Wilson:
 * Pull up gem_can_store_dword check before we start submitting.
 * Build spinner batch in a way we can skip store dword when not
   needed so we can run on SandyBridge.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 192 ++++++++++++++++++++++++++++++++++++++++-----------
 lib/igt_dummyload.h  |  11 +++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 183 +++++++++++++++++++++++++++++++++---------------
 6 files changed, 293 insertions(+), 98 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..ce84628095b5 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,35 +74,48 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
-	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_relocation_entry relocs[2], *r;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
-	uint32_t *batch;
+	uint32_t *batch, *batch_start;
 	int i;
 
 	nengine = 0;
 	if (engine == -1) {
-		for_each_engine(fd, engine)
-			if (engine)
+		for_each_engine(fd, engine) {
+			if (engine) {
+			if (flags & POLL_RUN)
+				igt_require(!(flags & POLL_RUN) ||
+					    gem_can_store_dword(fd, engine));
+
 				engines[nengine++] = engine;
+			}
+		}
 	} else {
 		gem_require_ring(fd, engine);
+		igt_require(!(flags & POLL_RUN) ||
+			    gem_can_store_dword(fd, engine));
 		engines[nengine++] = engine;
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,19 +126,66 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
+	batch_start = batch;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = 1;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = 1;
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
-	spin->batch = batch;
+	spin->batch = batch = batch_start + 64 / sizeof(*batch);
 	spin->handle = obj[BATCH].handle;
 
 	/* Allow ourselves to be preempted */
@@ -145,40 +205,42 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	batch += 1000;
 
 	/* recurse */
-	fill_reloc(&relocs[obj[BATCH].relocation_count],
-		   obj[BATCH].handle, (batch - spin->batch) + 1,
-		   I915_GEM_DOMAIN_COMMAND, 0);
+	r = &relocs[obj[BATCH].relocation_count++];
+	r->target_handle = obj[BATCH].handle;
+	r->offset = (batch + 1 - batch_start) * sizeof(*batch);
+	r->read_domains = I915_GEM_DOMAIN_COMMAND;
+	r->delta = 64;
 	if (gen >= 8) {
 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
-		*batch++ = 0;
+		*batch++ = r->delta;
 		*batch++ = 0;
 	} else if (gen >= 6) {
 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
-		*batch++ = 0;
+		*batch++ = r->delta;
 	} else {
 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
-		*batch = 0;
-		if (gen < 4) {
-			*batch |= 1;
-			relocs[obj[BATCH].relocation_count].delta = 1;
-		}
+		if (gen < 4)
+			r->delta |= 1;
+		*batch = r->delta;
 		batch++;
 	}
-	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		gem_execbuf_wr(fd, execbuf);
+
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +256,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +277,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +289,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +323,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +356,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -340,6 +446,8 @@ void igt_spin_batch_end(igt_spin_t *spin)
 	if (!spin)
 		return;
 
+	igt_assert(*spin->batch == MI_ARB_CHK ||
+		   *spin->batch == MI_BATCH_BUFFER_END);
 	*spin->batch = MI_BATCH_BUFFER_END;
 	__sync_synchronize();
 }
@@ -365,7 +473,13 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 		timer_delete(spin->timer);
 
 	igt_spin_batch_end(spin);
-	gem_munmap(spin->batch, BATCH_SIZE);
+	gem_munmap((void *)((unsigned long)spin->batch & (~4095UL)),
+		   BATCH_SIZE);
+
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
 
 	gem_close(fd, spin->handle);
 
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..469b9becdbac 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (gem_can_store_dword(fd, spin->execbuf.flags)) {
+		unsigned long timeout = 0;
+
+		while (!spin->running) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		};
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,15 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
-		    const struct intel_execution_engine2 *e)
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
+		    const struct intel_execution_engine2 *e,
+		    int offset)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
+	eb.batch_start_offset += offset;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +490,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +503,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_, 64);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +518,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +549,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +562,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e, 64);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +575,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +609,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +943,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1307,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1333,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1514,11 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
+	igt_assert_eq(*spin->batch, MI_BATCH_BUFFER_END);
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1489,6 +1553,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 	/* Sampling platforms cannot reach the high accuracy criteria. */
 	igt_require(gem_has_execlists(gem_fd));
 
+	/* Need store dword for accurate PWM. */
+	igt_require(gem_can_store_dword(gem_fd, e2ring(gem_fd, e)));
+
 	while (idle_us < 2500) {
 		busy_us *= 2;
 		idle_us *= 2;
@@ -1517,7 +1584,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
 		igt_spin_t *spin;
 		int ret;
@@ -1530,12 +1596,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,24 +1608,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned int target_idle_us;
+				struct timespec start = { };
+				unsigned long prep_delay_ns;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e, 0);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				 /* Wait for batch to start executing. */
+				__spin_wait(gem_fd, spin);
+				prep_delay_ns = igt_nsec_elapsed(&start);
 
-				t_busy = measured_usleep(busy_us);
+				/* PWM busy sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
+				gem_sync(gem_fd, spin->handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start);
+				total_idle_ns += prep_delay_ns;
 
+				/* Re-calibrate. */
 				target_idle_us =
 					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
 				total_idle_ns += measured_usleep(target_idle_us);
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev9)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (11 preceding siblings ...)
  (?)
@ 2018-03-19 21:12 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-19 21:12 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev9)
URL   : https://patchwork.freedesktop.org/series/40027/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
b09e979a67817a9b068f841bda81940b9d208850 tools/aubdump: For gen10+ support addresses up to 4GB

with latest DRM-Tip kernel build CI_DRM_3951
260af42eeff0 drm-tip: 2018y-03m-19d-17h-15m-08s UTC integration manifest

No testlist changes.

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:430s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:445s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:382s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:539s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:301s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:512s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:520s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:504s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:409s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:578s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:513s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:524s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:430s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:318s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:541s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:402s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:421s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:473s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:431s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:472s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:467s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:514s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:661s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:442s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:537s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:545s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:502s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:496s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:428s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:444s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:565s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:399s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1161/issues.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [igt-dev] ✓ Fi.CI.IGT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev9)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (12 preceding siblings ...)
  (?)
@ 2018-03-20  0:26 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-20  0:26 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev9)
URL   : https://patchwork.freedesktop.org/series/40027/
State : success

== Summary ==

---- Known issues:

Test kms_flip:
        Subgroup 2x-blocking-wf_vblank:
                pass       -> FAIL       (shard-hsw) fdo#100368 +1
        Subgroup 2x-flip-vs-expired-vblank-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#102887
Test kms_setmode:
        Subgroup basic:
                pass       -> FAIL       (shard-hsw) fdo#99912

fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912

shard-apl        total:3478 pass:1813 dwarn:1   dfail:0   fail:8   skip:1655 time:13142s
shard-hsw        total:3478 pass:1766 dwarn:1   dfail:0   fail:3   skip:1707 time:11776s
shard-snb        total:3478 pass:1358 dwarn:1   dfail:0   fail:2   skip:2117 time:7301s
Blacklisted hosts:
shard-kbl        total:3478 pass:1933 dwarn:7   dfail:0   fail:10  skip:1528 time:9899s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1161/shards.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH i-g-t v9] tests/perf_pmu: Improve accuracy by waiting on spinner to start
  2018-03-19 16:59                     ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-20 13:51                       ` Tvrtko Ursulin
  -1 siblings, 0 replies; 53+ messages in thread
From: Tvrtko Ursulin @ 2018-03-20 13:51 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

More than one test assumes that the spinner is running pretty much
immediately after we have created or submitted it.

In actuality there is a variable delay, especially on execlists platforms,
between submission and spin batch starting to run on the hardware.

To enable tests which care about this level of timing to account for this,
we add a new spin batch constructor which provides an output field which
can be polled to determine when the batch actually started running.

This is implemented via MI_STOREDW_IMM from the spin batch, writing into
a memory-mapped page shared with userspace.
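
Roughly, the CPU side of that shared page is set up like this (condensed
from the library change in this series; the cached-mapping path and error
handling are omitted):

	spin->poll_handle = gem_create(fd, 4096);
	spin->running = __gem_mmap__wc(fd, spin->poll_handle, 0, 4096,
				       PROT_READ | PROT_WRITE);
	igt_assert(spin->running);
	igt_assert_eq(*spin->running, 0); /* the batch later stores 1 here */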

Using this facility from perf_pmu, where applicable, should reduce the very
occasional test failures seen across the suite and platforms.
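
On the test side this allows a synchronous "start and wait" helper along
these lines (condensed from the __spin_poll/__spin_wait helpers added to
perf_pmu.c earlier in the series; the function name is illustrative):

	static igt_spin_t *spin_until_running(int gem_fd, unsigned long engine)
	{
		igt_spin_t *spin;

		if (gem_can_store_dword(gem_fd, engine)) {
			spin = __igt_spin_batch_new_poll(gem_fd, 0, engine);
			while (!*((volatile bool *)spin->running))
				;
		} else {
			spin = __igt_spin_batch_new(gem_fd, 0, engine, 0);
			usleep(500e3); /* best effort without store-dword */
		}

		return spin;
	}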

v2:
 Chris Wilson:
 * Use a cacheable mapping if available.
 * Handle old gens better.
 * Use gem_can_store_dword.
 * Cache exec obj array in spin_batch_t for easier resubmit.

v3:
 * Forgot I915_EXEC_NO_RELOC. (Chris Wilson)

v4:
 * Mask out all non-engine flags in gem_can_store_dword.
 * Added some debug logging.

v5:
 * Fix relocs and batch munmap. (Chris)
 * Added an assert that the idle spinner batch looks as expected.

v6:
 * Skip accuracy tests when !gem_can_store_dword.

v7:
 * Fix batch recursion reloc address.

v8:
 Chris Wilson:
 * Pull up gem_can_store_dword check before we start submitting.
 * Build spinner batch in a way we can skip store dword when not
   needed so we can run on SandyBridge.

v9:
 * Fix wait on spinner.
 * More tweaks to accuracy test.
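
To illustrate the v8 and v9 points above: with v8 the spin loop proper always
starts at offset 64 in the batch, with the optional MI_STORE_DWORD_IMM
preamble occupying the first 64 bytes when polling is requested; the
recursive MI_BATCH_BUFFER_START jumps back to offset 64, and callers can
resubmit with a 64 byte batch_start_offset to skip the preamble, so the same
layout works whether or not the store-dword can be emitted (it cannot on,
e.g., the Sandybridge BSD ring). With v9 the accuracy loop derives the next
idle period from the measured busy fraction instead of a closed-form target;
roughly:

  /* Sketch of the re-calibration step in accuracy() below; the variables
   * are the ones used in the tests/perf_pmu.c hunk. */
  double busy_frac = (double)total_busy_ns /
                     (total_busy_ns + total_idle_ns);
  double error = busy_frac - (double)target_busy_pct / 100.0;

  /* Too busy so far (error > 0) -> idle longer next cycle, and vice versa. */
  target_idle_us = (double)target_idle_us * (1.0 + error);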

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_dummyload.c  | 192 +++++++++++++++++++++++++++++++++++++++----------
 lib/igt_dummyload.h  |  11 +++
 lib/igt_gt.c         |   2 +-
 lib/ioctl_wrappers.c |   2 +-
 lib/ioctl_wrappers.h |   1 +
 tests/perf_pmu.c     | 199 +++++++++++++++++++++++++++++++++++----------------
 6 files changed, 306 insertions(+), 101 deletions(-)

diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
index 4b20f23dfe26..ce84628095b5 100644
--- a/lib/igt_dummyload.c
+++ b/lib/igt_dummyload.c
@@ -74,35 +74,48 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
 	reloc->write_domain = write_domains;
 }
 
-static int emit_recursive_batch(igt_spin_t *spin,
-				int fd, uint32_t ctx, unsigned engine,
-				uint32_t dep, bool out_fence)
+#define OUT_FENCE	(1 << 0)
+#define POLL_RUN	(1 << 1)
+
+static int
+emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
+		     uint32_t dep, unsigned int flags)
 {
 #define SCRATCH 0
 #define BATCH 1
 	const int gen = intel_gen(intel_get_drm_devid(fd));
-	struct drm_i915_gem_exec_object2 obj[2];
-	struct drm_i915_gem_relocation_entry relocs[2];
-	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_relocation_entry relocs[2], *r;
+	struct drm_i915_gem_execbuffer2 *execbuf;
+	struct drm_i915_gem_exec_object2 *obj;
 	unsigned int engines[16];
 	unsigned int nengine;
 	int fence_fd = -1;
-	uint32_t *batch;
+	uint32_t *batch, *batch_start;
 	int i;
 
 	nengine = 0;
 	if (engine == -1) {
-		for_each_engine(fd, engine)
-			if (engine)
+		for_each_engine(fd, engine) {
+			if (engine) {
+			if (flags & POLL_RUN)
+				igt_require(!(flags & POLL_RUN) ||
+					    gem_can_store_dword(fd, engine));
+
 				engines[nengine++] = engine;
+			}
+		}
 	} else {
 		gem_require_ring(fd, engine);
+		igt_require(!(flags & POLL_RUN) ||
+			    gem_can_store_dword(fd, engine));
 		engines[nengine++] = engine;
 	}
 	igt_require(nengine);
 
-	memset(&execbuf, 0, sizeof(execbuf));
-	memset(obj, 0, sizeof(obj));
+	memset(&spin->execbuf, 0, sizeof(spin->execbuf));
+	execbuf = &spin->execbuf;
+	memset(spin->obj, 0, sizeof(spin->obj));
+	obj = spin->obj;
 	memset(relocs, 0, sizeof(relocs));
 
 	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
@@ -113,19 +126,66 @@ static int emit_recursive_batch(igt_spin_t *spin,
 				       	BATCH_SIZE, PROT_WRITE);
 	gem_set_domain(fd, obj[BATCH].handle,
 			I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
-	execbuf.buffer_count++;
+	execbuf->buffer_count++;
+	batch_start = batch;
 
 	if (dep) {
+		igt_assert(!(flags & POLL_RUN));
+
 		/* dummy write to dependency */
 		obj[SCRATCH].handle = dep;
 		fill_reloc(&relocs[obj[BATCH].relocation_count++],
 			   dep, 1020,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
-		execbuf.buffer_count++;
+		execbuf->buffer_count++;
+	} else if (flags & POLL_RUN) {
+		unsigned int offset;
+
+		igt_assert(!dep);
+
+		if (gen == 4 || gen == 5)
+			execbuf->flags |= I915_EXEC_SECURE;
+
+		spin->poll_handle = gem_create(fd, 4096);
+
+		if (__gem_set_caching(fd, spin->poll_handle,
+				      I915_CACHING_CACHED) == 0)
+			spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
+							0, 4096,
+							PROT_READ | PROT_WRITE);
+		else
+			spin->running = __gem_mmap__wc(fd, spin->poll_handle,
+						       0, 4096,
+						       PROT_READ | PROT_WRITE);
+		igt_assert(spin->running);
+		igt_assert_eq(*spin->running, 0);
+
+		*batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
+
+		if (gen >= 8) {
+			offset = 1;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else if (gen >= 4) {
+			offset = 2;
+			*batch++ = 0;
+			*batch++ = 0;
+		} else {
+			offset = 1;
+			batch[-1]--;
+			*batch++ = 0;
+		}
+
+		*batch++ = 1;
+
+		obj[SCRATCH].handle = spin->poll_handle;
+		fill_reloc(&relocs[obj[BATCH].relocation_count++],
+			   spin->poll_handle, offset, 0, 0);
+		execbuf->buffer_count++;
 	}
 
-	spin->batch = batch;
+	spin->batch = batch = batch_start + 64 / sizeof(*batch);
 	spin->handle = obj[BATCH].handle;
 
 	/* Allow ourselves to be preempted */
@@ -145,40 +205,42 @@ static int emit_recursive_batch(igt_spin_t *spin,
 	batch += 1000;
 
 	/* recurse */
-	fill_reloc(&relocs[obj[BATCH].relocation_count],
-		   obj[BATCH].handle, (batch - spin->batch) + 1,
-		   I915_GEM_DOMAIN_COMMAND, 0);
+	r = &relocs[obj[BATCH].relocation_count++];
+	r->target_handle = obj[BATCH].handle;
+	r->offset = (batch + 1 - batch_start) * sizeof(*batch);
+	r->read_domains = I915_GEM_DOMAIN_COMMAND;
+	r->delta = 64;
 	if (gen >= 8) {
 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
-		*batch++ = 0;
+		*batch++ = r->delta;
 		*batch++ = 0;
 	} else if (gen >= 6) {
 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
-		*batch++ = 0;
+		*batch++ = r->delta;
 	} else {
 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
-		*batch = 0;
-		if (gen < 4) {
-			*batch |= 1;
-			relocs[obj[BATCH].relocation_count].delta = 1;
-		}
+		if (gen < 4)
+			r->delta |= 1;
+		*batch = r->delta;
 		batch++;
 	}
-	obj[BATCH].relocation_count++;
 	obj[BATCH].relocs_ptr = to_user_pointer(relocs);
 
-	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
-	execbuf.rsvd1 = ctx;
+	execbuf->buffers_ptr = to_user_pointer(obj +
+					       (2 - execbuf->buffer_count));
+	execbuf->rsvd1 = ctx;
 
-	if (out_fence)
-		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	if (flags & OUT_FENCE)
+		execbuf->flags |= I915_EXEC_FENCE_OUT;
 
 	for (i = 0; i < nengine; i++) {
-		execbuf.flags &= ~ENGINE_MASK;
-		execbuf.flags |= engines[i];
-		gem_execbuf_wr(fd, &execbuf);
-		if (out_fence) {
-			int _fd = execbuf.rsvd2 >> 32;
+		execbuf->flags &= ~ENGINE_MASK;
+		execbuf->flags |= engines[i];
+
+		gem_execbuf_wr(fd, execbuf);
+
+		if (flags & OUT_FENCE) {
+			int _fd = execbuf->rsvd2 >> 32;
 
 			igt_assert(_fd >= 0);
 			if (fence_fd == -1) {
@@ -194,12 +256,20 @@ static int emit_recursive_batch(igt_spin_t *spin,
 		}
 	}
 
+	/* Make it easier for callers to resubmit. */
+
+	obj[BATCH].relocation_count = 0;
+	obj[BATCH].relocs_ptr = 0;
+
+	obj[SCRATCH].flags = EXEC_OBJECT_PINNED;
+	obj[BATCH].flags = EXEC_OBJECT_PINNED;
+
 	return fence_fd;
 }
 
 static igt_spin_t *
 ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
-		      int out_fence)
+		      unsigned int flags)
 {
 	igt_spin_t *spin;
 
@@ -207,7 +277,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 	igt_assert(spin);
 
 	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
-					       out_fence);
+					       flags);
 
 	pthread_mutex_lock(&list_lock);
 	igt_list_add(&spin->link, &spin_list);
@@ -219,7 +289,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
 igt_spin_t *
 __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
+	return ___igt_spin_batch_new(fd, ctx, engine, dep, 0);
 }
 
 /**
@@ -253,7 +323,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
 igt_spin_t *
 __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 {
-	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE);
 }
 
 /**
@@ -286,6 +356,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
 	return spin;
 }
 
+igt_spin_t *
+__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN);
+}
+
+/**
+ * igt_spin_batch_new_poll:
+ * @fd: open i915 drm file descriptor
+ * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
+ *          than 0, execute on all available rings.
+ *
+ * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
+ * contains the batch's handle that can be waited upon. The returned structure
+ * must be passed to igt_spin_batch_free() for post-processing.
+ *
+ * igt_spin_t->running will contain a pointer whose target will change from
+ * zero to one once the spinner actually starts executing on the GPU.
+ *
+ * Returns:
+ * Structure with helper internal state for igt_spin_batch_free().
+ */
+igt_spin_t *
+igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
+{
+	igt_spin_t *spin;
+
+	igt_require_gem(fd);
+	igt_require(gem_mmap__has_wc(fd));
+
+	spin = __igt_spin_batch_new_poll(fd, ctx, engine);
+	igt_assert(gem_bo_busy(fd, spin->handle));
+
+	return spin;
+}
+
 static void notify(union sigval arg)
 {
 	igt_spin_t *spin = arg.sival_ptr;
@@ -340,6 +446,8 @@ void igt_spin_batch_end(igt_spin_t *spin)
 	if (!spin)
 		return;
 
+	igt_assert(*spin->batch == MI_ARB_CHK ||
+		   *spin->batch == MI_BATCH_BUFFER_END);
 	*spin->batch = MI_BATCH_BUFFER_END;
 	__sync_synchronize();
 }
@@ -365,7 +473,13 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin)
 		timer_delete(spin->timer);
 
 	igt_spin_batch_end(spin);
-	gem_munmap(spin->batch, BATCH_SIZE);
+	gem_munmap((void *)((unsigned long)spin->batch & (~4095UL)),
+		   BATCH_SIZE);
+
+	if (spin->running) {
+		gem_munmap(spin->running, 4096);
+		gem_close(fd, spin->poll_handle);
+	}
 
 	gem_close(fd, spin->handle);
 
diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
index 4103e4ab9e36..3103935a309b 100644
--- a/lib/igt_dummyload.h
+++ b/lib/igt_dummyload.h
@@ -36,6 +36,10 @@ typedef struct igt_spin {
 	struct igt_list link;
 	uint32_t *batch;
 	int out_fence;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	uint32_t poll_handle;
+	bool *running;
 } igt_spin_t;
 
 igt_spin_t *__igt_spin_batch_new(int fd,
@@ -55,6 +59,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
 				     uint32_t ctx,
 				     unsigned engine);
 
+igt_spin_t *__igt_spin_batch_new_poll(int fd,
+				       uint32_t ctx,
+				       unsigned engine);
+igt_spin_t *igt_spin_batch_new_poll(int fd,
+				    uint32_t ctx,
+				    unsigned engine);
+
 void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
 void igt_spin_batch_end(igt_spin_t *spin);
 void igt_spin_batch_free(int fd, igt_spin_t *spin);
diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index 01aebc670862..4569fd36bd85 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -609,7 +609,7 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 	if (gen == 3 && (info->is_grantsdale || info->is_alviso))
 		return false; /* only supports physical addresses */
 
-	if (gen == 6 && (engine & ~(3<<13)) == I915_EXEC_BSD)
+	if (gen == 6 && ((engine & 0x3f) == I915_EXEC_BSD))
 		return false; /* kills the machine! */
 
 	if (info->is_broadwater)
diff --git a/lib/ioctl_wrappers.c b/lib/ioctl_wrappers.c
index 8748cfcfc04f..4e1a08bf06b4 100644
--- a/lib/ioctl_wrappers.c
+++ b/lib/ioctl_wrappers.c
@@ -198,7 +198,7 @@ void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride)
 	igt_assert(__gem_set_tiling(fd, handle, tiling, stride) == 0);
 }
 
-static int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching)
 {
 	struct drm_i915_gem_caching arg;
 	int err;
diff --git a/lib/ioctl_wrappers.h b/lib/ioctl_wrappers.h
index 13fbe3c103c0..b966f72c90a8 100644
--- a/lib/ioctl_wrappers.h
+++ b/lib/ioctl_wrappers.h
@@ -61,6 +61,7 @@ bool gem_get_tiling(int fd, uint32_t handle, uint32_t *tiling, uint32_t *swizzle
 void gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 int __gem_set_tiling(int fd, uint32_t handle, uint32_t tiling, uint32_t stride);
 
+int __gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 void gem_set_caching(int fd, uint32_t handle, uint32_t caching);
 uint32_t gem_get_caching(int fd, uint32_t handle);
 uint32_t gem_flink(int fd, uint32_t handle);
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 19fcc95ffc7f..5eaaaecab399 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -170,6 +170,56 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define FLAG_LONG (16)
 #define FLAG_HANG (32)
 
+static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
+{
+	if (gem_can_store_dword(fd, flags))
+		return __igt_spin_batch_new_poll(fd, ctx, flags);
+	else
+		return __igt_spin_batch_new(fd, ctx, flags, 0);
+}
+
+static unsigned long __spin_wait(int fd, igt_spin_t *spin)
+{
+	struct timespec start = { };
+
+	igt_nsec_elapsed(&start);
+
+	if (spin->running) {
+		unsigned long timeout = 0;
+
+		while (!*((volatile bool *)spin->running)) {
+			unsigned long t = igt_nsec_elapsed(&start);
+
+			if ((t - timeout) > 250e6) {
+				timeout = t;
+				igt_warn("Spinner not running after %.2fms\n",
+					 (double)t / 1e6);
+			}
+		}
+	} else {
+		igt_debug("__spin_wait - usleep mode\n");
+		usleep(500e3); /* Better than nothing! */
+	}
+
+	return igt_nsec_elapsed(&start);
+}
+
+static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_spin_t *spin = __spin_poll(fd, ctx, flags);
+
+	__spin_wait(fd, spin);
+
+	return spin;
+}
+
+static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
+{
+	igt_require_gem(fd);
+
+	return __spin_sync(fd, ctx, flags);
+}
+
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
 	if (!spin)
@@ -180,8 +230,25 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 	if (flags & FLAG_SYNC)
 		gem_sync(fd, spin->handle);
 
-	if (flags & TEST_TRAILING_IDLE)
-		usleep(batch_duration_ns / 5000);
+	if (flags & TEST_TRAILING_IDLE) {
+		unsigned long t, timeout = 0;
+		struct timespec start = { };
+
+		igt_nsec_elapsed(&start);
+
+		do {
+			t = igt_nsec_elapsed(&start);
+
+			if (gem_bo_busy(fd, spin->handle) &&
+			    (t - timeout) > 10e6) {
+				timeout = t;
+				igt_warn("Spinner not idle after %.2fms\n",
+					 (double)t / 1e6);
+			}
+
+			usleep(1e3);
+		} while (t < batch_duration_ns / 5);
+	}
 }
 
 static void
@@ -195,7 +262,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -251,13 +318,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	sleep(2);
 
-	spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-
-	/*
-	 * Sleep for a bit after making the engine busy to make sure the PMU
-	 * gets enabled when the batch is already running.
-	 */
-	usleep(500e3);
+	spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
@@ -300,7 +361,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e)
 	 * re-submission in execlists mode. Make sure busyness is correctly
 	 * reported with the engine busy, and after the engine went idle.
 	 */
-	spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	usleep(500e3);
 	spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);
 
@@ -386,7 +447,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 
 	igt_assert_eq(i, num_engines);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -412,15 +473,15 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 }
 
 static void
-__submit_spin_batch(int gem_fd,
-		    struct drm_i915_gem_exec_object2 *obj,
-		    const struct intel_execution_engine2 *e)
+__submit_spin_batch(int gem_fd, igt_spin_t *spin,
+		    const struct intel_execution_engine2 *e,
+		    int offset)
 {
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = to_user_pointer(obj),
-		.flags = e2ring(gem_fd, e),
-	};
+	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
+
+	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
+	eb.flags |= e2ring(gem_fd, e) | I915_EXEC_NO_RELOC;
+	eb.batch_start_offset += offset;
 
 	gem_execbuf(gem_fd, &eb);
 }
@@ -429,7 +490,6 @@ static void
 most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		    const unsigned int num_engines, unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e_;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -443,15 +503,12 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
 			continue;
 
-		if (e == e_) {
+		if (e == e_)
 			idle_idx = i;
-		} else if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e_);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e_), 0);
-			obj.handle = spin->handle;
-		}
+		else if (spin)
+			__submit_spin_batch(gem_fd, spin, e_, 64);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance);
 	}
@@ -461,6 +518,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -489,7 +549,6 @@ static void
 all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		   unsigned int flags)
 {
-	struct drm_i915_gem_exec_object2 obj = {};
 	const struct intel_execution_engine2 *e;
 	uint64_t tval[2][num_engines];
 	uint64_t val[num_engines];
@@ -503,13 +562,10 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 		if (!gem_has_engine(gem_fd, e->class, e->instance))
 			continue;
 
-		if (spin) {
-			__submit_spin_batch(gem_fd, &obj, e);
-		} else {
-			spin = igt_spin_batch_new(gem_fd, 0,
-						  e2ring(gem_fd, e), 0);
-			obj.handle = spin->handle;
-		}
+		if (spin)
+			__submit_spin_batch(gem_fd, spin, e, 64);
+		else
+			spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e));
 
 		val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance);
 	}
@@ -519,6 +575,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
 	for (i = 0; i < num_engines; i++)
 		fd[i] = open_group(val[i], fd[0]);
 
+	/* Small delay to allow engines to start. */
+	usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3);
+
 	pmu_read_multi(fd[0], num_engines, tval[0]);
 	slept = measured_usleep(batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
@@ -550,7 +609,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
 
 	if (flags & TEST_BUSY)
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 	else
 		spin = NULL;
 
@@ -884,7 +943,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	 */
 	fd[1] = open_pmu(config);
 
-	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 
 	val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]);
 	slept[1] = measured_usleep(batch_duration_ns / 1000);
@@ -1248,7 +1307,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq);
 
 	gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1274,7 +1333,7 @@ test_frequency(int gem_fd)
 	igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq);
 
 	gem_quiescent_gpu(gem_fd);
-	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER);
 
 	slept = pmu_read_multi(fd, 2, start);
 	measured_usleep(batch_duration_ns / 1000);
@@ -1455,6 +1514,11 @@ static void __rearm_spin_batch(igt_spin_t *spin)
 {
 	const uint32_t mi_arb_chk = 0x5 << 23;
 
+	if (spin->running) {
+		igt_assert(*spin->running);
+		*spin->running = 0;
+	}
+	igt_assert_eq(*spin->batch, MI_BATCH_BUFFER_END);
        *spin->batch = mi_arb_chk;
        __sync_synchronize();
 }
@@ -1489,6 +1553,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 	/* Sampling platforms cannot reach the high accuracy criteria. */
 	igt_require(gem_has_execlists(gem_fd));
 
+	/* Need store dword for accurate PWM. */
+	igt_require(gem_can_store_dword(gem_fd, e2ring(gem_fd, e)));
+
 	while (idle_us < 2500) {
 		busy_us *= 2;
 		idle_us *= 2;
@@ -1517,8 +1584,8 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 		const unsigned long timeout[] = {
 			pwm_calibration_us * 1000, test_us * 1000
 		};
-		struct drm_i915_gem_exec_object2 obj = {};
 		uint64_t total_busy_ns = 0, total_idle_ns = 0;
+		uint64_t target_idle_us = idle_us;
 		igt_spin_t *spin;
 		int ret;
 
@@ -1530,12 +1597,9 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 			igt_warn("Failed to set scheduling policy!\n");
 
 		/* Allocate our spin batch and idle it. */
-		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
-		obj.handle = spin->handle;
-		__submit_spin_batch(gem_fd, &obj, e); /* record its location */
+		spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e));
 		igt_spin_batch_end(spin);
-		gem_sync(gem_fd, obj.handle);
-		obj.flags |= EXEC_OBJECT_PINNED;
+		gem_sync(gem_fd, spin->handle);
 
 		/* 1st pass is calibration, second pass is the test. */
 		for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
@@ -1545,27 +1609,42 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 
 			igt_nsec_elapsed(&test_start);
 			do {
-				unsigned int target_idle_us, t_busy;
+				unsigned long prep_delay_ns, run_delay_ns;
+				struct timespec start = { };
+				double error;
 
 				/* Restart the spinbatch. */
+				igt_nsec_elapsed(&start);
+
 				__rearm_spin_batch(spin);
-				__submit_spin_batch(gem_fd, &obj, e);
+				__submit_spin_batch(gem_fd, spin, e, 0);
 
-				/*
-				 * Note that the submission may be delayed to a
-				 * tasklet (ksoftirqd) which cannot run until we
-				 * sleep as we hog the cpu (we are RT).
-				 */
+				prep_delay_ns = igt_nsec_elapsed(&start);
+				run_delay_ns = __spin_wait(gem_fd, spin);
+
+				/* PWM busy sleep. */
+				measured_usleep(busy_us);
 
-				t_busy = measured_usleep(busy_us);
 				igt_spin_batch_end(spin);
-				gem_sync(gem_fd, obj.handle);
 
-				total_busy_ns += t_busy;
+				total_busy_ns += igt_nsec_elapsed(&start) -
+						 prep_delay_ns - run_delay_ns;
+
+				/* PWM idle sleep. */
+				memset(&start, 0, sizeof(start));
+				igt_nsec_elapsed(&start);
+				gem_sync(gem_fd, spin->handle);
+				measured_usleep(target_idle_us -
+						prep_delay_ns / 1000);
+				total_idle_ns += igt_nsec_elapsed(&start);
+
+				/* Re-calibrate. */
+				error = (double)total_busy_ns /
+					(total_busy_ns + total_idle_ns) -
+					(double)target_busy_pct / 100.0;
 
-				target_idle_us =
-					(100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
-				total_idle_ns += measured_usleep(target_idle_us);
+				target_idle_us = (double)target_idle_us *
+						 (1.0 + error);
 			} while (igt_nsec_elapsed(&test_start) < timeout[pass]);
 
 			busy_ns += total_busy_ns;
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev10)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (13 preceding siblings ...)
  (?)
@ 2018-03-20 17:12 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-20 17:12 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev10)
URL   : https://patchwork.freedesktop.org/series/40027/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
178e7f3da66cd02660a86257df75708a0efa3bbc tests/kms_frontbuffer_tracking: Update check for PSR status

with latest DRM-Tip kernel build CI_DRM_3958
9d737cebc219 drm-tip: 2018y-03m-20d-14h-56m-05s UTC integration manifest

No testlist changes.

---- Possible new issues:

Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-a:
                incomplete -> PASS       (fi-cfl-s2)

---- Known issues:

Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                pass       -> FAIL       (fi-gdg-551) fdo#102575

fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:433s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:441s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:383s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:536s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:299s
fi-bxt-dsi       total:285  pass:255  dwarn:0   dfail:0   fail:0   skip:30  time:514s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:514s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:518s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:504s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:410s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:582s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:512s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:521s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:427s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:316s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:541s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:407s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:419s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:464s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:428s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:475s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:468s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:515s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:661s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:442s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:534s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:541s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:505s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:486s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:428s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:446s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:581s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:410s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1168/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [igt-dev] ✓ Fi.CI.IGT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev10)
  2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
                   ` (14 preceding siblings ...)
  (?)
@ 2018-03-20 20:31 ` Patchwork
  -1 siblings, 0 replies; 53+ messages in thread
From: Patchwork @ 2018-03-20 20:31 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev10)
URL   : https://patchwork.freedesktop.org/series/40027/
State : success

== Summary ==

---- Known issues:

Test kms_setmode:
        Subgroup basic:
                pass       -> FAIL       (shard-apl) fdo#99912
Test kms_sysfs_edid_timing:
                warn       -> PASS       (shard-apl) fdo#100047
Test kms_vblank:
        Subgroup pipe-b-ts-continuation-dpms-suspend:
                incomplete -> PASS       (shard-hsw) fdo#105054

fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
fdo#100047 https://bugs.freedesktop.org/show_bug.cgi?id=100047
fdo#105054 https://bugs.freedesktop.org/show_bug.cgi?id=105054

shard-apl        total:3478 pass:1814 dwarn:1   dfail:0   fail:8   skip:1655 time:12923s
shard-hsw        total:3478 pass:1768 dwarn:1   dfail:0   fail:1   skip:1707 time:11835s
shard-snb        total:3478 pass:1358 dwarn:1   dfail:0   fail:2   skip:2117 time:7226s
Blacklisted hosts:
shard-kbl        total:3478 pass:1940 dwarn:1   dfail:0   fail:9   skip:1528 time:9854s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1168/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 53+ messages in thread

end of thread, other threads:[~2018-03-20 20:31 UTC | newest]

Thread overview: 53+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-15 12:56 [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start Tvrtko Ursulin
2018-03-15 12:56 ` [igt-dev] " Tvrtko Ursulin
2018-03-15 13:14 ` Chris Wilson
2018-03-15 13:14   ` Chris Wilson
2018-03-15 13:36   ` Tvrtko Ursulin
2018-03-15 13:36     ` Tvrtko Ursulin
2018-03-15 13:45     ` Chris Wilson
2018-03-15 13:45       ` [Intel-gfx] " Chris Wilson
2018-03-15 14:37       ` Tvrtko Ursulin
2018-03-15 14:37         ` Tvrtko Ursulin
2018-03-15 14:46         ` Chris Wilson
2018-03-15 14:46           ` Chris Wilson
2018-03-15 14:53           ` Tvrtko Ursulin
2018-03-15 14:53             ` [Intel-gfx] " Tvrtko Ursulin
2018-03-15 14:58             ` Chris Wilson
2018-03-15 14:58               ` Chris Wilson
2018-03-15 14:03 ` [igt-dev] ✓ Fi.CI.BAT: success for " Patchwork
2018-03-15 15:46 ` [PATCH i-g-t v2] " Tvrtko Ursulin
2018-03-15 15:46   ` [Intel-gfx] " Tvrtko Ursulin
2018-03-15 16:01   ` [igt-dev] " Chris Wilson
2018-03-15 16:01     ` Chris Wilson
2018-03-16  7:36     ` [PATCH i-g-t v3] " Tvrtko Ursulin
2018-03-16  7:36       ` [igt-dev] " Tvrtko Ursulin
2018-03-16 10:17       ` [PATCH i-g-t v4] " Tvrtko Ursulin
2018-03-16 10:17         ` [igt-dev] " Tvrtko Ursulin
2018-03-16 12:18         ` [PATCH i-g-t v5] " Tvrtko Ursulin
2018-03-16 12:18           ` [igt-dev] " Tvrtko Ursulin
2018-03-16 13:31           ` [PATCH i-g-t v6] " Tvrtko Ursulin
2018-03-16 13:31             ` [Intel-gfx] " Tvrtko Ursulin
2018-03-19 13:56             ` [PATCH i-g-t v7] " Tvrtko Ursulin
2018-03-19 13:56               ` [Intel-gfx] " Tvrtko Ursulin
2018-03-19 14:02               ` Chris Wilson
2018-03-19 14:02                 ` [igt-dev] [Intel-gfx] " Chris Wilson
2018-03-19 15:29               ` Chris Wilson
2018-03-19 15:29                 ` [igt-dev] [Intel-gfx] " Chris Wilson
2018-03-19 15:33                 ` Chris Wilson
2018-03-19 15:33                   ` [igt-dev] [Intel-gfx] " Chris Wilson
2018-03-19 16:59                   ` [PATCH i-g-t v8] " Tvrtko Ursulin
2018-03-19 16:59                     ` [igt-dev] " Tvrtko Ursulin
2018-03-20 13:51                     ` [PATCH i-g-t v9] " Tvrtko Ursulin
2018-03-20 13:51                       ` [igt-dev] " Tvrtko Ursulin
2018-03-15 16:35 ` [igt-dev] ✗ Fi.CI.IGT: failure for " Patchwork
2018-03-15 16:49 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev2) Patchwork
2018-03-16  8:02 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev3) Patchwork
2018-03-16  8:52 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-03-16 11:20 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev4) Patchwork
2018-03-16 12:27 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-03-16 14:30 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev6) Patchwork
2018-03-19 10:24 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-03-19 21:12 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev9) Patchwork
2018-03-20  0:26 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
2018-03-20 17:12 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Improve accuracy by waiting on spinner to start (rev10) Patchwork
2018-03-20 20:31 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
