* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-11-24 23:39 ` Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-11-24 23:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in each client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.
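
A minimal sketch of the per-client acceptance window the test asserts
on (illustrative only, not part of the patch; close_enough() is an
invented name):

    #include <stdbool.h>
    #include <stdint.h>

    /* Accept an observed frame interval within -25%/+33% of nominal,
     * mirroring the 4*x > 3*y && 3*x < 4*y bounds asserted below,
     * e.g. for the VIP client.
     */
    static bool close_enough(uint64_t observed_ns, uint64_t nominal_ns)
    {
            return 4 * observed_ns > 3 * nominal_ns &&
                   3 * observed_ns < 4 * nominal_ns;
    }

    /* With a nominal 16666us frame: 16666us passes, 18750us (+12.5%)
     * passes, 33333us (2x, a dropped frame) fails.
     */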

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
 tests/i915/gem_exec_schedule.c | 847 +++++++++++++++++++++++++++++++++
 1 file changed, 847 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index f23d63ac3..d888efcd7 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -29,6 +29,7 @@
 #include <sys/poll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sched.h>
 #include <signal.h>
@@ -2516,6 +2517,819 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ctx_ticks(int i915, uint64_t ns)
+{
+	int f = read_timestamp_frequency(i915);
+	if (intel_gen(intel_get_drm_devid(i915)) == 11)
+		f = 12500000; /* icl!!! are you feeling alright? CTX vs CS */
+	return div64_u64_round_up(ns * f, NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
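+
+/*
+ * Worked example (assuming a 12MHz CS timestamp frequency, as on many
+ * gen9 parts): ns_to_ctx_ticks(16666667) = ceil(16666667 * 12e6 / 1e9)
+ * = 200001 ticks, and ticks_to_ns(200001) returns 16666750ns, one tick
+ * over the original due to round-up.
+ */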
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define RUNTIME (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	/* Loop until CTX_TIMESTAMP - initial > @ns */
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(START_TS);
+
+	while (offset_in_page(cs) & 63)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
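+	/* the arbitration point allows the loop to be preempted mid-spin */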
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(NOW_TS);
+
+	/* delta = now - start; inverted to match COND_BBE */
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	/* Save delta for reading by COND_BBE */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_IMM;
+		if (use_64b) {
+			*cs++ = addr + 4064;
+			*cs++ = addr >> 32;
+		} else {
+			*cs++ = 0;
+			*cs++ = addr + 4064;
+		}
+		*cs++ = 0;
+	}
+
+	/* Break if delta [time elapsed] > ns */
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ctx_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Otherwise back to recalculating delta */
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { INC, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	/* Load the address + inc & mask variables */
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
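+	/* i.e. advance the write pointer by one dword per execution,
+	 * clearing bit 11 so the 512-dword journal wraps within the
+	 * first 2KiB of the (4KiB-aligned) object
+	 */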
+
+	/* Increment the [ring] address for saving CS_TIMESTAMP */
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	/* Rewrite the batch buffer for the next execution */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e)
+{
+	const int gen = intel_gen(intel_get_drm_devid(i915));
+
+	if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO)
+		return false; /* looks fubar */
+
+	return true;
+}
+
+static struct intel_execution_engine2
+pick_random_engine(int i915, const struct intel_execution_engine2 *not)
+{
+	const struct intel_execution_engine2 *e;
+	unsigned int count = 0;
+
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		count++;
+	}
+	if (!count)
+		return *not;
+
+	count = rand() % count;
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		if (!count--)
+			break;
+	}
+
+	return *e;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeline,
+		       uint32_t common,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC		(1 << 0)
+#define F_PACE		(1 << 1)
+#define F_FLOW		(1 << 2)
+#define F_HALF		(1 << 3)
+#define F_SOLO		(1 << 4)
+#define F_SPARE		(1 << 5)
+#define F_NEXT		(1 << 6)
+#define F_VIP		(1 << 7)
+#define F_RRUL		(1 << 8)
+#define F_SHARE		(1 << 9)
+#define F_PING		(1 << 10)
+#define F_THROTTLE	(1 << 11)
+#define F_ISOLATE	(1 << 12)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 obj[4] = {
+		{},
+		{
+			.handle = common ?: gem_create(i915, 4096),
+		},
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+	};
+	struct intel_execution_engine2 ping = *e;
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	int n;
+
+	srandom(getpid());
+	if (flags & F_PING)
+		ping = pick_random_engine(i915, e);
+	obj[0] = tslog_create(i915, ctx, &ping);
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(obj),
+			.buffer_count = 4,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			unsigned int seq;
+
+			seq = count;
+			if (flags & F_NEXT)
+				seq++;
+
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, seq);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+		close(execbuf.rsvd2);
+
+		execbuf.buffer_count = 1;
+		execbuf.batch_start_offset = 2048;
+		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
+		execbuf.rsvd2 = n_fence;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		if (flags & F_THROTTLE)
+			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
+
+		igt_swap(obj[2], obj[3]);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, obj[3].handle);
+	gem_close(i915, obj[2].handle);
+	if (obj[1].handle != common)
+		gem_close(i915, obj[1].handle);
+
+	gem_sync(i915, obj[0].handle);
+	if (out) {
+		uint32_t *map;
+
+		map = gem_mmap__device_coherent(i915, obj[0].handle,
+						0, 4096, PROT_WRITE);
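+		/* convert the timestamp journal into frame-to-frame deltas */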
+		for (n = 1; n < min(count, 512); n++) {
+			igt_assert(map[n]);
+			map[n - 1] = map[n] - map[n - 1];
+		}
+		qsort(map, --n, sizeof(*map), cmp_u32);
+		*out = ticks_to_ns(i915, map[n / 2]);
+		munmap(map, 4096);
+	}
+	gem_close(i915, obj[0].handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
+{
+	uint64_t cpu_time = 0;
+
+	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
+
+	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
+
+	return cpu_time;
+}
+
+static void timeline_advance(int timeline, int delay_ns)
+{
+	struct timespec tv = { .tv_nsec = delay_ns };
+	nanosleep(&tv, NULL);
+	sw_sync_timeline_inc(timeline, 1);
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+	uint32_t common = 0;
+
+	igt_require(has_ctx_timestamp(i915, e));
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	if (flags & F_SHARE)
+		common = gem_create(i915, 4096);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
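+		/* 1st/3rd quartile indices, for reporting the IQR */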
+		struct rusage old_usage, usage;
+		uint64_t cpu_time, d_time;
+		unsigned long vip = -1;
+		struct timespec tv;
+		struct igt_mean m;
+
+		if (flags & F_PING) {
+			struct intel_execution_engine2 *ping;
+
+			__for_each_physical_engine(i915, ping) {
+				if (ping->flags == e->flags)
+					continue;
+
+				igt_fork(child, 1) {
+					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+					fair_child(i915, ctx, ping,
+						   child_ns / 8,
+						   -1, common,
+						   F_SOLO | F_PACE | F_SHARE,
+						   &result[nchild],
+						   NULL);
+
+					gem_context_destroy(i915, ctx);
+				}
+			}
+		}
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		getrusage(RUSAGE_CHILDREN, &old_usage);
+		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
+		igt_fork(child, nchild) {
+			uint32_t ctx;
+
+			if (flags & F_ISOLATE) {
+				int clone, dmabuf = -1;
+
+				if (common)
+					dmabuf = prime_handle_to_fd(i915, common);
+
+				clone = gem_reopen_driver(i915);
+				gem_context_copy_engines(i915, 0, clone, 0);
+				i915 = clone;
+
+				if (dmabuf != -1)
+					common = prime_fd_to_handle(i915, dmabuf);
+			}
+
+			ctx = gem_context_clone_with_engines(i915, 0);
+
+			if (flags & F_VIP && child == 0) {
+				gem_context_set_priority(i915, ctx, MAX_PRIO);
+				flags |= F_FLOW;
+			}
+			if (flags & F_RRUL && child == 0)
+				flags |= F_SOLO | F_FLOW | F_SYNC;
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeline, common, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--)
+			timeline_advance(timeline, fence_ns);
+
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child]))
+				timeline_advance(timeline, fence_ns);
+		}
+
+		igt_waitchildren();
+		close(timeline);
+
+		/* Are we running out of CPU time, and failing to submit frames? */
+		d_time = igt_nsec_elapsed(&tv);
+		getrusage(RUSAGE_CHILDREN, &usage);
+		cpu_time = d_cpu_time(&usage, &old_usage);
+		if (10 * cpu_time > 9 * d_time) {
+			if (nchild > 7)
+				break;
+
+			igt_skip_on_f(10 * cpu_time > 9 * d_time,
+				      "%.0f%% CPU usage, presuming capacity exceeded\n",
+				      100. * cpu_time / d_time);
+		}
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		if (flags & (F_VIP | F_RRUL))
+			vip = result[0];
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%2d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+		if (vip != -1) {
+			igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
+			igt_assert(4 * vip > 3 * fence_ns &&
+				   3 * vip < 4 * fence_ns);
+		}
+
+		/* May be slowed due to sheer volume of context switches */
+		igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
+			       igt_mean_get(&m) < 3 * fence_ns);
+
+		igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
+			   3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
+
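+		/* and the clients must be close: IQR within 50% of the median */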
+		igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);
+	}
+
+	munmap(result, 4096);
+	if (common)
+		gem_close(i915, common);
+}
+
+static void test_fairness(int i915, int timeout)
+{
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} fair[] = {
+		/*
+		 * none - maximal greed in each client
+		 *
+		 * Push as many frames from each client as fast as possible
+		 */
+		{ "none",       0 },
+		{ "none-vip",   F_VIP }, /* one vip client must meet deadlines */
+		{ "none-solo",  F_SOLO }, /* 1 batch per frame per client */
+		{ "none-share", F_SHARE }, /* read from a common buffer */
+		{ "none-rrul",  F_RRUL }, /* "realtime-response under load" */
+		{ "none-ping",  F_PING }, /* measure inter-engine fairness */
+
+		/*
+		 * throttle - original per client throttling
+		 *
+		 * Used for front-buffer rendering where there is no
+		 * external frame marker. Each client tries to keep only
+		 * 20ms of work submitted, though that measurement is
+		 * flawed...
+		 *
+		 * This is used by Xorg to try to maintain some resemblance
+		 * of input/output consistency when being fed a continuous
+		 * stream of X11 draw requests straight into scanout, where
+		 * the clients may submit the work faster than can be drawn.
+		 *
+		 * Throttling tracks requests per-file (and assumes that
+		 * all requests are in submission order across the whole file),
+		 * so we split each child to its own fd.
+		 */
+		{ "throttle",       F_THROTTLE | F_ISOLATE },
+		{ "throttle-vip",   F_THROTTLE | F_ISOLATE | F_VIP },
+		{ "throttle-solo",  F_THROTTLE | F_ISOLATE | F_SOLO },
+		{ "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE },
+		{ "throttle-rrul",  F_THROTTLE | F_ISOLATE | F_RRUL },
+
+		/*
+		 * pace - mesa "submit double buffering"
+		 *
+		 * Submit a frame, then wait for the previous frame to start. This
+		 * prevents each client from getting too far ahead of its
+		 * rendering, maintaining a consistent input/output latency.
+		 */
+		{ "pace",       F_PACE },
+		{ "pace-solo",  F_PACE | F_SOLO },
+		{ "pace-share", F_PACE | F_SHARE },
+		{ "pace-ping",  F_PACE | F_SHARE | F_PING },
+
+		/* sync - submit only one frame at a time */
+		{ "sync",      F_SYNC },
+		{ "sync-vip",  F_SYNC | F_VIP },
+		{ "sync-solo", F_SYNC | F_SOLO },
+
+		/* flow - synchronise execution against the clock (vblank) */
+		{ "flow",       F_PACE | F_FLOW },
+		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
+		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },
+
+		/* next - submit ahead of the clock (vblank double buffering) */
+		{ "next",       F_PACE | F_FLOW | F_NEXT },
+		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
+		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },
+
+		/* spare - underutilise by a single client timeslice */
+		{ "spare", F_PACE | F_FLOW | F_SPARE },
+
+		/* half - run at half pace (submit 16ms of work every 32ms) */
+		{ "half",  F_PACE | F_FLOW | F_HALF },
+
+		{}
+	};
+
+	igt_fixture {
+		igt_info("CS timestamp frequency: %d\n",
+			 read_timestamp_frequency(i915));
+
+		igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	}
+
+	for (typeof(*fair) *f = fair; f->name; f++) {
+		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
+			const struct intel_execution_engine2 *e;
+
+			__for_each_physical_engine(i915, e) {
+				if (!gem_class_can_store_dword(i915, e->class))
+					continue;
+
+				igt_dynamic_f("%s", e->name)
+					fairness(i915, e, timeout, f->flags);
+			}
+		}
+	}
+}
+
+static uint32_t read_ctx_timestamp(int i915,
+				   uint32_t ctx,
+				   const struct intel_execution_engine2 *e)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = gem_create(i915, 4096),
+		.offset = 32 << 20,
+		.relocs_ptr = to_user_pointer(&reloc),
+		.relocation_count = 1,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.flags = e->flags,
+		.rsvd1 = ctx,
+	};
+#define RUNTIME (base + 0x3a8)
+	uint32_t *map, *cs;
+	uint32_t ts;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, obj.handle,
+					     0, 4096, PROT_WRITE);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = RUNTIME;
+	memset(&reloc, 0, sizeof(reloc));
+	reloc.target_handle = obj.handle;
+	reloc.presumed_offset = obj.offset;
+	reloc.offset = offset_in_page(cs);
+	reloc.delta = 4000;
+	*cs++ = obj.offset + 4000;
+	*cs++ = obj.offset >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+	gem_close(i915, obj.handle);
+
+	ts = map[1000];
+	munmap(map, 4096);
+
+	return ts;
+}
+
+static void fairslice(int i915,
+		      const struct intel_execution_engine2 *e,
+		      unsigned long flags)
+{
+	igt_spin_t *spin = NULL;
+	uint32_t ctx[3];
+	uint32_t ts[3];
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		ctx[i] = gem_context_clone_with_engines(i915, 0);
+		if (spin == NULL) {
+			spin = __igt_spin_new(i915,
+					      .ctx = ctx[i],
+					      .engine = e->flags,
+					      .flags = flags);
+		} else {
+			struct drm_i915_gem_execbuffer2 eb = {
+				.buffer_count = 1,
+				.buffers_ptr = to_user_pointer(&spin->obj[IGT_SPIN_BATCH]),
+				.flags = e->flags,
+				.rsvd1 = ctx[i],
+			};
+			gem_execbuf(i915, &eb);
+		}
+	}
+
+	sleep(2); /* over the course of many timeslices */
+
+	igt_assert(gem_bo_busy(i915, spin->handle));
+	igt_spin_end(spin);
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
+		ts[i] = read_ctx_timestamp(i915, ctx[i], e);
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
+		gem_context_destroy(i915, ctx[i]);
+	igt_spin_free(i915, spin);
+
+	qsort(ts, 3, sizeof(*ts), cmp_u32);
+	igt_info("%s: [%.1f, %.1f] ms\n", e->name,
+		 1e-6 * ticks_to_ns(i915, ts[0]),
+		 1e-6 * ticks_to_ns(i915, ts[2]));
+
+	igt_assert(ts[0] && ts[2] > ts[0]);
+	igt_assert(4 * ts[0] > 3 * ts[2]);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2582,6 +3396,35 @@ igt_main
 		test_each_engine("lateslice", fd, e)
 			lateslice(fd, e->flags);
 
+		igt_subtest_group {
+			igt_fixture {
+				igt_require(gem_scheduler_has_semaphores(fd));
+				igt_require(gem_scheduler_has_preemption(fd));
+				igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8);
+			}
+
+			test_each_engine("fairslice", fd, e)
+				fairslice(fd, e, 0);
+
+			test_each_engine("u-fairslice", fd, e)
+				fairslice(fd, e, IGT_SPIN_USERPTR);
+
+			igt_subtest("fairslice-all")  {
+				__for_each_physical_engine(fd, e) {
+					igt_fork(child, 1)
+						fairslice(fd, e, 0);
+				}
+				igt_waitchildren();
+			}
+			igt_subtest("u-fairslice-all")  {
+				__for_each_physical_engine(fd, e) {
+					igt_fork(child, 1)
+						fairslice(fd, e, IGT_SPIN_USERPTR);
+				}
+				igt_waitchildren();
+			}
+		}
+
 		test_each_engine("submit-early-slice", fd, e)
 			submit_slice(fd, e, EARLY_SUBMIT);
 		test_each_engine("submit-golden-slice", fd, e)
@@ -2610,6 +3453,10 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		igt_subtest_group {
+			test_fairness(fd, 2);
+		}
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.29.2

* [igt-dev] ✓ Fi.CI.BAT: success for i915/gem_exec_schedule: Try to spot unfairness (rev10)
  2020-11-24 23:39 ` [igt-dev] " Chris Wilson
@ 2020-11-25  0:33 ` Patchwork
  -1 siblings, 0 replies; 20+ messages in thread
From: Patchwork @ 2020-11-25  0:33 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: i915/gem_exec_schedule: Try to spot unfairness (rev10)
URL   : https://patchwork.freedesktop.org/series/77887/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_9385 -> IGTPW_5222
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/index.html

Known issues
------------

  Here are the changes found in IGTPW_5222 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@kms_busy@basic@flip:
    - fi-kbl-soraka:      [PASS][1] -> [DMESG-WARN][2] ([i915#1982])
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/fi-kbl-soraka/igt@kms_busy@basic@flip.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/fi-kbl-soraka/igt@kms_busy@basic@flip.html
    - fi-tgl-y:           [PASS][3] -> [DMESG-WARN][4] ([i915#1982])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/fi-tgl-y/igt@kms_busy@basic@flip.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/fi-tgl-y/igt@kms_busy@basic@flip.html

  * igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy:
    - fi-icl-u2:          [PASS][5] -> [DMESG-WARN][6] ([i915#1982]) +1 similar issue
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/fi-icl-u2/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/fi-icl-u2/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy.html

  * igt@prime_self_import@basic-with_one_bo:
    - fi-tgl-y:           [PASS][7] -> [DMESG-WARN][8] ([i915#402])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/fi-tgl-y/igt@prime_self_import@basic-with_one_bo.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/fi-tgl-y/igt@prime_self_import@basic-with_one_bo.html

  
#### Possible fixes ####

  * igt@gem_exec_suspend@basic-s0:
    - fi-cfl-8109u:       [DMESG-WARN][9] ([i915#262]) -> [PASS][10] +1 similar issue
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/fi-cfl-8109u/igt@gem_exec_suspend@basic-s0.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/fi-cfl-8109u/igt@gem_exec_suspend@basic-s0.html

  * igt@i915_pm_rpm@module-reload:
    - fi-byt-j1900:       [DMESG-WARN][11] ([i915#1982]) -> [PASS][12]
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/fi-byt-j1900/igt@i915_pm_rpm@module-reload.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/fi-byt-j1900/igt@i915_pm_rpm@module-reload.html

  * igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic:
    - fi-bsw-kefka:       [DMESG-WARN][13] ([i915#1982]) -> [PASS][14]
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/fi-bsw-kefka/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/fi-bsw-kefka/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic.html

  * igt@prime_vgem@basic-read:
    - fi-tgl-y:           [DMESG-WARN][15] ([i915#402]) -> [PASS][16]
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/fi-tgl-y/igt@prime_vgem@basic-read.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/fi-tgl-y/igt@prime_vgem@basic-read.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [i915#1982]: https://gitlab.freedesktop.org/drm/intel/issues/1982
  [i915#262]: https://gitlab.freedesktop.org/drm/intel/issues/262
  [i915#402]: https://gitlab.freedesktop.org/drm/intel/issues/402


Participating hosts (43 -> 39)
------------------------------

  Missing    (4): fi-ilk-m540 fi-bsw-cyan fi-bdw-samus fi-hsw-4200u 


Build changes
-------------

  * CI: CI-20190529 -> None
  * IGT: IGT_5870 -> IGTPW_5222

  CI-20190529: 20190529
  CI_DRM_9385: 3d37e624f60f40cea80e784617686ae2917e9b01 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_5222: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/index.html
  IGT_5870: 08b13995b85df26a77212e4fb21fd772976ef33c @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools



== Testlist changes ==

+igt@gem_exec_schedule@fairslice
+igt@gem_exec_schedule@fairslice-all
+igt@gem_exec_schedule@fair-flow
+igt@gem_exec_schedule@fair-flow-ping
+igt@gem_exec_schedule@fair-flow-share
+igt@gem_exec_schedule@fair-half
+igt@gem_exec_schedule@fair-next
+igt@gem_exec_schedule@fair-next-ping
+igt@gem_exec_schedule@fair-next-share
+igt@gem_exec_schedule@fair-none
+igt@gem_exec_schedule@fair-none-ping
+igt@gem_exec_schedule@fair-none-rrul
+igt@gem_exec_schedule@fair-none-share
+igt@gem_exec_schedule@fair-none-solo
+igt@gem_exec_schedule@fair-none-vip
+igt@gem_exec_schedule@fair-pace
+igt@gem_exec_schedule@fair-pace-ping
+igt@gem_exec_schedule@fair-pace-share
+igt@gem_exec_schedule@fair-pace-solo
+igt@gem_exec_schedule@fair-spare
+igt@gem_exec_schedule@fair-sync
+igt@gem_exec_schedule@fair-sync-solo
+igt@gem_exec_schedule@fair-sync-vip
+igt@gem_exec_schedule@fair-throttle
+igt@gem_exec_schedule@fair-throttle-rrul
+igt@gem_exec_schedule@fair-throttle-share
+igt@gem_exec_schedule@fair-throttle-solo
+igt@gem_exec_schedule@fair-throttle-vip
+igt@gem_exec_schedule@u-fairslice
+igt@gem_exec_schedule@u-fairslice-all

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/index.html


* [igt-dev] ✗ Fi.CI.IGT: failure for i915/gem_exec_schedule: Try to spot unfairness (rev10)
  2020-11-24 23:39 ` [igt-dev] " Chris Wilson
@ 2020-11-25  5:29 ` Patchwork
  -1 siblings, 0 replies; 20+ messages in thread
From: Patchwork @ 2020-11-25  5:29 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev



== Series Details ==

Series: i915/gem_exec_schedule: Try to spot unfairness (rev10)
URL   : https://patchwork.freedesktop.org/series/77887/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_9385_full -> IGTPW_5222_full
=============================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with IGTPW_5222_full need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in IGTPW_5222_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in IGTPW_5222_full:

### IGT changes ###

#### Possible regressions ####

  * {igt@gem_exec_schedule@fair-next-ping@vecs0} (NEW):
    - shard-iclb:         NOTRUN -> [SKIP][1] +15 similar issues
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb1/igt@gem_exec_schedule@fair-next-ping@vecs0.html

  * {igt@gem_exec_schedule@fair-none-ping@rcs0} (NEW):
    - shard-tglb:         NOTRUN -> [SKIP][2] +19 similar issues
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-tglb8/igt@gem_exec_schedule@fair-none-ping@rcs0.html

  * {igt@gem_exec_schedule@fair-none-solo@rcs0} (NEW):
    - shard-kbl:          NOTRUN -> [FAIL][3] +14 similar issues
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-kbl2/igt@gem_exec_schedule@fair-none-solo@rcs0.html

  * {igt@gem_exec_schedule@fair-pace-solo@vecs0} (NEW):
    - shard-glk:          NOTRUN -> [FAIL][4] +9 similar issues
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-glk6/igt@gem_exec_schedule@fair-pace-solo@vecs0.html

  * {igt@gem_exec_schedule@fair-throttle-solo@vcs1} (NEW):
    - shard-tglb:         NOTRUN -> [FAIL][5] +29 similar issues
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-tglb7/igt@gem_exec_schedule@fair-throttle-solo@vcs1.html

  * {igt@gem_exec_schedule@fair-throttle@rcs0} (NEW):
    - shard-iclb:         NOTRUN -> [FAIL][6] +22 similar issues
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb6/igt@gem_exec_schedule@fair-throttle@rcs0.html

  * {igt@gem_exec_schedule@u-fairslice@rcs0} (NEW):
    - shard-tglb:         NOTRUN -> [DMESG-WARN][7]
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-tglb6/igt@gem_exec_schedule@u-fairslice@rcs0.html

  * igt@kms_vblank@pipe-c-wait-forked-busy-hang:
    - shard-hsw:          [PASS][8] -> [INCOMPLETE][9]
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-hsw7/igt@kms_vblank@pipe-c-wait-forked-busy-hang.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-hsw8/igt@kms_vblank@pipe-c-wait-forked-busy-hang.html

  
#### Warnings ####

  * igt@i915_pm_rc6_residency@rc6-idle:
    - shard-iclb:         [WARN][10] ([i915#1804]) -> [WARN][11]
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb6/igt@i915_pm_rc6_residency@rc6-idle.html
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb5/igt@i915_pm_rc6_residency@rc6-idle.html

  
New tests
---------

  New tests have been introduced between CI_DRM_9385_full and IGTPW_5222_full:

### New IGT tests (170) ###

  * igt@gem_exec_schedule@fair-flow:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-flow-ping:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-flow-ping@bcs0:
    - Statuses : 5 skip(s)
    - Exec time: [0.0, 4.21] s

  * igt@gem_exec_schedule@fair-flow-ping@rcs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.13, 4.42] s

  * igt@gem_exec_schedule@fair-flow-ping@vcs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.16, 4.21] s

  * igt@gem_exec_schedule@fair-flow-ping@vcs1:
    - Statuses : 2 skip(s)
    - Exec time: [2.14, 4.21] s

  * igt@gem_exec_schedule@fair-flow-ping@vecs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.14, 4.21] s

  * igt@gem_exec_schedule@fair-flow-share:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-flow-share@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.27] s

  * igt@gem_exec_schedule@fair-flow-share@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.87, 13.27] s

  * igt@gem_exec_schedule@fair-flow-share@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.00, 13.30] s

  * igt@gem_exec_schedule@fair-flow-share@vcs1:
    - Statuses : 3 pass(s)
    - Exec time: [11.11, 13.27] s

  * igt@gem_exec_schedule@fair-flow-share@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.00, 13.29] s

  * igt@gem_exec_schedule@fair-flow@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.52] s

  * igt@gem_exec_schedule@fair-flow@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.99, 13.58] s

  * igt@gem_exec_schedule@fair-flow@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.98, 13.98] s

  * igt@gem_exec_schedule@fair-flow@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [13.12, 14.01] s

  * igt@gem_exec_schedule@fair-flow@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.05, 13.96] s

  * igt@gem_exec_schedule@fair-half:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-half@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.72] s

  * igt@gem_exec_schedule@fair-half@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [11.50, 16.09] s

  * igt@gem_exec_schedule@fair-half@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [11.51, 16.02] s

  * igt@gem_exec_schedule@fair-half@vcs1:
    - Statuses : 3 pass(s)
    - Exec time: [11.55, 13.72] s

  * igt@gem_exec_schedule@fair-half@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [11.03, 16.02] s

  * igt@gem_exec_schedule@fair-next:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-next-ping:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-next-ping@bcs0:
    - Statuses : 5 skip(s)
    - Exec time: [0.0, 4.22] s

  * igt@gem_exec_schedule@fair-next-ping@rcs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.14, 4.39] s

  * igt@gem_exec_schedule@fair-next-ping@vcs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.16, 4.22] s

  * igt@gem_exec_schedule@fair-next-ping@vcs1:
    - Statuses : 3 skip(s)
    - Exec time: [2.14, 4.22] s

  * igt@gem_exec_schedule@fair-next-ping@vecs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.14, 4.22] s

  * igt@gem_exec_schedule@fair-next-share:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-next-share@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.23] s

  * igt@gem_exec_schedule@fair-next-share@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.89, 13.22] s

  * igt@gem_exec_schedule@fair-next-share@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.01, 13.26] s

  * igt@gem_exec_schedule@fair-next-share@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [11.13, 13.06] s

  * igt@gem_exec_schedule@fair-next-share@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.01, 13.25] s

  * igt@gem_exec_schedule@fair-next@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.47] s

  * igt@gem_exec_schedule@fair-next@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.00, 13.70] s

  * igt@gem_exec_schedule@fair-next@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.96, 13.92] s

  * igt@gem_exec_schedule@fair-next@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [13.09, 13.94] s

  * igt@gem_exec_schedule@fair-next@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.97, 13.89] s

  * igt@gem_exec_schedule@fair-none:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-none-ping:
    - Statuses : 1 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-none-ping@bcs0:
    - Statuses : 5 skip(s)
    - Exec time: [0.0, 7.50] s

  * igt@gem_exec_schedule@fair-none-ping@rcs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.43, 7.38] s

  * igt@gem_exec_schedule@fair-none-ping@vcs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.57, 7.29] s

  * igt@gem_exec_schedule@fair-none-ping@vcs1:
    - Statuses : 3 skip(s)
    - Exec time: [5.06, 7.30] s

  * igt@gem_exec_schedule@fair-none-ping@vecs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.57, 7.32] s

  * igt@gem_exec_schedule@fair-none-rrul:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-none-rrul@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 12.33] s

  * igt@gem_exec_schedule@fair-none-rrul@rcs0:
    - Statuses : 1 fail(s) 4 pass(s)
    - Exec time: [9.70, 14.30] s

  * igt@gem_exec_schedule@fair-none-rrul@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.62, 12.22] s

  * igt@gem_exec_schedule@fair-none-rrul@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [11.54, 13.02] s

  * igt@gem_exec_schedule@fair-none-rrul@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [10.01, 12.93] s

  * igt@gem_exec_schedule@fair-none-share:
    - Statuses : 1 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-none-share@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 12.09] s

  * igt@gem_exec_schedule@fair-none-share@rcs0:
    - Statuses : 4 pass(s) 1 skip(s)
    - Exec time: [7.10, 13.48] s

  * igt@gem_exec_schedule@fair-none-share@vcs0:
    - Statuses : 4 pass(s) 1 skip(s)
    - Exec time: [7.50, 14.21] s

  * igt@gem_exec_schedule@fair-none-share@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [10.10, 12.00] s

  * igt@gem_exec_schedule@fair-none-share@vecs0:
    - Statuses : 4 pass(s) 1 skip(s)
    - Exec time: [7.43, 11.94] s

  * igt@gem_exec_schedule@fair-none-solo:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-none-solo@bcs0:
    - Statuses : 1 pass(s) 2 skip(s)
    - Exec time: [0.0, 18.19] s

  * igt@gem_exec_schedule@fair-none-solo@rcs0:
    - Statuses : 2 fail(s) 1 pass(s)
    - Exec time: [13.15, 15.24] s

  * igt@gem_exec_schedule@fair-none-solo@vcs0:
    - Statuses : 3 pass(s)
    - Exec time: [14.90, 18.10] s

  * igt@gem_exec_schedule@fair-none-solo@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [14.86, 18.26] s

  * igt@gem_exec_schedule@fair-none-solo@vecs0:
    - Statuses : 3 pass(s)
    - Exec time: [14.17, 17.30] s

  * igt@gem_exec_schedule@fair-none-vip:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-none-vip@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.32] s

  * igt@gem_exec_schedule@fair-none-vip@rcs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [9.89, 12.81] s

  * igt@gem_exec_schedule@fair-none-vip@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [10.58, 13.32] s

  * igt@gem_exec_schedule@fair-none-vip@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [10.72, 12.29] s

  * igt@gem_exec_schedule@fair-none-vip@vecs0:
    - Statuses : 1 fail(s) 4 pass(s)
    - Exec time: [10.58, 13.37] s

  * igt@gem_exec_schedule@fair-none@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 12.23] s

  * igt@gem_exec_schedule@fair-none@rcs0:
    - Statuses : 2 fail(s) 3 pass(s)
    - Exec time: [9.82, 14.40] s

  * igt@gem_exec_schedule@fair-none@vcs0:
    - Statuses : 4 pass(s) 1 skip(s)
    - Exec time: [7.59, 11.99] s

  * igt@gem_exec_schedule@fair-none@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [10.06, 12.04] s

  * igt@gem_exec_schedule@fair-none@vecs0:
    - Statuses : 4 pass(s) 1 skip(s)
    - Exec time: [7.55, 12.02] s

  * igt@gem_exec_schedule@fair-pace:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-pace-ping:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-pace-ping@bcs0:
    - Statuses : 5 skip(s)
    - Exec time: [0.0, 4.21] s

  * igt@gem_exec_schedule@fair-pace-ping@rcs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.13, 4.46] s

  * igt@gem_exec_schedule@fair-pace-ping@vcs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.15, 4.44] s

  * igt@gem_exec_schedule@fair-pace-ping@vcs1:
    - Statuses : 2 skip(s)
    - Exec time: [2.18, 4.20] s

  * igt@gem_exec_schedule@fair-pace-ping@vecs0:
    - Statuses : 5 skip(s)
    - Exec time: [2.14, 4.42] s

  * igt@gem_exec_schedule@fair-pace-share:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-pace-share@bcs0:
    - Statuses : 2 fail(s) 3 skip(s)
    - Exec time: [0.0, 10.63] s

  * igt@gem_exec_schedule@fair-pace-share@rcs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [8.43, 12.16] s

  * igt@gem_exec_schedule@fair-pace-share@vcs0:
    - Statuses : 2 fail(s) 3 pass(s)
    - Exec time: [8.45, 10.92] s

  * igt@gem_exec_schedule@fair-pace-share@vcs1:
    - Statuses : 1 fail(s) 1 pass(s)
    - Exec time: [8.57, 10.62] s

  * igt@gem_exec_schedule@fair-pace-share@vecs0:
    - Statuses : 1 fail(s) 4 pass(s)
    - Exec time: [8.75, 10.94] s

  * igt@gem_exec_schedule@fair-pace-solo:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-pace-solo@bcs0:
    - Statuses : 1 pass(s) 2 skip(s)
    - Exec time: [0.0, 10.62] s

  * igt@gem_exec_schedule@fair-pace-solo@rcs0:
    - Statuses : 3 fail(s)
    - Exec time: [8.45, 8.93] s

  * igt@gem_exec_schedule@fair-pace-solo@vcs0:
    - Statuses : 3 fail(s)
    - Exec time: [8.40, 8.85] s

  * igt@gem_exec_schedule@fair-pace-solo@vcs1:
    - Statuses : 2 fail(s)
    - Exec time: [8.39, 8.53] s

  * igt@gem_exec_schedule@fair-pace-solo@vecs0:
    - Statuses : 1 fail(s) 2 pass(s)
    - Exec time: [8.84, 10.90] s

  * igt@gem_exec_schedule@fair-pace@bcs0:
    - Statuses : 2 fail(s) 3 skip(s)
    - Exec time: [0.0, 10.61] s

  * igt@gem_exec_schedule@fair-pace@rcs0:
    - Statuses : 4 fail(s) 1 pass(s)
    - Exec time: [8.49, 10.75] s

  * igt@gem_exec_schedule@fair-pace@vcs0:
    - Statuses : 4 fail(s) 1 pass(s)
    - Exec time: [8.43, 10.62] s

  * igt@gem_exec_schedule@fair-pace@vcs1:
    - Statuses : 3 fail(s)
    - Exec time: [8.57, 10.63] s

  * igt@gem_exec_schedule@fair-pace@vecs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [8.42, 12.44] s

  * igt@gem_exec_schedule@fair-spare:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-spare@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.47] s

  * igt@gem_exec_schedule@fair-spare@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.03, 13.77] s

  * igt@gem_exec_schedule@fair-spare@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.94, 14.36] s

  * igt@gem_exec_schedule@fair-spare@vcs1:
    - Statuses : 3 pass(s)
    - Exec time: [13.13, 13.94] s

  * igt@gem_exec_schedule@fair-spare@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.94, 13.97] s

  * igt@gem_exec_schedule@fair-sync:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-sync-solo:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-sync-solo@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 12.90] s

  * igt@gem_exec_schedule@fair-sync-solo@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [10.56, 19.98] s

  * igt@gem_exec_schedule@fair-sync-solo@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [10.57, 23.62] s

  * igt@gem_exec_schedule@fair-sync-solo@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [10.53, 13.56] s

  * igt@gem_exec_schedule@fair-sync-solo@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [10.56, 23.55] s

  * igt@gem_exec_schedule@fair-sync-vip:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-sync-vip@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.13] s

  * igt@gem_exec_schedule@fair-sync-vip@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.83, 13.75] s

  * igt@gem_exec_schedule@fair-sync-vip@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.82, 13.96] s

  * igt@gem_exec_schedule@fair-sync-vip@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [12.98, 13.96] s

  * igt@gem_exec_schedule@fair-sync-vip@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.46, 12.99] s

  * igt@gem_exec_schedule@fair-sync@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 13.01] s

  * igt@gem_exec_schedule@fair-sync@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.72, 13.66] s

  * igt@gem_exec_schedule@fair-sync@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.69, 13.78] s

  * igt@gem_exec_schedule@fair-sync@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [12.89, 13.76] s

  * igt@gem_exec_schedule@fair-sync@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.67, 13.75] s

  * igt@gem_exec_schedule@fair-throttle:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-throttle-rrul:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-throttle-rrul@bcs0:
    - Statuses : 1 fail(s) 1 pass(s) 3 skip(s)
    - Exec time: [0.0, 11.56] s

  * igt@gem_exec_schedule@fair-throttle-rrul@rcs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [6.37, 12.59] s

  * igt@gem_exec_schedule@fair-throttle-rrul@vcs0:
    - Statuses : 2 fail(s) 3 pass(s)
    - Exec time: [8.92, 12.71] s

  * igt@gem_exec_schedule@fair-throttle-rrul@vcs1:
    - Statuses : 1 fail(s) 2 pass(s)
    - Exec time: [11.04, 11.42] s

  * igt@gem_exec_schedule@fair-throttle-rrul@vecs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [8.80, 11.48] s

  * igt@gem_exec_schedule@fair-throttle-share:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-throttle-share@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 11.58] s

  * igt@gem_exec_schedule@fair-throttle-share@rcs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [8.84, 12.03] s

  * igt@gem_exec_schedule@fair-throttle-share@vcs0:
    - Statuses : 1 fail(s) 4 pass(s)
    - Exec time: [8.89, 12.09] s

  * igt@gem_exec_schedule@fair-throttle-share@vcs1:
    - Statuses : 1 fail(s) 1 pass(s)
    - Exec time: [11.26, 11.51] s

  * igt@gem_exec_schedule@fair-throttle-share@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [8.88, 11.96] s

  * igt@gem_exec_schedule@fair-throttle-solo:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-throttle-solo@bcs0:
    - Statuses : 2 fail(s) 3 skip(s)
    - Exec time: [0.0, 12.88] s

  * igt@gem_exec_schedule@fair-throttle-solo@rcs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [11.56, 13.53] s

  * igt@gem_exec_schedule@fair-throttle-solo@vcs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [11.95, 13.24] s

  * igt@gem_exec_schedule@fair-throttle-solo@vcs1:
    - Statuses : 2 fail(s)
    - Exec time: [11.73, 12.87] s

  * igt@gem_exec_schedule@fair-throttle-solo@vecs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [11.65, 13.22] s

  * igt@gem_exec_schedule@fair-throttle-vip:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fair-throttle-vip@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 12.21] s

  * igt@gem_exec_schedule@fair-throttle-vip@rcs0:
    - Statuses : 2 fail(s) 3 pass(s)
    - Exec time: [9.18, 11.64] s

  * igt@gem_exec_schedule@fair-throttle-vip@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.29, 13.30] s

  * igt@gem_exec_schedule@fair-throttle-vip@vcs1:
    - Statuses : 1 fail(s) 1 pass(s)
    - Exec time: [11.52, 12.18] s

  * igt@gem_exec_schedule@fair-throttle-vip@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.36, 13.21] s

  * igt@gem_exec_schedule@fair-throttle@bcs0:
    - Statuses : 2 pass(s) 3 skip(s)
    - Exec time: [0.0, 11.58] s

  * igt@gem_exec_schedule@fair-throttle@rcs0:
    - Statuses : 3 fail(s) 2 pass(s)
    - Exec time: [8.82, 11.49] s

  * igt@gem_exec_schedule@fair-throttle@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.06, 13.65] s

  * igt@gem_exec_schedule@fair-throttle@vcs1:
    - Statuses : 1 fail(s) 1 pass(s)
    - Exec time: [11.02, 11.85] s

  * igt@gem_exec_schedule@fair-throttle@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [9.00, 11.72] s

  * igt@gem_exec_schedule@fairslice:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@fairslice-all:
    - Statuses : 5 pass(s) 2 skip(s)
    - Exec time: [0.0, 2.10] s

  * igt@gem_exec_schedule@fairslice@bcs0:
    - Statuses : 5 pass(s)
    - Exec time: [2.01, 2.03] s

  * igt@gem_exec_schedule@fairslice@rcs0:
    - Statuses : 5 pass(s)
    - Exec time: [2.01, 2.03] s

  * igt@gem_exec_schedule@fairslice@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [2.01, 2.02] s

  * igt@gem_exec_schedule@fairslice@vcs1:
    - Statuses : 2 pass(s)
    - Exec time: [2.01] s

  * igt@gem_exec_schedule@fairslice@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [2.01, 2.03] s

  * igt@gem_exec_schedule@u-fairslice:
    - Statuses : 2 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_schedule@u-fairslice-all:
    - Statuses : 5 pass(s) 2 skip(s)
    - Exec time: [0.0, 2.11] s

  * igt@gem_exec_schedule@u-fairslice@bcs0:
    - Statuses : 5 pass(s)
    - Exec time: [2.00, 2.02] s

  * igt@gem_exec_schedule@u-fairslice@rcs0:
    - Statuses : 2 dmesg-warn(s) 3 pass(s)
    - Exec time: [2.01, 2.04] s

  * igt@gem_exec_schedule@u-fairslice@vcs0:
    - Statuses : 5 pass(s)
    - Exec time: [2.00, 2.02] s

  * igt@gem_exec_schedule@u-fairslice@vcs1:
    - Statuses : 3 pass(s)
    - Exec time: [2.00, 2.01] s

  * igt@gem_exec_schedule@u-fairslice@vecs0:
    - Statuses : 5 pass(s)
    - Exec time: [2.00, 2.02] s

  

Known issues
------------

  Here are the changes found in IGTPW_5222_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_pm_rc6_residency@rc6-idle:
    - shard-hsw:          [PASS][12] -> [WARN][13] ([i915#1519])
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-hsw1/igt@i915_pm_rc6_residency@rc6-idle.html
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-hsw1/igt@i915_pm_rc6_residency@rc6-idle.html

  * igt@i915_selftest@live@active:
    - shard-tglb:         [PASS][14] -> [DMESG-FAIL][15] ([i915#666])
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-tglb2/igt@i915_selftest@live@active.html
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-tglb2/igt@i915_selftest@live@active.html

  * igt@kms_big_fb@x-tiled-16bpp-rotate-0:
    - shard-hsw:          [PASS][16] -> [DMESG-WARN][17] ([i915#1982]) +1 similar issue
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-hsw2/igt@kms_big_fb@x-tiled-16bpp-rotate-0.html
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-hsw6/igt@kms_big_fb@x-tiled-16bpp-rotate-0.html

  * igt@kms_cursor_legacy@basic-flip-before-cursor-varying-size:
    - shard-iclb:         [PASS][18] -> [DMESG-WARN][19] ([i915#1982])
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb5/igt@kms_cursor_legacy@basic-flip-before-cursor-varying-size.html
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb8/igt@kms_cursor_legacy@basic-flip-before-cursor-varying-size.html

  * igt@kms_cursor_legacy@cursorb-vs-flipa-atomic-transitions:
    - shard-glk:          [PASS][20] -> [DMESG-WARN][21] ([i915#1982]) +2 similar issues
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-glk8/igt@kms_cursor_legacy@cursorb-vs-flipa-atomic-transitions.html
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-glk9/igt@kms_cursor_legacy@cursorb-vs-flipa-atomic-transitions.html

  * igt@kms_cursor_legacy@flip-vs-cursor-crc-atomic:
    - shard-tglb:         [PASS][22] -> [FAIL][23] ([i915#2346])
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-tglb1/igt@kms_cursor_legacy@flip-vs-cursor-crc-atomic.html
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-tglb1/igt@kms_cursor_legacy@flip-vs-cursor-crc-atomic.html

  * igt@kms_draw_crc@draw-method-rgb565-render-untiled:
    - shard-iclb:         [PASS][24] -> [FAIL][25] ([i915#52] / [i915#54])
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb7/igt@kms_draw_crc@draw-method-rgb565-render-untiled.html
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb8/igt@kms_draw_crc@draw-method-rgb565-render-untiled.html

  * igt@kms_flip@flip-vs-suspend@c-hdmi-a1:
    - shard-hsw:          [PASS][26] -> [INCOMPLETE][27] ([i915#2055])
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-hsw2/igt@kms_flip@flip-vs-suspend@c-hdmi-a1.html
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-hsw2/igt@kms_flip@flip-vs-suspend@c-hdmi-a1.html

  * igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-pri-indfb-draw-blt:
    - shard-glk:          [PASS][28] -> [FAIL][29] ([i915#49])
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-glk2/igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-pri-indfb-draw-blt.html
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-glk8/igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-pri-indfb-draw-blt.html

  * igt@kms_frontbuffer_tracking@fbc-farfromfence:
    - shard-kbl:          [PASS][30] -> [DMESG-WARN][31] ([i915#1982]) +2 similar issues
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-kbl7/igt@kms_frontbuffer_tracking@fbc-farfromfence.html
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-kbl3/igt@kms_frontbuffer_tracking@fbc-farfromfence.html

  * igt@kms_frontbuffer_tracking@fbcpsr-rgb101010-draw-render:
    - shard-tglb:         [PASS][32] -> [DMESG-WARN][33] ([i915#1982]) +1 similar issue
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-tglb7/igt@kms_frontbuffer_tracking@fbcpsr-rgb101010-draw-render.html
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-tglb5/igt@kms_frontbuffer_tracking@fbcpsr-rgb101010-draw-render.html

  * igt@kms_pipe_crc_basic@nonblocking-crc-pipe-c:
    - shard-apl:          [PASS][34] -> [DMESG-WARN][35] ([i915#1635] / [i915#1982]) +5 similar issues
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-apl4/igt@kms_pipe_crc_basic@nonblocking-crc-pipe-c.html
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-apl8/igt@kms_pipe_crc_basic@nonblocking-crc-pipe-c.html

  * igt@kms_psr@psr2_sprite_plane_move:
    - shard-iclb:         [PASS][36] -> [SKIP][37] ([fdo#109441]) +1 similar issue
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb2/igt@kms_psr@psr2_sprite_plane_move.html
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb6/igt@kms_psr@psr2_sprite_plane_move.html

  * igt@perf@polling-parameterized:
    - shard-iclb:         [PASS][38] -> [FAIL][39] ([i915#1542])
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb4/igt@perf@polling-parameterized.html
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb6/igt@perf@polling-parameterized.html

  
#### Possible fixes ####

  * igt@device_reset@unbind-reset-rebind:
    - shard-glk:          [INCOMPLETE][40] ([i915#2283] / [i915#2405]) -> [PASS][41]
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-glk5/igt@device_reset@unbind-reset-rebind.html
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-glk1/igt@device_reset@unbind-reset-rebind.html
    - shard-apl:          [INCOMPLETE][42] ([i915#1635] / [i915#2283] / [i915#2405]) -> [PASS][43]
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-apl1/igt@device_reset@unbind-reset-rebind.html
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-apl6/igt@device_reset@unbind-reset-rebind.html
    - shard-kbl:          [INCOMPLETE][44] ([i915#2283] / [i915#2405]) -> [PASS][45]
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-kbl7/igt@device_reset@unbind-reset-rebind.html
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-kbl6/igt@device_reset@unbind-reset-rebind.html
    - shard-tglb:         [INCOMPLETE][46] ([i915#1602] / [i915#750]) -> [PASS][47]
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-tglb8/igt@device_reset@unbind-reset-rebind.html
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-tglb8/igt@device_reset@unbind-reset-rebind.html
    - shard-iclb:         [INCOMPLETE][48] ([i915#2283] / [i915#2405]) -> [PASS][49]
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb8/igt@device_reset@unbind-reset-rebind.html
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb4/igt@device_reset@unbind-reset-rebind.html

  * {igt@gem_exec_capture@pi@bcs0}:
    - shard-iclb:         [INCOMPLETE][50] ([i915#2369] / [i915#2502]) -> [PASS][51]
   [50]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb5/igt@gem_exec_capture@pi@bcs0.html
   [51]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb3/igt@gem_exec_capture@pi@bcs0.html

  * igt@gem_exec_whisper@basic-fds-forked:
    - shard-glk:          [DMESG-WARN][52] ([i915#118] / [i915#95]) -> [PASS][53]
   [52]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-glk6/igt@gem_exec_whisper@basic-fds-forked.html
   [53]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-glk8/igt@gem_exec_whisper@basic-fds-forked.html

  * igt@i915_pm_backlight@fade_with_suspend:
    - shard-iclb:         [DMESG-WARN][54] -> [PASS][55]
   [54]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb6/igt@i915_pm_backlight@fade_with_suspend.html
   [55]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/shard-iclb7/igt@i915_pm_backlight@fade_with_suspend.html

  * igt@i915_pm_dc@dc6-psr:
    - shard-iclb:         [FAIL][56] ([i915#454]) -> [PASS][57]
   [56]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9385/shard-iclb8/igt@i915_pm_dc@dc6-psr.html
   [57]: https://intel-gfx-ci.01.org/tree/d

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_5222/index.html


* Re: [Intel-gfx] [igt-dev] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
  2020-11-24 23:39 ` [igt-dev] " Chris Wilson
@ 2020-11-25 11:25   ` Tvrtko Ursulin
  -1 siblings, 0 replies; 20+ messages in thread
From: Tvrtko Ursulin @ 2020-11-25 11:25 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: igt-dev


On 24/11/2020 23:39, Chris Wilson wrote:
> An important property for multi-client systems is that each client gets
> a 'fair' allotment of system time. (Where fairness is at the whim of the
> context properties, such as priorities.) This test forks N independent
> clients (albeit they happen to share a single vm), and does an equal
> amount of work in client and asserts that they take an equal amount of
> time.
> 
> Though we have never claimed to have a completely fair scheduler, that
> is what is expected.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Ramalingam C <ramalingam.c@intel.com>
> ---
>   tests/i915/gem_exec_schedule.c | 847 +++++++++++++++++++++++++++++++++
>   1 file changed, 847 insertions(+)
> 
> diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
> index f23d63ac3..d888efcd7 100644
> --- a/tests/i915/gem_exec_schedule.c
> +++ b/tests/i915/gem_exec_schedule.c
> @@ -29,6 +29,7 @@
>   #include <sys/poll.h>
>   #include <sys/ioctl.h>
>   #include <sys/mman.h>
> +#include <sys/resource.h>
>   #include <sys/syscall.h>
>   #include <sched.h>
>   #include <signal.h>
> @@ -2516,6 +2517,819 @@ static void measure_semaphore_power(int i915)
>   	rapl_close(&pkg);
>   }
>   
> +static int read_timestamp_frequency(int i915)
> +{
> +	int value = 0;
> +	drm_i915_getparam_t gp = {
> +		.value = &value,
> +		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
> +	};
> +	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
> +	return value;
> +}
> +
> +static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
> +{
> +	return (x + y - 1) / y;
> +}
> +
> +static uint64_t ns_to_ctx_ticks(int i915, uint64_t ns)
> +{
> +	int f = read_timestamp_frequency(i915);
> +	if (intel_gen(intel_get_drm_devid(i915)) == 11)
> +		f = 12500000; /* icl!!! are you feeling alright? CTX vs CS */
> +	return div64_u64_round_up(ns * f, NSEC_PER_SEC);
> +}
> +
> +static uint64_t ticks_to_ns(int i915, uint64_t ticks)
> +{
> +	return div64_u64_round_up(ticks * NSEC_PER_SEC,
> +				  read_timestamp_frequency(i915));
> +}
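
As a sanity check on the units (my arithmetic, not from the patch): assuming
read_timestamp_frequency() reports 12500000, i.e. the 12.5 MHz hardcoded for
icl above, the conversions work out as

	ns_to_ctx_ticks: 1 ms -> ceil(1000000 * 12500000 / 1e9) = 12500 ticks
	ticks_to_ns:     12500 ticks -> ceil(12500 * 1e9 / 12500000) = 1000000 ns

so a millisecond round-trips exactly at that frequency.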
> +
> +#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
> +
> +#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
> +#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
> +/* Opcodes for MI_MATH_INSTR */
> +#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
> +#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
> +#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
> +#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
> +#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
> +#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
> +#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
> +#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
> +#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
> +#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
> +#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
> +#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
> +/* Registers used as operands in MI_MATH_INSTR */
> +#define   MI_MATH_REG(x)                (x)
> +#define   MI_MATH_REG_SRCA              0x20
> +#define   MI_MATH_REG_SRCB              0x21
> +#define   MI_MATH_REG_ACCU              0x31
> +#define   MI_MATH_REG_ZF                0x32
> +#define   MI_MATH_REG_CF                0x33
> +
> +#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
> +
> +static void delay(int i915,
> +		  const struct intel_execution_engine2 *e,
> +		  uint32_t handle,
> +		  uint64_t addr,
> +		  uint64_t ns)
> +{
> +	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
> +	const uint32_t base = gem_engine_mmio_base(i915, e->name);
> +#define CS_GPR(x) (base + 0x600 + 8 * (x))
> +#define RUNTIME (base + 0x3a8)
> +	enum { START_TS, NOW_TS };
> +	uint32_t *map, *cs, *jmp;
> +
> +	igt_require(base);
> +
> +	/* Loop until CTX_TIMESTAMP - initial > @ns */
> +
> +	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(START_TS) + 4;
> +	*cs++ = 0;
> +	*cs++ = MI_LOAD_REGISTER_REG;
> +	*cs++ = RUNTIME;
> +	*cs++ = CS_GPR(START_TS);
> +
> +	while (offset_in_page(cs) & 63)
> +		*cs++ = 0;
> +	jmp = cs;
> +
> +	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(NOW_TS) + 4;
> +	*cs++ = 0;
> +	*cs++ = MI_LOAD_REGISTER_REG;
> +	*cs++ = RUNTIME;
> +	*cs++ = CS_GPR(NOW_TS);
> +
> +	/* delta = now - start; inverted to match COND_BBE */
> +	*cs++ = MI_MATH(4);
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
> +	*cs++ = MI_MATH_SUB;
> +	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
> +
> +	/* Save delta for reading by COND_BBE */
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = CS_GPR(NOW_TS);
> +	*cs++ = addr + 4000;
> +	*cs++ = addr >> 32;
> +
> +	/* Delay between SRM and COND_BBE to post the writes */
> +	for (int n = 0; n < 8; n++) {
> +		*cs++ = MI_STORE_DWORD_IMM;
> +		if (use_64b) {
> +			*cs++ = addr + 4064;
> +			*cs++ = addr >> 32;
> +		} else {
> +			*cs++ = 0;
> +			*cs++ = addr + 4064;
> +		}
> +		*cs++ = 0;
> +	}
> +
> +	/* Break if delta [time elapsed] > ns */
> +	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
> +	*cs++ = ~ns_to_ctx_ticks(i915, ns);
> +	*cs++ = addr + 4000;
> +	*cs++ = addr >> 32;
> +
> +	/* Otherwise back to recalculating delta */
> +	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
> +	*cs++ = addr + offset_in_page(jmp);
> +	*cs++ = addr >> 32;
> +
> +	munmap(map, 4096);
> +}
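
If I read the MI stream right, the batch boils down to this CPU-side model
(a sketch; read_ts() stands in for the MI_LOAD_REGISTER_REG read of
CTX_TIMESTAMP, and the names are mine, not from the patch):

	static void delay_model(uint64_t (*read_ts)(void), uint64_t target_ticks)
	{
		uint64_t start = read_ts();

		/* each pass of the real loop begins with MI_ARB_CHECK, so
		 * the busy-wait stays preemptible */
		while (read_ts() - start <= target_ticks)
			;
	}

with target_ticks = ns_to_ctx_ticks(i915, ns), and the delta stored inverted
so that MI_COND_BATCH_BUFFER_END can perform the comparison, per the comments
above.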
> +
> +static struct drm_i915_gem_exec_object2
> +delay_create(int i915, uint32_t ctx,
> +	     const struct intel_execution_engine2 *e,
> +	     uint64_t target_ns)
> +{
> +	struct drm_i915_gem_exec_object2 obj = {
> +		.handle = batch_create(i915),
> +		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(&obj),
> +		.buffer_count = 1,
> +		.rsvd1 = ctx,
> +		.flags = e->flags,
> +	};
> +
> +	obj.offset = obj.handle << 12;
> +	gem_execbuf(i915, &execbuf);
> +	gem_sync(i915, obj.handle);
> +
> +	delay(i915, e, obj.handle, obj.offset, target_ns);
> +
> +	obj.flags |= EXEC_OBJECT_PINNED;
> +	return obj;
> +}
> +
> +static void tslog(int i915,
> +		  const struct intel_execution_engine2 *e,
> +		  uint32_t handle,
> +		  uint64_t addr)
> +{
> +	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
> +	const uint32_t base = gem_engine_mmio_base(i915, e->name);
> +#define CS_GPR(x) (base + 0x600 + 8 * (x))
> +#define CS_TIMESTAMP (base + 0x358)
> +	enum { INC, MASK, ADDR };
> +	uint32_t *timestamp_lo, *addr_lo;
> +	uint32_t *map, *cs;
> +
> +	igt_require(base);
> +
> +	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
> +	cs = map + 512;
> +
> +	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = CS_TIMESTAMP;
> +	timestamp_lo = cs;
> +	*cs++ = addr;
> +	*cs++ = addr >> 32;
> +
> +	/* Load the address + inc & mask variables */
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(ADDR);
> +	addr_lo = cs;
> +	*cs++ = addr;
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(ADDR) + 4;
> +	*cs++ = addr >> 32;
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(INC);
> +	*cs++ = 4;
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(INC) + 4;
> +	*cs++ = 0;
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(MASK);
> +	*cs++ = 0xfffff7ff;
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(MASK) + 4;
> +	*cs++ = 0xffffffff;
> +
> +	/* Increment the [ring] address for saving CS_TIMESTAMP */
> +	*cs++ = MI_MATH(8);
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
> +	*cs++ = MI_MATH_ADD;
> +	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
> +	*cs++ = MI_MATH_AND;
> +	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
> +
> +	/* Rewrite the batch buffer for the next execution */
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = CS_GPR(ADDR);
> +	*cs++ = addr + offset_in_page(timestamp_lo);
> +	*cs++ = addr >> 32;
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = CS_GPR(ADDR);
> +	*cs++ = addr + offset_in_page(addr_lo);
> +	*cs++ = addr >> 32;
> +
> +	*cs++ = MI_BATCH_BUFFER_END;
> +
> +	munmap(map, 4096);
> +}
> +
> +static struct drm_i915_gem_exec_object2
> +tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
> +{
> +	struct drm_i915_gem_exec_object2 obj = {
> +		.handle = batch_create(i915),
> +		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(&obj),
> +		.buffer_count = 1,
> +		.rsvd1 = ctx,
> +		.flags = e->flags,
> +	};
> +
> +	obj.offset = obj.handle << 12;
> +	gem_execbuf(i915, &execbuf);
> +	gem_sync(i915, obj.handle);
> +
> +	tslog(i915, e, obj.handle, obj.offset);
> +
> +	obj.flags |= EXEC_OBJECT_PINNED;
> +	return obj;
> +}
> +
> +static int cmp_u32(const void *A, const void *B)
> +{
> +	const uint32_t *a = A, *b = B;
> +
> +	if (*a < *b)
> +		return -1;
> +	else if (*a > *b)
> +		return 1;
> +	else
> +		return 0;
> +}
> +
> +static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e)
> +{
> +	const int gen = intel_gen(intel_get_drm_devid(i915));
> +
> +	if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO)
> +		return false; /* looks fubar */
> +
> +	return true;
> +}
> +
> +static struct intel_execution_engine2
> +pick_random_engine(int i915, const struct intel_execution_engine2 *not)
> +{
> +	const struct intel_execution_engine2 *e;
> +	unsigned int count = 0;
> +
> +	__for_each_physical_engine(i915, e) {
> +		if (e->flags == not->flags)
> +			continue;
> +		if (!gem_class_has_mutable_submission(i915, e->class))
> +			continue;
> +		count++;
> +	}
> +	if (!count)
> +		return *not;
> +
> +	count = rand() % count;
> +	__for_each_physical_engine(i915, e) {
> +		if (e->flags == not->flags)
> +			continue;
> +		if (!gem_class_has_mutable_submission(i915, e->class))
> +			continue;
> +		if (!count--)
> +			break;
> +	}
> +
> +	return *e;
> +}
> +
> +static void fair_child(int i915, uint32_t ctx,
> +		       const struct intel_execution_engine2 *e,
> +		       uint64_t frame_ns,
> +		       int timeline,
> +		       uint32_t common,
> +		       unsigned int flags,
> +		       unsigned long *ctl,
> +		       unsigned long *out)
> +#define F_SYNC		(1 << 0)
> +#define F_PACE		(1 << 1)
> +#define F_FLOW		(1 << 2)
> +#define F_HALF		(1 << 3)
> +#define F_SOLO		(1 << 4)
> +#define F_SPARE		(1 << 5)
> +#define F_NEXT		(1 << 6)
> +#define F_VIP		(1 << 7)
> +#define F_RRUL		(1 << 8)
> +#define F_SHARE		(1 << 9)
> +#define F_PING		(1 << 10)
> +#define F_THROTTLE	(1 << 11)
> +#define F_ISOLATE	(1 << 12)
> +{
> +	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
> +	struct drm_i915_gem_exec_object2 obj[4] = {
> +		{},
> +		{
> +			.handle = common ?: gem_create(i915, 4096),
> +		},
> +		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
> +		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
> +	};
> +	struct intel_execution_engine2 ping = *e;
> +	int p_fence = -1, n_fence = -1;
> +	unsigned long count = 0;
> +	int n;
> +
> +	srandom(getpid());
> +	if (flags & F_PING)
> +		ping = pick_random_engine(i915, e);
> +	obj[0] = tslog_create(i915, ctx, &ping);
> +
> +	while (!READ_ONCE(*ctl)) {
> +		struct drm_i915_gem_execbuffer2 execbuf = {
> +			.buffers_ptr = to_user_pointer(obj),
> +			.buffer_count = 4,
> +			.rsvd1 = ctx,
> +			.rsvd2 = -1,
> +			.flags = e->flags,
> +		};
> +
> +		if (flags & F_FLOW) {
> +			unsigned int seq;
> +
> +			seq = count;
> +			if (flags & F_NEXT)
> +				seq++;
> +
> +			execbuf.rsvd2 =
> +				sw_sync_timeline_create_fence(timeline, seq);
> +			execbuf.flags |= I915_EXEC_FENCE_IN;
> +		}
> +
> +		execbuf.flags |= I915_EXEC_FENCE_OUT;
> +		gem_execbuf_wr(i915, &execbuf);
> +		n_fence = execbuf.rsvd2 >> 32;
> +		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
> +		for (n = 1; n < batches_per_frame; n++)
> +			gem_execbuf(i915, &execbuf);
> +		close(execbuf.rsvd2);
> +
> +		execbuf.buffer_count = 1;
> +		execbuf.batch_start_offset = 2048;
> +		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
> +		execbuf.rsvd2 = n_fence;
> +		gem_execbuf(i915, &execbuf);
> +
> +		if (flags & F_PACE && p_fence != -1) {
> +			struct pollfd pfd = {
> +				.fd = p_fence,
> +				.events = POLLIN,
> +			};
> +			poll(&pfd, 1, -1);
> +		}
> +		close(p_fence);
> +
> +		if (flags & F_SYNC) {
> +			struct pollfd pfd = {
> +				.fd = n_fence,
> +				.events = POLLIN,
> +			};
> +			poll(&pfd, 1, -1);
> +		}
> +
> +		if (flags & F_THROTTLE)
> +			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
> +
> +		igt_swap(obj[2], obj[3]);
> +		igt_swap(p_fence, n_fence);

What are the sync fences simulating, and how come they are always used? I
mean, are there no children which submit batched-up load?

> +		count++;
> +	}
> +	close(p_fence);
> +
> +	gem_close(i915, obj[3].handle);
> +	gem_close(i915, obj[2].handle);
> +	if (obj[1].handle != common)
> +		gem_close(i915, obj[1].handle);
> +
> +	gem_sync(i915, obj[0].handle);
> +	if (out) {
> +		uint32_t *map;
> +
> +		map = gem_mmap__device_coherent(i915, obj[0].handle,
> +						0, 4096, PROT_WRITE);
> +		for (n = 1; n < min(count, 512); n++) {
> +			igt_assert(map[n]);
> +			map[n - 1] = map[n] - map[n - 1];
> +		}
> +		qsort(map, --n, sizeof(*map), cmp_u32);
> +		*out = ticks_to_ns(i915, map[n / 2]);

What is returned? Could you explain the ts journal part a bit?
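
Partly answering my own question after a re-read: tslog() above appends one
CS_TIMESTAMP per frame into a 512-slot ring, and this loop rewrites
consecutive samples into intervals before taking the median, so *out looks
to be the median inter-frame interval in ns. Equivalent to (reusing cmp_u32
from the patch):

	static uint32_t median_interval(uint32_t *ts, unsigned int n)
	{
		/* consecutive timestamps -> intervals, in place */
		for (unsigned int i = 1; i < n; i++)
			ts[i - 1] = ts[i] - ts[i - 1];

		qsort(ts, n - 1, sizeof(*ts), cmp_u32);
		return ts[(n - 1) / 2]; /* median, in ctx ticks */
	}

Confirmation either way would be good.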

> +		munmap(map, 4096);
> +	}
> +	gem_close(i915, obj[0].handle);
> +}
> +
> +static int cmp_ul(const void *A, const void *B)
> +{
> +	const unsigned long *a = A, *b = B;
> +
> +	if (*a < *b)
> +		return -1;
> +	else if (*a > *b)
> +		return 1;
> +	else
> +		return 0;
> +}
> +
> +static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
> +{
> +	uint64_t cpu_time = 0;
> +
> +	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
> +	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
> +
> +	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
> +	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
> +
> +	return cpu_time;
> +}
> +
> +static void timeline_advance(int timeline, int delay_ns)
> +{
> +	struct timespec tv = { .tv_nsec = delay_ns };
> +	nanosleep(&tv, NULL);
> +	sw_sync_timeline_inc(timeline, 1);
> +}
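
So each call sleeps one fence period and then releases the next point on the
sw_sync timeline; the main loop further down uses it as the frame clock for
the F_FLOW children, along the lines of

	/* tick the ~60Hz "vblank" that the flow fences wait on */
	while (nfences--)
		timeline_advance(timeline, fence_ns);

which seems a neat way to emulate vblank pacing.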
> +
> +static void fairness(int i915,
> +		     const struct intel_execution_engine2 *e,
> +		     int timeout, unsigned int flags)
> +{
> +	const int frame_ns = 16666 * 1000;
> +	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
> +	unsigned long *result;
> +	uint32_t common = 0;
> +
> +	igt_require(has_ctx_timestamp(i915, e));
> +	igt_require(gem_class_has_mutable_submission(i915, e->class));
> +
> +	if (flags & F_SHARE)
> +		common = gem_create(i915, 4095);
> +
> +	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> +
> +	for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
> +		int timeline = sw_sync_timeline_create();
> +		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
> +		const int nchild = n - 1; /* odd for easy medians */
> +		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
> +		const int lo = nchild / 4;
> +		const int hi = (3 * nchild + 3) / 4 - 1;
> +		struct rusage old_usage, usage;
> +		uint64_t cpu_time, d_time;
> +		unsigned long vip = -1;
> +		struct timespec tv;
> +		struct igt_mean m;
> +
> +		if (flags & F_PING) {
> +			struct intel_execution_engine2 *ping;
> +
> +			__for_each_physical_engine(i915, ping) {
> +				if (ping->flags == e->flags)
> +					continue;
> +
> +				igt_fork(child, 1) {
> +					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
> +
> +					fair_child(i915, ctx, ping,
> +						   child_ns / 8,
> +						   -1, common,
> +						   F_SOLO | F_PACE | F_SHARE,
> +						   &result[nchild],
> +						   NULL);
> +
> +					gem_context_destroy(i915, ctx);
> +				}
> +			}
> +		}
> +
> +		memset(result, 0, (nchild + 1) * sizeof(result[0]));

The children probably can't write into it before this point, but it would
still be better to move the memset before the first fork (which passes the
results array to the children).

> +		getrusage(RUSAGE_CHILDREN, &old_usage);
> +		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
> +		igt_fork(child, nchild) {
> +			uint32_t ctx;
> +
> +			if (flags & F_ISOLATE) {
> +				int clone, dmabuf = -1;
> +
> +				if (common)
> +					dmabuf = prime_handle_to_fd(i915, common);
> +
> +				clone = gem_reopen_driver(i915);
> +				gem_context_copy_engines(i915, 0, clone, 0);
> +				i915 = clone;
> +
> +				if (dmabuf != -1)
> +					common = prime_fd_to_handle(i915, dmabuf);
> +			}
> +
> +			ctx = gem_context_clone_with_engines(i915, 0);
> +
> +			if (flags & F_VIP && child == 0) {
> +				gem_context_set_priority(i915, ctx, MAX_PRIO);
> +				flags |= F_FLOW;
> +			}
> +			if (flags & F_RRUL && child == 0)
> +				flags |= F_SOLO | F_FLOW | F_SYNC;
> +
> +			fair_child(i915, ctx, e, child_ns,
> +				   timeline, common, flags,
> +				   &result[nchild],
> +				   &result[child]);
> +
> +			gem_context_destroy(i915, ctx);
> +		}
> +
> +		while (nfences--)
> +			timeline_advance(timeline, fence_ns);
> +
> +		result[nchild] = 1;
> +		for (int child = 0; child < nchild; child++) {
> +			while (!READ_ONCE(result[child]))
> +				timeline_advance(timeline, fence_ns);
> +		}
> +
> +		igt_waitchildren();
> +		close(timeline);
> +
> +		/* Are we running out of CPU time, and fail to submit frames? */
> +		d_time = igt_nsec_elapsed(&tv);
> +		getrusage(RUSAGE_CHILDREN, &usage);
> +		cpu_time = d_cpu_time(&usage, &old_usage);
> +		if (10 * cpu_time > 9 * d_time) {
> +			if (nchild > 7)
> +				break;
> +
> +			igt_skip_on_f(10 * cpu_time > 9 * d_time,
> +				      "%.0f%% CPU usage, presuming capacity exceeded\n",
> +				      100. * cpu_time / d_time);

Aren't the children mostly sleeping, waiting on fences and the like? If so,
how/when does the test end up using a lot of CPU time?
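
(For the record, 10 * cpu_time > 9 * d_time is just cpu_time > 0.9 * d_time,
i.e. the children collectively burn more than 90% of the wall-clock time on
the CPU, at which point submission itself is presumably starved.)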

> +		}
> +
> +		igt_mean_init(&m);
> +		for (int child = 0; child < nchild; child++)
> +			igt_mean_add(&m, result[child]);
> +
> +		if (flags & (F_VIP | F_RRUL))
> +			vip = result[0];
> +
> +		qsort(result, nchild, sizeof(*result), cmp_ul);
> +		igt_info("%2d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
> +			 nchild,
> +			 1e-6 * result[0],  1e-6 * result[nchild - 1],
> +			 1e-6 * result[lo], 1e-6 * result[hi],
> +			 1e-6 * result[nchild / 2],
> +			 1e-6 * igt_mean_get(&m),
> +			 1e-6 * sqrt(igt_mean_get_variance(&m)));
> +
> +		if (vip != -1) {
> +			igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
> +			igt_assert(4 * vip > 3 * fence_ns &&
> +				   3 * vip < 4 * fence_ns);
> +		}
> +
> +		/* May be slowed due to sheer volume of context switches */
> +		igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
> +			       igt_mean_get(&m) < 3 * fence_ns);
> +
> +		igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
> +			   3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
> +
> +		igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);

Please put some human-readable text above the asserts explaining the
criteria.

Does the VIP child take part in the mean, and does that not skew the
result?
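
For my own benefit, decoding the ratios as written:

	4 * mean > 3 * fence_ns && mean < 3 * fence_ns
		=> 0.75 * fence_ns < mean < 3 * fence_ns
	4 * mean > 3 * median && 3 * mean < 4 * median
		=> mean within -25% / +33% of the median
	2 * (result[hi] - result[lo]) < median
		=> interquartile range under half the median

If that is the intent, spelling it out in the comments would help.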

> +	}
> +
> +	munmap(result, 4096);
> +	if (common)
> +		gem_close(i915, common);
> +}
> +
> +static void test_fairness(int i915, int timeout)
> +{
> +	static const struct {
> +		const char *name;
> +		unsigned int flags;
> +	} fair[] = {
> +		/*
> +		 * none - maximal greed in each client
> +		 *
> +		 * Push as many frames from each client as fast as possible
> +		 */
> +		{ "none",       0 },
> +		{ "none-vip",   F_VIP }, /* one vip client must meet deadlines */
> +		{ "none-solo",  F_SOLO }, /* 1 batch per frame per client */
> +		{ "none-share", F_SHARE }, /* read from a common buffer */
> +		{ "none-rrul",  F_RRUL }, /* "realtime-response under load" */
> +		{ "none-ping",  F_PING }, /* measure inter-engine fairness */
> +
> +		/*
> +		 * throttle - original per client throttling
> +		 *
> +		 * Used for front-buffer rendering where there is no
> +		 * external frame marker. Each client tries to keep only
> +		 * 20ms of work submitted, though that measurement is
> +		 * flawed...
> +		 *
> +		 * This is used by Xorg to try and maintain some semblance
> +		 * of input/output consistency when being fed a continuous
> +		 * stream of X11 draw requests straight into scanout, where
> +		 * the clients may submit the work faster than it can be drawn.
> +		 *
> +		 * Throttling tracks requests per-file (and assumes that
> +		 * all requests are in submission order across the whole file),
> +		 * so we split each child to its own fd.
> +		 */
> +		{ "throttle",       F_THROTTLE | F_ISOLATE },
> +		{ "throttle-vip",   F_THROTTLE | F_ISOLATE | F_VIP },
> +		{ "throttle-solo",  F_THROTTLE | F_ISOLATE | F_SOLO },
> +		{ "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE },
> +		{ "throttle-rrul",  F_THROTTLE | F_ISOLATE | F_RRUL },
> +
> +		/*
> +		 * pace - mesa "submit double buffering"
> +		 *
> +		 * Submit a frame, wait for previous frame to start. This
> +		 * prevents each client from getting too far ahead of its
> +		 * rendering, maintaining a consistent input/output latency.
> +		 */
> +		{ "pace",       F_PACE },
> +		{ "pace-solo",  F_PACE | F_SOLO},
> +		{ "pace-share", F_PACE | F_SHARE},
> +		{ "pace-ping",  F_PACE | F_SHARE | F_PING},
> +
> +		/* sync - only submit a frame at a time */
> +		{ "sync",      F_SYNC },
> +		{ "sync-vip",  F_SYNC | F_VIP },
> +		{ "sync-solo", F_SYNC | F_SOLO },
> +
> +		/* flow - synchronise execution against the clock (vblank) */
> +		{ "flow",       F_PACE | F_FLOW },
> +		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
> +		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },
> +
> +		/* next - submit ahead of the clock (vblank double buffering) */
> +		{ "next",       F_PACE | F_FLOW | F_NEXT },
> +		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
> +		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },
> +
> +		/* spare - underutilise by a single client timeslice */
> +		{ "spare", F_PACE | F_FLOW | F_SPARE },
> +
> +		/* half - run at half pace (submit 16ms of work every 32ms) */
> +		{ "half",  F_PACE | F_FLOW | F_HALF },
> +
> +		{}
> +	};
> +
> +	igt_fixture {
> +		igt_info("CS timestamp frequency: %d\n",
> +			 read_timestamp_frequency(i915));
> +
> +		igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
> +	}
> +
> +	for (typeof(*fair) *f = fair; f->name; f++) {
> +		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
> +			const struct intel_execution_engine2 *e;
> +
> +			__for_each_physical_engine(i915, e) {
> +				if (!gem_class_can_store_dword(i915, e->class))
> +					continue;
> +
> +				igt_dynamic_f("%s", e->name)
> +					fairness(i915, e, timeout, f->flags);
> +			}
> +		}
> +	}
> +}
> +
> +static uint32_t read_ctx_timestamp(int i915,
> +				   uint32_t ctx,
> +				   const struct intel_execution_engine2 *e)
> +{
> +	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
> +	const uint32_t base = gem_engine_mmio_base(i915, e->name);
> +	struct drm_i915_gem_relocation_entry reloc;
> +	struct drm_i915_gem_exec_object2 obj = {
> +		.handle = gem_create(i915, 4096),
> +		.offset = 32 << 20,
> +		.relocs_ptr = to_user_pointer(&reloc),
> +		.relocation_count = 1,
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(&obj),
> +		.buffer_count = 1,
> +		.flags = e->flags,
> +		.rsvd1 = ctx,
> +	};
> +#define RUNTIME (base + 0x3a8)
> +	uint32_t *map, *cs;
> +	uint32_t ts;
> +
> +	igt_require(base);
> +
> +	cs = map = gem_mmap__device_coherent(i915, obj.handle,
> +					     0, 4096, PROT_WRITE);
> +
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = RUNTIME;
> +	memset(&reloc, 0, sizeof(reloc));
> +	reloc.target_handle = obj.handle;
> +	reloc.presumed_offset = obj.offset;
> +	reloc.offset = offset_in_page(cs);
> +	reloc.delta = 4000;
> +	*cs++ = obj.offset + 4000;
> +	*cs++ = obj.offset >> 32;
> +
> +	*cs++ = MI_BATCH_BUFFER_END;
> +
> +	gem_execbuf(i915, &execbuf);
> +	gem_sync(i915, obj.handle);
> +	gem_close(i915, obj.handle);
> +
> +	ts = map[1000];
> +	munmap(map, 4096);
> +
> +	return ts;
> +}
> +
> +static void fairslice(int i915,
> +		      const struct intel_execution_engine2 *e,
> +		      unsigned long flags)
> +{
> +	igt_spin_t *spin = NULL;
> +	uint32_t ctx[3];
> +	uint32_t ts[3];
> +
> +	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
> +		ctx[i] = gem_context_clone_with_engines(i915, 0);
> +		if (spin == NULL) {
> +			spin = __igt_spin_new(i915,
> +					      .ctx = ctx[i],
> +					      .engine = e->flags,
> +					      .flags = flags);
> +		} else {
> +			struct drm_i915_gem_execbuffer2 eb = {
> +				.buffer_count = 1,
> +				.buffers_ptr = to_user_pointer(&spin->obj[IGT_SPIN_BATCH]),
> +				.flags = e->flags,
> +				.rsvd1 = ctx[i],
> +			};
> +			gem_execbuf(i915, &eb);
> +		}
> +	}
> +
> +	sleep(2); /* over the course of many timeslices */
> +
> +	igt_assert(gem_bo_busy(i915, spin->handle));
> +	igt_spin_end(spin);
> +	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> +		ts[i] = read_ctx_timestamp(i915, ctx[i], e);
> +
> +	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> +		gem_context_destroy(i915, ctx[i]);
> +	igt_spin_free(i915, spin);
> +
> +	qsort(ts, 3, sizeof(*ts), cmp_u32);
> +	igt_info("%s: [%.1f, %.1f] ms\n", e->name,
> +		 1e-6 * ticks_to_ns(i915, ts[0]),
> +		 1e-6 * ticks_to_ns(i915, ts[2]));

Log all three just as well?

> +
> +	igt_assert(ts[0] && ts[2] > ts[0]);
> +	igt_assert(4 * ts[0] > 3 * ts[2]);

Three equal priority contexts - why would the distribution be expected
to be unfair? Intuitively I'd expect a check that all three are within
some tolerance of each other, but okay, min and max are good enough; I
just don't understand the asserts. Max can just as well be equal to
min, no? I mean, a scheduler would still be considered fair. We should
ignore the submission order, I think, if that was the point.

> +}
> +
>   #define test_each_engine(T, i915, e) \
>   	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
>   		igt_dynamic_f("%s", e->name)
> @@ -2582,6 +3396,35 @@ igt_main
>   		test_each_engine("lateslice", fd, e)
>   			lateslice(fd, e->flags);
>   
> +		igt_subtest_group {
> +			igt_fixture {
> +				igt_require(gem_scheduler_has_semaphores(fd));
> +				igt_require(gem_scheduler_has_preemption(fd));
> +				igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8);
> +			}
> +
> +			test_each_engine("fairslice", fd, e)
> +				fairslice(fd, e, 0);
> +
> +			test_each_engine("u-fairslice", fd, e)
> +				fairslice(fd, e, IGT_SPIN_USERPTR);
> +
> +			igt_subtest("fairslice-all")  {
> +				__for_each_physical_engine(fd, e) {
> +					igt_fork(child, 1)
> +						fairslice(fd, e, 0);
> +				}
> +				igt_waitchildren();
> +			}
> +			igt_subtest("u-fairslice-all")  {
> +				__for_each_physical_engine(fd, e) {
> +					igt_fork(child, 1)
> +						fairslice(fd, e, IGT_SPIN_USERPTR);
> +				}
> +				igt_waitchildren();
> +			}
> +		}
> +
>   		test_each_engine("submit-early-slice", fd, e)
>   			submit_slice(fd, e, EARLY_SUBMIT);
>   		test_each_engine("submit-golden-slice", fd, e)
> @@ -2610,6 +3453,10 @@ igt_main
>   		test_each_engine_store("promotion", fd, e)
>   			promotion(fd, e->flags);
>   
> +		igt_subtest_group {
> +			test_fairness(fd, 2);
> +		}
> +
>   		igt_subtest_group {
>   			igt_fixture {
>   				igt_require(gem_scheduler_has_preemption(fd));
> 

Seems clean and logical on the high level and on the implementation
level. On the "medium" level I don't claim I tried to understand
everything, but that's not completely important. By medium level I mean
all the different test scenarios, where the important thing is that as
long as all children are doing the same thing, which I think they are
(with the small exception of the VIP), it seems correct to test that
they will get an equal amount of GPU time.

All subtests pass with the fair scheduler patches?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Intel-gfx] [igt-dev] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
  2020-11-25 11:25   ` Tvrtko Ursulin
@ 2020-11-25 12:23     ` Chris Wilson
  -1 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-11-25 12:23 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: igt-dev

Quoting Tvrtko Ursulin (2020-11-25 11:25:02)
> 
> On 24/11/2020 23:39, Chris Wilson wrote:
> > +static void fair_child(int i915, uint32_t ctx,
> > +                    const struct intel_execution_engine2 *e,
> > +                    uint64_t frame_ns,
> > +                    int timeline,
> > +                    uint32_t common,
> > +                    unsigned int flags,
> > +                    unsigned long *ctl,
> > +                    unsigned long *out)
> > +#define F_SYNC               (1 << 0)
> > +#define F_PACE               (1 << 1)
> > +#define F_FLOW               (1 << 2)
> > +#define F_HALF               (1 << 3)
> > +#define F_SOLO               (1 << 4)
> > +#define F_SPARE              (1 << 5)
> > +#define F_NEXT               (1 << 6)
> > +#define F_VIP                (1 << 7)
> > +#define F_RRUL               (1 << 8)
> > +#define F_SHARE              (1 << 9)
> > +#define F_PING               (1 << 10)
> > +#define F_THROTTLE   (1 << 11)
> > +#define F_ISOLATE    (1 << 12)
> > +{
> > +     const int batches_per_frame = flags & F_SOLO ? 1 : 3;
> > +     struct drm_i915_gem_exec_object2 obj[4] = {
> > +             {},
> > +             {
> > +                     .handle = common ?: gem_create(i915, 4096),
> > +             },
> > +             delay_create(i915, ctx, e, frame_ns / batches_per_frame),
> > +             delay_create(i915, ctx, e, frame_ns / batches_per_frame),
> > +     };
> > +     struct intel_execution_engine2 ping = *e;
> > +     int p_fence = -1, n_fence = -1;
> > +     unsigned long count = 0;
> > +     int n;
> > +
> > +     srandom(getpid());
> > +     if (flags & F_PING)
> > +             ping = pick_random_engine(i915, e);
> > +     obj[0] = tslog_create(i915, ctx, &ping);
> > +
> > +     while (!READ_ONCE(*ctl)) {
> > +             struct drm_i915_gem_execbuffer2 execbuf = {
> > +                     .buffers_ptr = to_user_pointer(obj),
> > +                     .buffer_count = 4,
> > +                     .rsvd1 = ctx,
> > +                     .rsvd2 = -1,
> > +                     .flags = e->flags,
> > +             };
> > +
> > +             if (flags & F_FLOW) {
> > +                     unsigned int seq;
> > +
> > +                     seq = count;
> > +                     if (flags & F_NEXT)
> > +                             seq++;
> > +
> > +                     execbuf.rsvd2 =
> > +                             sw_sync_timeline_create_fence(timeline, seq);
> > +                     execbuf.flags |= I915_EXEC_FENCE_IN;
> > +             }
> > +
> > +             execbuf.flags |= I915_EXEC_FENCE_OUT;
> > +             gem_execbuf_wr(i915, &execbuf);
> > +             n_fence = execbuf.rsvd2 >> 32;
> > +             execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
> > +             for (n = 1; n < batches_per_frame; n++)
> > +                     gem_execbuf(i915, &execbuf);
> > +             close(execbuf.rsvd2);
> > +
> > +             execbuf.buffer_count = 1;
> > +             execbuf.batch_start_offset = 2048;
> > +             execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
> > +             execbuf.rsvd2 = n_fence;
> > +             gem_execbuf(i915, &execbuf);
> > +
> > +             if (flags & F_PACE && p_fence != -1) {
> > +                     struct pollfd pfd = {
> > +                             .fd = p_fence,
> > +                             .events = POLLIN,
> > +                     };
> > +                     poll(&pfd, 1, -1);
> > +             }
> > +             close(p_fence);
> > +
> > +             if (flags & F_SYNC) {
> > +                     struct pollfd pfd = {
> > +                             .fd = n_fence,
> > +                             .events = POLLIN,
> > +                     };
> > +                     poll(&pfd, 1, -1);
> > +             }
> > +
> > +             if (flags & F_THROTTLE)
> > +                     igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
> > +
> > +             igt_swap(obj[2], obj[3]);
> > +             igt_swap(p_fence, n_fence);
> 
> What are the sync fences simulating and how come they are always used? I 
> mean no children which submit batched up load?

The sync fences are created for each submission for simplicity. We only
use them for synchronisation in emulating some of the clients (such as
mesa synchronising to previous SwapBuffers, and compositors
synchronising to vblanks). Creating/destroying an unused fence should
not be disruptive...
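
Per frame the loop boils down to (pseudocode; submit()/wait() are
stand-ins for the execbuf + sync_file plumbing above):

	fence[i] = submit(frame[i]);  /* out-fence of the first batch */
	if (flags & F_PACE && i > 0)  /* mesa-style double buffering */
		wait(fence[i - 1]);
	if (flags & F_SYNC)           /* strictly one frame at a time */
		wait(fence[i]);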

> > +             count++;
> > +     }
> > +     close(p_fence);
> > +
> > +     gem_close(i915, obj[3].handle);
> > +     gem_close(i915, obj[2].handle);
> > +     if (obj[1].handle != common)
> > +             gem_close(i915, obj[1].handle);
> > +
> > +     gem_sync(i915, obj[0].handle);
> > +     if (out) {
> > +             uint32_t *map;
> > +
> > +             map = gem_mmap__device_coherent(i915, obj[0].handle,
> > +                                             0, 4096, PROT_WRITE);
> > +             for (n = 1; n < min(count, 512); n++) {
> > +                     igt_assert(map[n]);
> > +                     map[n - 1] = map[n] - map[n - 1];
> > +             }
> > +             qsort(map, --n, sizeof(*map), cmp_u32);
> > +             *out = ticks_to_ns(i915, map[n / 2]);
> 
> What is returned? Could you explain the ts journal part a bit?

The median interval between successive timestamps. Each frame records
the CS_TIMESTAMP (global reference clock) it completed at. We then
compute the interval between each consecutive pair of frames and sort
those to find the median. I went with the median to err on the side of
stability; we want the tests to be reliable. Checking the distribution
within each client is also interesting, but overwhelming.
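
i.e. the post-processing at the end of fair_child() amounts to this
(equivalent sketch, written out-of-place instead of reusing map[]):

	uint32_t delta[512];
	unsigned int n = min(count, 512);

	for (unsigned int i = 1; i < n; i++)
		delta[i - 1] = map[i] - map[i - 1]; /* frame-to-frame gap */
	qsort(delta, n - 1, sizeof(*delta), cmp_u32);
	*out = ticks_to_ns(i915, delta[(n - 1) / 2]); /* median interval */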

> 
> > +             munmap(map, 4096);
> > +     }
> > +     gem_close(i915, obj[0].handle);
> > +}
> > +
> > +static int cmp_ul(const void *A, const void *B)
> > +{
> > +     const unsigned long *a = A, *b = B;
> > +
> > +     if (*a < *b)
> > +             return -1;
> > +     else if (*a > *b)
> > +             return 1;
> > +     else
> > +             return 0;
> > +}
> > +
> > +static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
> > +{
> > +     uint64_t cpu_time = 0;
> > +
> > +     cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
> > +     cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
> > +
> > +     cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
> > +     cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
> > +
> > +     return cpu_time;
> > +}
> > +
> > +static void timeline_advance(int timeline, int delay_ns)
> > +{
> > +     struct timespec tv = { .tv_nsec = delay_ns };
> > +     nanosleep(&tv, NULL);
> > +     sw_sync_timeline_inc(timeline, 1);
> > +}
> > +
> > +static void fairness(int i915,
> > +                  const struct intel_execution_engine2 *e,
> > +                  int timeout, unsigned int flags)
> > +{
> > +     const int frame_ns = 16666 * 1000;
> > +     const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
> > +     unsigned long *result;
> > +     uint32_t common = 0;
> > +
> > +     igt_require(has_ctx_timestamp(i915, e));
> > +     igt_require(gem_class_has_mutable_submission(i915, e->class));
> > +
> > +     if (flags & F_SHARE)
> > +             common = gem_create(i915, 4095);
> > +
> > +     result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> > +
> > +     for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
> > +             int timeline = sw_sync_timeline_create();
> > +             int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
> > +             const int nchild = n - 1; /* odd for easy medians */
> > +             const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
> > +             const int lo = nchild / 4;
> > +             const int hi = (3 * nchild + 3) / 4 - 1;
> > +             struct rusage old_usage, usage;
> > +             uint64_t cpu_time, d_time;
> > +             unsigned long vip = -1;
> > +             struct timespec tv;
> > +             struct igt_mean m;
> > +
> > +             if (flags & F_PING) {
> > +                     struct intel_execution_engine2 *ping;
> > +
> > +                     __for_each_physical_engine(i915, ping) {
> > +                             if (ping->flags == e->flags)
> > +                                     continue;
> > +
> > +                             igt_fork(child, 1) {
> > +                                     uint32_t ctx = gem_context_clone_with_engines(i915, 0);
> > +
> > +                                     fair_child(i915, ctx, ping,
> > +                                                child_ns / 8,
> > +                                                -1, common,
> > +                                                F_SOLO | F_PACE | F_SHARE,
> > +                                                &result[nchild],
> > +                                                NULL);
> > +
> > +                                     gem_context_destroy(i915, ctx);
> > +                             }
> > +                     }
> > +             }
> > +
> > +             memset(result, 0, (nchild + 1) * sizeof(result[0]));
> 
> Children probably can't write into it before, but still would probably 
> be better moved before the first fork (which passes the results array to 
> children).
> 
> > +             getrusage(RUSAGE_CHILDREN, &old_usage);
> > +             igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
> > +             igt_fork(child, nchild) {
> > +                     uint32_t ctx;
> > +
> > +                     if (flags & F_ISOLATE) {
> > +                             int clone, dmabuf = -1;
> > +
> > +                             if (common)
> > +                                     dmabuf = prime_handle_to_fd(i915, common);
> > +
> > +                             clone = gem_reopen_driver(i915);
> > +                             gem_context_copy_engines(i915, 0, clone, 0);
> > +                             i915 = clone;
> > +
> > +                             if (dmabuf != -1)
> > +                                     common = prime_fd_to_handle(i915, dmabuf);
> > +                     }
> > +
> > +                     ctx = gem_context_clone_with_engines(i915, 0);
> > +
> > +                     if (flags & F_VIP && child == 0) {
> > +                             gem_context_set_priority(i915, ctx, MAX_PRIO);
> > +                             flags |= F_FLOW;
> > +                     }
> > +                     if (flags & F_RRUL && child == 0)
> > +                             flags |= F_SOLO | F_FLOW | F_SYNC;
> > +
> > +                     fair_child(i915, ctx, e, child_ns,
> > +                                timeline, common, flags,
> > +                                &result[nchild],
> > +                                &result[child]);
> > +
> > +                     gem_context_destroy(i915, ctx);
> > +             }
> > +
> > +             while (nfences--)
> > +                     timeline_advance(timeline, fence_ns);
> > +
> > +             result[nchild] = 1;
> > +             for (int child = 0; child < nchild; child++) {
> > +                     while (!READ_ONCE(result[child]))
> > +                             timeline_advance(timeline, fence_ns);
> > +             }
> > +
> > +             igt_waitchildren();
> > +             close(timeline);
> > +
> > +             /* Are we running out of CPU time, and fail to submit frames? */
> > +             d_time = igt_nsec_elapsed(&tv);
> > +             getrusage(RUSAGE_CHILDREN, &usage);
> > +             cpu_time = d_cpu_time(&usage, &old_usage);
> > +             if (10 * cpu_time > 9 * d_time) {
> > +                     if (nchild > 7)
> > +                             break;
> > +
> > +                     igt_skip_on_f(10 * cpu_time > 9 * d_time,
> > +                                   "%.0f%% CPU usage, presuming capacity exceeded\n",
> > +                                   100. * cpu_time / d_time);
> 
> Aren't children mostly sleeping waiting on fences and like? And if so 
> how/when the test ends up using a lot of CPU time?

lockdep. kasan. And some really slow devices. E.g. CI struggles to hit
31 clients, but on non-debug builds we can sustain 127 clients (the
context switch duration is the ultimate limiting step).

I needed to rule out the impact of the CPU scheduler when evaluating
the GPU. A simple metric is that if we saturate a core, then we are
likely to be experiencing extra latency in submission due to the CPU
scheduler.
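
For reference, the metric is nothing more than the getrusage() delta
over the wall clock (standalone sketch, d_cpu_time() as in the patch):

	struct rusage u0, u1;
	struct timespec t0, t1;
	uint64_t cpu_ns, wall_ns;

	getrusage(RUSAGE_CHILDREN, &u0);
	clock_gettime(CLOCK_MONOTONIC, &t0);
	/* ... fork, run and reap the clients ... */
	clock_gettime(CLOCK_MONOTONIC, &t1);
	getrusage(RUSAGE_CHILDREN, &u1);

	cpu_ns = d_cpu_time(&u1, &u0); /* children's user + system time */
	wall_ns = (t1.tv_sec - t0.tv_sec) * NSEC_PER_SEC +
		  (t1.tv_nsec - t0.tv_nsec);
	if (10 * cpu_ns > 9 * wall_ns) /* more than 90% of a single cpu */
		igt_skip("%.0f%% CPU usage\n", 100. * cpu_ns / wall_ns);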

> > +             igt_mean_init(&m);
> > +             for (int child = 0; child < nchild; child++)
> > +                     igt_mean_add(&m, result[child]);
> > +
> > +             if (flags & (F_VIP | F_RRUL))
> > +                     vip = result[0];
> > +
> > +             qsort(result, nchild, sizeof(*result), cmp_ul);
> > +             igt_info("%2d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
> > +                      nchild,
> > +                      1e-6 * result[0],  1e-6 * result[nchild - 1],
> > +                      1e-6 * result[lo], 1e-6 * result[hi],
> > +                      1e-6 * result[nchild / 2],
> > +                      1e-6 * igt_mean_get(&m),
> > +                      1e-6 * sqrt(igt_mean_get_variance(&m)));
> > +
> > +             if (vip != -1) {
> > +                     igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
> > +                     igt_assert(4 * vip > 3 * fence_ns &&
> > +                                3 * vip < 4 * fence_ns);
> > +             }
> > +
> > +             /* May be slowed due to sheer volume of context switches */
> > +             igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
> > +                            igt_mean_get(&m) < 3 * fence_ns);
> > +
> > +             igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
> > +                        3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
> > +
> > +             igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);
> 
> Put some human readable text above the asserts explaining the criteria 
> please.
> 
> VIP child takes part in the mean and does not affect the result?

The VIP is also running at the same target fps; it hasn't yet been an
issue. When context switching is slow (rcs), the VIP is faster than
the rest and so falls outside of the iqr anyway.

But it does cause an oddity in the range.
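
Something along these lines for the requested commentary (sketch)?

	/* Each client should complete roughly a frame per interval:
	 * allow 25% below target for fairness slop, and a generous 3x
	 * above for the sheer volume of context switches. */
	igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
		       igt_mean_get(&m) < 3 * fence_ns);

	/* The mean must agree with the median, i.e. no outliers
	 * dragging the average away from the typical client. */
	igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
		   3 * igt_mean_get(&m) < 4 * result[nchild / 2]);

	/* And the middle half of the clients must be spread across
	 * less than 50% of the median interval. */
	igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);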

> > +static void fairslice(int i915,
> > +                   const struct intel_execution_engine2 *e,
> > +                   unsigned long flags)
> > +{
> > +     igt_spin_t *spin = NULL;
> > +     uint32_t ctx[3];
> > +     uint32_t ts[3];
> > +
> > +     for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
> > +             ctx[i] = gem_context_clone_with_engines(i915, 0);
> > +             if (spin == NULL) {
> > +                     spin = __igt_spin_new(i915,
> > +                                           .ctx = ctx[i],
> > +                                           .engine = e->flags,
> > +                                           .flags = flags);
> > +             } else {
> > +                     struct drm_i915_gem_execbuffer2 eb = {
> > +                             .buffer_count = 1,
> > +                             .buffers_ptr = to_user_pointer(&spin->obj[IGT_SPIN_BATCH]),
> > +                             .flags = e->flags,
> > +                             .rsvd1 = ctx[i],
> > +                     };
> > +                     gem_execbuf(i915, &eb);
> > +             }
> > +     }
> > +
> > +     sleep(2); /* over the course of many timeslices */
> > +
> > +     igt_assert(gem_bo_busy(i915, spin->handle));
> > +     igt_spin_end(spin);
> > +     for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> > +             ts[i] = read_ctx_timestamp(i915, ctx[i], e);
> > +
> > +     for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> > +             gem_context_destroy(i915, ctx[i]);
> > +     igt_spin_free(i915, spin);
> > +
> > +     qsort(ts, 3, sizeof(*ts), cmp_u32);
> > +     igt_info("%s: [%.1f, %.1f] ms\n", e->name,
> > +              1e-6 * ticks_to_ns(i915, ts[0]),
> > +              1e-6 * ticks_to_ns(i915, ts[2]));
> 
> Log all three just as well?
> 
> > +
> > +     igt_assert(ts[0] && ts[2] > ts[0]);
> > +     igt_assert(4 * ts[0] > 3 * ts[2]);
> 
> Three equal priority contexts - why would the distribution be expected
> to be unfair? Intuitively I'd expect a check that all three are within
> some tolerance of each other, but okay, min and max are good enough; I
> just don't understand the asserts. Max can just as well be equal to
> min, no? I mean, a scheduler would still be considered fair. We should
> ignore the submission order, I think, if that was the point.

The first assert is more of a leftover from when I used the wrong
compare fn (there was already one compare fn for longs :) and from the
issue with cml+ returning 0.

The second assert checks that the range is within 25%. Maybe 1/6 is
more interesting for 3 contexts, so something like

igt_assert((ts[2] - ts[0]) * 6 < ts[1]);
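
E.g. (made-up numbers) with sorted runtimes of {600, 640, 660} ms worth
of ticks, the current check passes since 4 * 600 = 2400 > 3 * 660 =
1980, and the 1/6 variant also passes since 6 * (660 - 600) = 360 < 640.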

> Seems clean and logical on the high level and on the implementation
> level. On the "medium" level I don't claim I tried to understand
> everything, but that's not completely important. By medium level I mean
> all the different test scenarios, where the important thing is that as
> long as all children are doing the same thing, which I think they are
> (with the small exception of the VIP), it seems correct to test that
> they will get an equal amount of GPU time.
> 
> All subtests pass with the fair scheduler patches?

Yes, although the *solo with 3 clients is borderline on tgl; solo
targets a weak spot of the fair algorithm and was initially very
unfair.

I've tried to cover the userspace throttling algorithms used in practice
and issues found along the way, with a small amount of heterogeneity
(e.g. VIP representing a compositor with a bunch of individual client
windows). Each client is itself a fixed workload, which isn't very
representative but makes measurements easy.

I think it's a reliable test that scales well across our gen for inter-
client fairness, but it certainly is not the complete picture. There will
always be surprises, and wsim is better suited to trying to replicate
real-world scenarios. One day we should define some regression tests for
wsim metrics.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-11-25 12:23     ` Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-11-25 12:23 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: igt-dev, Tvrtko Ursulin

Quoting Tvrtko Ursulin (2020-11-25 11:25:02)
> 
> On 24/11/2020 23:39, Chris Wilson wrote:
> > +static void fair_child(int i915, uint32_t ctx,
> > +                    const struct intel_execution_engine2 *e,
> > +                    uint64_t frame_ns,
> > +                    int timeline,
> > +                    uint32_t common,
> > +                    unsigned int flags,
> > +                    unsigned long *ctl,
> > +                    unsigned long *out)
> > +#define F_SYNC               (1 << 0)
> > +#define F_PACE               (1 << 1)
> > +#define F_FLOW               (1 << 2)
> > +#define F_HALF               (1 << 3)
> > +#define F_SOLO               (1 << 4)
> > +#define F_SPARE              (1 << 5)
> > +#define F_NEXT               (1 << 6)
> > +#define F_VIP                (1 << 7)
> > +#define F_RRUL               (1 << 8)
> > +#define F_SHARE              (1 << 9)
> > +#define F_PING               (1 << 10)
> > +#define F_THROTTLE   (1 << 11)
> > +#define F_ISOLATE    (1 << 12)
> > +{
> > +     const int batches_per_frame = flags & F_SOLO ? 1 : 3;
> > +     struct drm_i915_gem_exec_object2 obj[4] = {
> > +             {},
> > +             {
> > +                     .handle = common ?: gem_create(i915, 4096),
> > +             },
> > +             delay_create(i915, ctx, e, frame_ns / batches_per_frame),
> > +             delay_create(i915, ctx, e, frame_ns / batches_per_frame),
> > +     };
> > +     struct intel_execution_engine2 ping = *e;
> > +     int p_fence = -1, n_fence = -1;
> > +     unsigned long count = 0;
> > +     int n;
> > +
> > +     srandom(getpid());
> > +     if (flags & F_PING)
> > +             ping = pick_random_engine(i915, e);
> > +     obj[0] = tslog_create(i915, ctx, &ping);
> > +
> > +     while (!READ_ONCE(*ctl)) {
> > +             struct drm_i915_gem_execbuffer2 execbuf = {
> > +                     .buffers_ptr = to_user_pointer(obj),
> > +                     .buffer_count = 4,
> > +                     .rsvd1 = ctx,
> > +                     .rsvd2 = -1,
> > +                     .flags = e->flags,
> > +             };
> > +
> > +             if (flags & F_FLOW) {
> > +                     unsigned int seq;
> > +
> > +                     seq = count;
> > +                     if (flags & F_NEXT)
> > +                             seq++;
> > +
> > +                     execbuf.rsvd2 =
> > +                             sw_sync_timeline_create_fence(timeline, seq);
> > +                     execbuf.flags |= I915_EXEC_FENCE_IN;
> > +             }
> > +
> > +             execbuf.flags |= I915_EXEC_FENCE_OUT;
> > +             gem_execbuf_wr(i915, &execbuf);
> > +             n_fence = execbuf.rsvd2 >> 32;
> > +             execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
> > +             for (n = 1; n < batches_per_frame; n++)
> > +                     gem_execbuf(i915, &execbuf);
> > +             close(execbuf.rsvd2);
> > +
> > +             execbuf.buffer_count = 1;
> > +             execbuf.batch_start_offset = 2048;
> > +             execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
> > +             execbuf.rsvd2 = n_fence;
> > +             gem_execbuf(i915, &execbuf);
> > +
> > +             if (flags & F_PACE && p_fence != -1) {
> > +                     struct pollfd pfd = {
> > +                             .fd = p_fence,
> > +                             .events = POLLIN,
> > +                     };
> > +                     poll(&pfd, 1, -1);
> > +             }
> > +             close(p_fence);
> > +
> > +             if (flags & F_SYNC) {
> > +                     struct pollfd pfd = {
> > +                             .fd = n_fence,
> > +                             .events = POLLIN,
> > +                     };
> > +                     poll(&pfd, 1, -1);
> > +             }
> > +
> > +             if (flags & F_THROTTLE)
> > +                     igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
> > +
> > +             igt_swap(obj[2], obj[3]);
> > +             igt_swap(p_fence, n_fence);
> 
> What are the sync fences simulating and how come they are always used? I 
> mean no children which submit batched up load?

The sync fences are created for each submission for simplicity. We only
use them for synchronisation in emulating some of the clients (such as
mesa synchronising to previous SwapBuffers, and compositors
synchronising to vblanks). Creating/destroying an unused fence should
not be disruptive...
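
For illustration, the two throttle modes reduce to waiting on different
fences (a sketch; wait_fence() is a hypothetical helper wrapping the
poll() calls in the loop above):

	static void wait_fence(int fence)
	{
		struct pollfd pfd = { .fd = fence, .events = POLLIN };

		poll(&pfd, 1, -1); /* block until the fence signals */
	}

	if (flags & F_PACE && p_fence != -1)
		wait_fence(p_fence); /* mesa: pace against the previous frame */
	if (flags & F_SYNC)
		wait_fence(n_fence); /* strict: wait for this frame to complete */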

> > +             count++;
> > +     }
> > +     close(p_fence);
> > +
> > +     gem_close(i915, obj[3].handle);
> > +     gem_close(i915, obj[2].handle);
> > +     if (obj[1].handle != common)
> > +             gem_close(i915, obj[1].handle);
> > +
> > +     gem_sync(i915, obj[0].handle);
> > +     if (out) {
> > +             uint32_t *map;
> > +
> > +             map = gem_mmap__device_coherent(i915, obj[0].handle,
> > +                                             0, 4096, PROT_WRITE);
> > +             for (n = 1; n < min(count, 512); n++) {
> > +                     igt_assert(map[n]);
> > +                     map[n - 1] = map[n] - map[n - 1];
> > +             }
> > +             qsort(map, --n, sizeof(*map), cmp_u32);
> > +             *out = ticks_to_ns(i915, map[n / 2]);
> 
> What is returned? Could you explain the ts journal part a bit?

The median interval between timestamps. Each frame records the
CS_TIMESTAMP (global reference clock) it completed at. We then compute
the interval between each pair of frames and sort those to find the
median. I went with the median to err on the side of stability; we want
the tests to be reliable. Checking the distribution within each client
is also interesting, but overwhelming.
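
In pseudo-C, the post-processing amounts to this (a sketch reusing
cmp_u32() and ticks_to_ns() from the patch, with ts[] holding the
journal of absolute CS_TIMESTAMP values):

	static uint64_t median_interval_ns(int i915, uint32_t *ts, int count)
	{
		int n;

		for (n = 1; n < count; n++) /* unsigned delta handles wraparound */
			ts[n - 1] = ts[n] - ts[n - 1];
		qsort(ts, count - 1, sizeof(*ts), cmp_u32); /* sort the intervals */
		return ticks_to_ns(i915, ts[(count - 1) / 2]); /* the middle one */
	}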

> 
> > +             munmap(map, 4096);
> > +     }
> > +     gem_close(i915, obj[0].handle);
> > +}
> > +
> > +static int cmp_ul(const void *A, const void *B)
> > +{
> > +     const unsigned long *a = A, *b = B;
> > +
> > +     if (*a < *b)
> > +             return -1;
> > +     else if (*a > *b)
> > +             return 1;
> > +     else
> > +             return 0;
> > +}
> > +
> > +static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
> > +{
> > +     uint64_t cpu_time = 0;
> > +
> > +     cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
> > +     cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
> > +
> > +     cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
> > +     cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
> > +
> > +     return cpu_time;
> > +}
> > +
> > +static void timeline_advance(int timeline, int delay_ns)
> > +{
> > +     struct timespec tv = { .tv_nsec = delay_ns };
> > +     nanosleep(&tv, NULL);
> > +     sw_sync_timeline_inc(timeline, 1);
> > +}
> > +
> > +static void fairness(int i915,
> > +                  const struct intel_execution_engine2 *e,
> > +                  int timeout, unsigned int flags)
> > +{
> > +     const int frame_ns = 16666 * 1000;
> > +     const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
> > +     unsigned long *result;
> > +     uint32_t common = 0;
> > +
> > +     igt_require(has_ctx_timestamp(i915, e));
> > +     igt_require(gem_class_has_mutable_submission(i915, e->class));
> > +
> > +     if (flags & F_SHARE)
> > +             common = gem_create(i915, 4095);
> > +
> > +     result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> > +
> > +     for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
> > +             int timeline = sw_sync_timeline_create();
> > +             int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
> > +             const int nchild = n - 1; /* odd for easy medians */
> > +             const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
> > +             const int lo = nchild / 4;
> > +             const int hi = (3 * nchild + 3) / 4 - 1;
> > +             struct rusage old_usage, usage;
> > +             uint64_t cpu_time, d_time;
> > +             unsigned long vip = -1;
> > +             struct timespec tv;
> > +             struct igt_mean m;
> > +
> > +             if (flags & F_PING) {
> > +                     struct intel_execution_engine2 *ping;
> > +
> > +                     __for_each_physical_engine(i915, ping) {
> > +                             if (ping->flags == e->flags)
> > +                                     continue;
> > +
> > +                             igt_fork(child, 1) {
> > +                                     uint32_t ctx = gem_context_clone_with_engines(i915, 0);
> > +
> > +                                     fair_child(i915, ctx, ping,
> > +                                                child_ns / 8,
> > +                                                -1, common,
> > +                                                F_SOLO | F_PACE | F_SHARE,
> > +                                                &result[nchild],
> > +                                                NULL);
> > +
> > +                                     gem_context_destroy(i915, ctx);
> > +                             }
> > +                     }
> > +             }
> > +
> > +             memset(result, 0, (nchild + 1) * sizeof(result[0]));
> 
> Children probably can't write into it before then, but it would still
> probably be better moved before the first fork (which passes the
> results array to the children).
> 
> > +             getrusage(RUSAGE_CHILDREN, &old_usage);
> > +             igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
> > +             igt_fork(child, nchild) {
> > +                     uint32_t ctx;
> > +
> > +                     if (flags & F_ISOLATE) {
> > +                             int clone, dmabuf = -1;
> > +
> > +                             if (common)
> > +                                     dmabuf = prime_handle_to_fd(i915, common);
> > +
> > +                             clone = gem_reopen_driver(i915);
> > +                             gem_context_copy_engines(i915, 0, clone, 0);
> > +                             i915 = clone;
> > +
> > +                             if (dmabuf != -1)
> > +                                     common = prime_fd_to_handle(i915, dmabuf);
> > +                     }
> > +
> > +                     ctx = gem_context_clone_with_engines(i915, 0);
> > +
> > +                     if (flags & F_VIP && child == 0) {
> > +                             gem_context_set_priority(i915, ctx, MAX_PRIO);
> > +                             flags |= F_FLOW;
> > +                     }
> > +                     if (flags & F_RRUL && child == 0)
> > +                             flags |= F_SOLO | F_FLOW | F_SYNC;
> > +
> > +                     fair_child(i915, ctx, e, child_ns,
> > +                                timeline, common, flags,
> > +                                &result[nchild],
> > +                                &result[child]);
> > +
> > +                     gem_context_destroy(i915, ctx);
> > +             }
> > +
> > +             while (nfences--)
> > +                     timeline_advance(timeline, fence_ns);
> > +
> > +             result[nchild] = 1;
> > +             for (int child = 0; child < nchild; child++) {
> > +                     while (!READ_ONCE(result[child]))
> > +                             timeline_advance(timeline, fence_ns);
> > +             }
> > +
> > +             igt_waitchildren();
> > +             close(timeline);
> > +
> > +             /* Are we running out of CPU time, and fail to submit frames? */
> > +             d_time = igt_nsec_elapsed(&tv);
> > +             getrusage(RUSAGE_CHILDREN, &usage);
> > +             cpu_time = d_cpu_time(&usage, &old_usage);
> > +             if (10 * cpu_time > 9 * d_time) {
> > +                     if (nchild > 7)
> > +                             break;
> > +
> > +                     igt_skip_on_f(10 * cpu_time > 9 * d_time,
> > +                                   "%.0f%% CPU usage, presuming capacity exceeded\n",
> > +                                   100. * cpu_time / d_time);
> 
> Aren't the children mostly sleeping, waiting on fences and the like?
> And if so, how/when does the test end up using a lot of CPU time?

lockdep, kasan, and some really slow devices. E.g. CI struggles to hit
31 clients, but on non-debug builds we can sustain 127 clients (the
context-switch duration is the ultimate limiting step).

I needed to rule out the impact of the CPU scheduler when evaluating the
GPU. A simple heuristic: if we saturate a core, then we are likely to be
experiencing extra latency in submission due to the CPU scheduler. Hence
the skip above once the children consume more than 90% of the walltime
as CPU time (10 * cpu_time > 9 * d_time).

> > +             igt_mean_init(&m);
> > +             for (int child = 0; child < nchild; child++)
> > +                     igt_mean_add(&m, result[child]);
> > +
> > +             if (flags & (F_VIP | F_RRUL))
> > +                     vip = result[0];
> > +
> > +             qsort(result, nchild, sizeof(*result), cmp_ul);
> > +             igt_info("%2d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
> > +                      nchild,
> > +                      1e-6 * result[0],  1e-6 * result[nchild - 1],
> > +                      1e-6 * result[lo], 1e-6 * result[hi],
> > +                      1e-6 * result[nchild / 2],
> > +                      1e-6 * igt_mean_get(&m),
> > +                      1e-6 * sqrt(igt_mean_get_variance(&m)));
> > +
> > +             if (vip != -1) {
> > +                     igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
> > +                     igt_assert(4 * vip > 3 * fence_ns &&
> > +                                3 * vip < 4 * fence_ns);
> > +             }
> > +
> > +             /* May be slowed due to sheer volume of context switches */
> > +             igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
> > +                            igt_mean_get(&m) < 3 * fence_ns);
> > +
> > +             igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
> > +                        3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
> > +
> > +             igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);
> 
> Please put some human-readable text above the asserts explaining the
> criteria.
> 
> The VIP child takes part in the mean; does that not affect the result?

The VIP is also running at the same target fps; it hasn't yet been
an issue. When context switching is slow (rcs), the VIP is faster than
the rest and so falls outside the iqr anyway.

But it does cause an oddity in the range.

> > +static void fairslice(int i915,
> > +                   const struct intel_execution_engine2 *e,
> > +                   unsigned long flags)
> > +{
> > +     igt_spin_t *spin = NULL;
> > +     uint32_t ctx[3];
> > +     uint32_t ts[3];
> > +
> > +     for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
> > +             ctx[i] = gem_context_clone_with_engines(i915, 0);
> > +             if (spin == NULL) {
> > +                     spin = __igt_spin_new(i915,
> > +                                           .ctx = ctx[i],
> > +                                           .engine = e->flags,
> > +                                           .flags = flags);
> > +             } else {
> > +                     struct drm_i915_gem_execbuffer2 eb = {
> > +                             .buffer_count = 1,
> > +                             .buffers_ptr = to_user_pointer(&spin->obj[IGT_SPIN_BATCH]),
> > +                             .flags = e->flags,
> > +                             .rsvd1 = ctx[i],
> > +                     };
> > +                     gem_execbuf(i915, &eb);
> > +             }
> > +     }
> > +
> > +     sleep(2); /* over the course of many timeslices */
> > +
> > +     igt_assert(gem_bo_busy(i915, spin->handle));
> > +     igt_spin_end(spin);
> > +     for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> > +             ts[i] = read_ctx_timestamp(i915, ctx[i], e);
> > +
> > +     for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> > +             gem_context_destroy(i915, ctx[i]);
> > +     igt_spin_free(i915, spin);
> > +
> > +     qsort(ts, 3, sizeof(*ts), cmp_u32);
> > +     igt_info("%s: [%.1f, %.1f] ms\n", e->name,
> > +              1e-6 * ticks_to_ns(i915, ts[0]),
> > +              1e-6 * ticks_to_ns(i915, ts[2]));
> 
> Log all three just as well?
> 
> > +
> > +     igt_assert(ts[0] && ts[2] > ts[0]);
> > +     igt_assert(4 * ts[0] > 3 * ts[2]);
> 
> Three equal-priority contexts - why would the distribution be expected
> to be unfair? Intuitively I'd expect a check that all three are within
> some tolerance of each other, but okay, min and max is good enough; I
> just don't understand the asserts. Max can just as well be equal to
> min, no? I mean the scheduler would still be considered fair. We should
> ignore the submission order I think, if that was the point.

The first assert looks more like a leftover from when I used the wrong
compare fn (there was already one compare fn for longs :) together with
the issue of cml+ returning 0.

The second assert is that the range is within 25%. Maybe 1/6 is more
interesting for 3 clients, so something like

igt_assert((ts[2] - ts[0]) * 6 < ts[1]);
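
or, with a message attached (a sketch; ts[] is sorted ascending, so
ts[1] is the median):

	igt_assert_f((ts[2] - ts[0]) * 6 < ts[1],
		     "runtime spread %.2fms exceeds 1/6 of the median %.2fms\n",
		     1e-6 * ticks_to_ns(i915, ts[2] - ts[0]),
		     1e-6 * ticks_to_ns(i915, ts[1]));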

> Seems clean and logical at the high level and at the implementation
> level. At the "medium" level I don't claim I tried to understand
> everything, but it's not completely important. By medium level I mean
> all the different test scenarios, where the important thing is that as
> long as all children are doing the same thing, which I think they are
> (with the small open question of the VIP), it seems correct to test
> that they will get an equal amount of GPU time.
> 
> All subtests pass with the fair scheduler patches?

Yes, although the *solo with 3 clients is borderline on tgl; solo is
targeted at a weak spot of the fair algorithm and initially was very
unfair.

I've tried to cover the userspace throttling algorithms used in practice
and issues found along the way, with a small amount of heterogeneity
(e.g. the VIP representing a compositor with a bunch of individual
client windows). Each client is itself a fixed workload, which isn't
very representative but makes measurements easy.

I think it's a reliable test that scales well across our gen for inter-
client fairness, but it certainly is not the complete picture. There will
always be surprises, and wsim is better suited to trying to replicate
real-world scenarios. One day we should define some regression tests for
wsim metrics.
-Chris

* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-12-16 15:24 Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-12-16 15:24 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

v2: igt_assert_f and more commentary; exclude vip from client stats,
include range of frame intervals from each individual client
v3: Write down what the test actually does!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
 tests/i915/gem_exec_schedule.c | 797 +++++++++++++++++++++++++++++++++
 1 file changed, 797 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index dd15b2ac7..8be5539aa 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -29,6 +29,7 @@
 #include <sys/poll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sched.h>
 #include <signal.h>
@@ -2532,12 +2533,250 @@ static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
 	return (x + y - 1) / y;
 }
 
+static uint64_t ns_to_ctx_ticks(int i915, uint64_t ns)
+{
+	int f = read_timestamp_frequency(i915);
+	if (intel_gen(intel_get_drm_devid(i915)) == 11)
+		f = 12500000; /* icl!!! are you feeling alright? CTX vs CS */
+	return div64_u64_round_up(ns * f, NSEC_PER_SEC);
+}
+
 static uint64_t ticks_to_ns(int i915, uint64_t ticks)
 {
 	return div64_u64_round_up(ticks * NSEC_PER_SEC,
 				  read_timestamp_frequency(i915));
 }
 
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define RUNTIME (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	/* Loop until CTX_TIMESTAMP - initial > @ns */
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(START_TS);
+
+	while (offset_in_page(cs) & 63)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(NOW_TS);
+
+	/* delta = now - start; inverted to match COND_BBE */
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	/* Save delta for reading by COND_BBE */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_IMM;
+		if (use_64b) {
+			*cs++ = addr + 4064;
+			*cs++ = addr >> 32;
+		} else {
+			*cs++ = 0;
+			*cs++ = addr + 4064;
+		}
+		*cs++ = 0;
+	}
+
+	/* Break if delta [time elapsed] > ns */
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ctx_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Otherwise back to recalculating delta */
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { INC, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	/* Load the address + inc & mask variables */
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	/* Increment the [ring] address for saving CS_TIMESTAMP */
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	/* Rewrite the batch buffer for the next execution */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
 static int cmp_u32(const void *A, const void *B)
 {
 	const uint32_t *a = A, *b = B;
@@ -2550,6 +2789,560 @@ static int cmp_u32(const void *A, const void *B)
 		return 0;
 }
 
+static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e)
+{
+	const int gen = intel_gen(intel_get_drm_devid(i915));
+
+	if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO)
+		return false; /* looks fubar */
+
+	return true;
+}
+
+static struct intel_execution_engine2
+pick_random_engine(int i915, const struct intel_execution_engine2 *not)
+{
+	const struct intel_execution_engine2 *e;
+	unsigned int count = 0;
+
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		count++;
+	}
+	if (!count)
+		return *not;
+
+	count = rand() % count;
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		if (!count--)
+			break;
+	}
+
+	return *e;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeline,
+		       uint32_t common,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *median,
+		       unsigned long *iqr)
+#define F_SYNC		(1 << 0)
+#define F_PACE		(1 << 1)
+#define F_FLOW		(1 << 2)
+#define F_HALF		(1 << 3)
+#define F_SOLO		(1 << 4)
+#define F_SPARE		(1 << 5)
+#define F_NEXT		(1 << 6)
+#define F_VIP		(1 << 7)
+#define F_RRUL		(1 << 8)
+#define F_SHARE		(1 << 9)
+#define F_PING		(1 << 10)
+#define F_THROTTLE	(1 << 11)
+#define F_ISOLATE	(1 << 12)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 obj[4] = {
+		{},
+		{
+			.handle = common ?: gem_create(i915, 4096),
+		},
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+	};
+	struct intel_execution_engine2 ping = *e;
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	int n;
+
+	srandom(getpid());
+	if (flags & F_PING)
+		ping = pick_random_engine(i915, e);
+	obj[0] = tslog_create(i915, ctx, &ping);
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(obj),
+			.buffer_count = 3,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			unsigned int seq;
+
+			seq = count;
+			if (flags & F_NEXT)
+				seq++;
+
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, seq);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+		close(execbuf.rsvd2);
+
+		execbuf.buffer_count = 1;
+		execbuf.batch_start_offset = 2048;
+		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
+		execbuf.rsvd2 = n_fence;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		if (flags & F_THROTTLE)
+			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
+
+		igt_swap(obj[2], obj[3]);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, obj[3].handle);
+	gem_close(i915, obj[2].handle);
+	if (obj[1].handle != common)
+		gem_close(i915, obj[1].handle);
+
+	gem_sync(i915, obj[0].handle);
+	if (median) {
+		uint32_t *map;
+
+		/*
+		 * We recorded the CS_TIMESTAMP of each frame, and if
+		 * the GPU is being shared completely fairly, we expect
+		 * each frame to be at the same interval from the last.
+		 *
+		 * Compute the interval between frames and report back
+		 * both the median interval and the range for this client.
+		 */
+
+		map = gem_mmap__device_coherent(i915, obj[0].handle,
+						0, 4096, PROT_WRITE);
+		for (n = 1; n < min(count, 512); n++) {
+			igt_assert(map[n]);
+			map[n - 1] = map[n] - map[n - 1];
+		}
+		qsort(map, --n, sizeof(*map), cmp_u32);
+		*iqr = ticks_to_ns(i915, map[(3 * n + 3) / 4] - map[n / 4]);
+		*median = ticks_to_ns(i915, map[n / 2]);
+		munmap(map, 4096);
+	}
+	gem_close(i915, obj[0].handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
+{
+	uint64_t cpu_time = 0;
+
+	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
+
+	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
+
+	return cpu_time;
+}
+
+static void timeline_advance(int timeline, int delay_ns)
+{
+	struct timespec tv = { .tv_nsec = delay_ns };
+	nanosleep(&tv, NULL);
+	sw_sync_timeline_inc(timeline, 1);
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result, *iqr;
+	uint32_t common = 0;
+
+	igt_require(has_ctx_timestamp(i915, e));
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	if (flags & F_SHARE)
+		common = gem_create(i915, 4095);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	igt_assert(result != MAP_FAILED);
+	iqr = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	igt_assert(iqr != MAP_FAILED);
+
+	/*
+	 * The combined workload always runs at a 60fps target (unless F_HALF!).
+	 * This gives a frame interval of 16ms that is evenly split across
+	 * all the clients, simulating a system with a bunch of clients that
+	 * are perfectly balanced and can sustain 60fps. Our job is to ensure
+	 * that each client does run at a smooth 60fps.
+	 *
+	 * Each client runs a fixed-length delay loop (as a single request,
+	 * or split into 3) and then records the CS_TIMESTAMP after completing
+	 * its delay. Given a fair allotment of GPU time to each client,
+	 * that timestamp will [ideally] be at precise 16ms intervals.
+	 * In practice, time is wasted on context switches, so as the number
+	 * of clients increases, the proportion of time spent on context
+	 * switches grows. As we get to 64 render clients, we will be spending
+	 * as much time in context switches as executing the client workloads.
+	 *
+	 * Each client frame may be paced by some throttling technique found
+	 * in the wild. i.e. each client may wait until a simulated vblank
+	 * to indicate the start of a new frame, or it may wait until the
+	 * completion of a previous frame. This causes submission from each
+	 * client and across the system to be chunky and uneven.
+	 *
+	 * We look at the variation of frame intervals within each client, and
+	 * the variation of the medians across the clients to see if the
+	 * distribution (budget) of GPU time was fair enough.
+	 *
+	 * Alternative (and important) metrics will be more latency centric;
+	 * looking at how well we can sustain meeting deadline given competition
+	 * by clients for the GPU.
+	 */
+
+	for (int n = 2; n <= 256; n <<= 1) { /* 32 == 500us per client */
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct rusage old_usage, usage;
+		uint64_t cpu_time, d_time;
+		struct timespec tv;
+		struct igt_mean m;
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+
+		if (flags & F_PING) { /* fill the others with light bg load */
+			struct intel_execution_engine2 *ping;
+
+			__for_each_physical_engine(i915, ping) {
+				if (ping->flags == e->flags)
+					continue;
+
+				igt_fork(child, 1) {
+					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+					fair_child(i915, ctx, ping,
+						   child_ns / 8,
+						   -1, common,
+						   F_SOLO | F_PACE | F_SHARE,
+						   &result[nchild],
+						   NULL, NULL);
+
+					gem_context_destroy(i915, ctx);
+				}
+			}
+		}
+
+		getrusage(RUSAGE_CHILDREN, &old_usage);
+		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
+		igt_fork(child, nchild) {
+			uint32_t ctx;
+
+			if (flags & F_ISOLATE) {
+				int clone, dmabuf = -1;
+
+				if (common)
+					dmabuf = prime_handle_to_fd(i915, common);
+
+				clone = gem_reopen_driver(i915);
+				gem_context_copy_engines(i915, 0, clone, 0);
+				i915 = clone;
+
+				if (dmabuf != -1)
+					common = prime_fd_to_handle(i915, dmabuf);
+			}
+
+			ctx = gem_context_clone_with_engines(i915, 0);
+
+			if (flags & F_VIP && child == 0) {
+				gem_context_set_priority(i915, ctx, MAX_PRIO);
+				flags |= F_FLOW;
+			}
+			if (flags & F_RRUL && child == 0)
+				flags |= F_SOLO | F_FLOW | F_SYNC;
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeline, common, flags,
+				   &result[nchild],
+				   &result[child], &iqr[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--)
+			timeline_advance(timeline, fence_ns);
+
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child]))
+				timeline_advance(timeline, fence_ns);
+		}
+
+		igt_waitchildren();
+		close(timeline);
+
+		/*
+		 * Are we running out of CPU time, and fail to submit frames?
+		 *
+		 * We try to rule out any undue impact on the GPU scheduling
+		 * from the CPU scheduler by looking for core saturation. If
+		 * we are in a situation where the clients + kernel are
+		 * taking a whole core (think lockdep), then it is increasingly
+		 * likely that our measurements include delays from the CPU
+		 * scheduler. Err on the side of caution.
+		 */
+		d_time = igt_nsec_elapsed(&tv);
+		getrusage(RUSAGE_CHILDREN, &usage);
+		cpu_time = d_cpu_time(&usage, &old_usage);
+		igt_debug("CPU usage: %.0f%%\n", 100. * cpu_time / d_time);
+		if (4 * cpu_time > 3 * d_time) {
+			if (nchild > 7) /* good enough to judge pass/fail */
+				break;
+
+			igt_skip_on_f(4 * cpu_time > 3 * d_time,
+				      "%.0f%% CPU usage, presuming capacity exceeded\n",
+				      100. * cpu_time / d_time);
+		}
+
+		/* With no contention, we should match our target frametime */
+		if (nchild == 1) {
+			igt_assert(4 * result[0] > 3 * fence_ns &&
+				   3 * result[0] < 4 * fence_ns);
+			continue;
+		}
+
+		/*
+		 * The VIP should always be able to hit the target frame rate,
+		 * regardless of budget contention from lesser clients.
+		 */
+		if (flags & (F_VIP | F_RRUL)) {
+			igt_info("VIP interval %.2fms, range %.2fms\n",
+				 1e-6 * result[0], 1e-6 * iqr[0]);
+			igt_assert_f(4 * result[0] > 3 * fence_ns &&
+				     3 * result[0] < 4 * fence_ns,
+				     "VIP expects to run exactly when it wants, expects an interval of %.2fms, was %.2fms\n",
+				     1e-6 * fence_ns, 1e-6 * result[0]);
+			igt_assert_f(2 * iqr[0] < result[0],
+				     "VIP frame IQR %.2fms exceeded median threshold %.2fms\n",
+				     1e-6 * iqr[0],
+				     1e-6 * result[0] / 2);
+			if (!--nchild)
+				continue;
+
+			/* Exclude the VIP result from the plebian statistics */
+			memmove(result, result + 1, nchild * sizeof(*result));
+			memmove(iqr, iqr + 1, nchild * sizeof(*iqr));
+		}
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		qsort(iqr, nchild, sizeof(*iqr), cmp_ul);
+
+		/*
+		 * The target interval for median/mean is 16ms (fence_ns).
+		 * However, this work is evenly split across the clients so
+		 * the range (and median) of client medians may be much less
+		 * than 16ms [16/3N]. We present the median of medians to try
+		 * and avoid any instability while running in CI, at the cost
+		 * of insensitivity!
+		 */
+		igt_info("%3d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f [%.1f, %.1f], mean: %.1f ± %.2f ms, cpu: %.0f%%\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * iqr[lo], 1e-6 * iqr[hi],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)),
+			 100. * cpu_time / d_time);
+
+		igt_assert_f(iqr[nchild / 2] < 2 * result[nchild / 2],
+			     "Child frame IQR %.2fms exceeded median threshold %.2fms\n",
+			     1e-6 * iqr[nchild / 2],
+			     1e-6 * result[nchild / 2] * 2);
+
+		igt_assert_f(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
+			     3 * igt_mean_get(&m) < 4 * result[nchild / 2],
+			     "Mean of client interval %.2fms differs from median %.2fms, distribution is skewed\n",
+			     1e-6 * igt_mean_get(&m), 1e-6 * result[nchild / 2]);
+
+		igt_assert_f(result[nchild / 2] > frame_ns / 2,
+			     "Median client interval %.2fms did not match target interval %.2fms\n",
+			     1e-6 * result[nchild / 2], 1e-6 * frame_ns);
+
+		igt_assert_f(result[hi] - result[lo] < result[nchild / 2],
+			     "Interquartile range of client intervals %.2fms is as large as the median threshold %.2fms, clients are not evenly distributed!\n",
+			     1e-6 * (result[hi] - result[lo]),
+			     1e-6 * result[nchild / 2]);
+
+		/* May be slowed due to sheer volume of context switches */
+		if (result[0] > 2 * fence_ns)
+			break;
+	}
+
+	munmap(iqr, 4096);
+	munmap(result, 4096);
+	if (common)
+		gem_close(i915, common);
+}
+
+static void test_fairness(int i915, int timeout)
+{
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} fair[] = {
+		/*
+		 * none - maximal greed in each client
+		 *
+		 * Push as many frames from each client as fast as possible
+		 */
+		{ "none",       0 },
+		{ "none-vip",   F_VIP }, /* one vip client must meet deadlines */
+		{ "none-solo",  F_SOLO }, /* 1 batch per frame per client */
+		{ "none-share", F_SHARE }, /* read from a common buffer */
+		{ "none-rrul",  F_RRUL }, /* "realtime-response under load" */
+		{ "none-ping",  F_PING }, /* measure inter-engine fairness */
+
+		/*
+		 * throttle - original per client throttling
+		 *
+		 * Used for front-buffer rendering where there is no
+		 * external frame marker. Each client tries to only keep
+		 * 20ms of work submitted, though that measurement is
+		 * flawed...
+		 *
+		 * This is used by Xorg to try and maintain some resemblance
+		 * of input/output consistency when being fed a continuous
+		 * stream of X11 draw requests straight into scanout, where
+		 * the clients may submit the work faster than it can be drawn.
+		 *
+		 * Throttling tracks requests per-file (and assumes that
+		 * all requests are in submission order across the whole file),
+		 * so we split each child to its own fd.
+		 */
+		{ "throttle",       F_THROTTLE | F_ISOLATE },
+		{ "throttle-vip",   F_THROTTLE | F_ISOLATE | F_VIP },
+		{ "throttle-solo",  F_THROTTLE | F_ISOLATE | F_SOLO },
+		{ "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE },
+		{ "throttle-rrul",  F_THROTTLE | F_ISOLATE | F_RRUL },
+
+		/*
+		 * pace - mesa "submit double buffering"
+		 *
+		 * Submit a frame, wait for previous frame to start. This
+		 * prevents each client from getting too far ahead of its
+		 * rendering, maintaining a consistent input/output latency.
+		 */
+		{ "pace",       F_PACE },
+		{ "pace-solo",  F_PACE | F_SOLO },
+		{ "pace-share", F_PACE | F_SOLO | F_SHARE },
+		{ "pace-ping",  F_PACE | F_SOLO | F_SHARE | F_PING},
+
+		/* sync - only submit a frame at a time */
+		{ "sync",      F_SYNC },
+		{ "sync-vip",  F_SYNC | F_VIP },
+		{ "sync-solo", F_SYNC | F_SOLO },
+
+		/* flow - synchronise execution against the clock (vblank) */
+		{ "flow",       F_PACE | F_FLOW },
+		{ "flow-solo",  F_PACE | F_FLOW | F_SOLO },
+		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
+		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },
+
+		/* next - submit ahead of the clock (vblank double buffering) */
+		{ "next",       F_PACE | F_FLOW | F_NEXT },
+		{ "next-solo",  F_PACE | F_FLOW | F_NEXT | F_SOLO },
+		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
+		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },
+
+		/* spare - underutilise by a single client timeslice */
+		{ "spare",      F_PACE | F_FLOW | F_SPARE },
+		{ "spare-solo", F_PACE | F_FLOW | F_SPARE | F_SOLO },
+
+		/* half - run at half pace (submit 16ms of work every 32ms) */
+		{ "half",       F_PACE | F_FLOW | F_HALF },
+		{ "half-solo",  F_PACE | F_FLOW | F_HALF | F_SOLO },
+
+		{}
+	};
+
+	igt_fixture {
+		igt_info("CS timestamp frequency: %d\n",
+			 read_timestamp_frequency(i915));
+
+		igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	}
+
+	for (typeof(*fair) *f = fair; f->name; f++) {
+		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
+			const struct intel_execution_engine2 *e;
+
+			__for_each_physical_engine(i915, e) {
+				if (!gem_class_can_store_dword(i915, e->class))
+					continue;
+
+				igt_dynamic_f("%s", e->name)
+					fairness(i915, e, timeout, f->flags);
+			}
+		}
+	}
+}
+
 static uint32_t read_ctx_timestamp(int i915,
 				   uint32_t ctx,
 				   const struct intel_execution_engine2 *e)
@@ -2789,6 +3582,10 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		igt_subtest_group {
+			test_fairness(fd, 2);
+		}
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.29.2


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-12-10  2:09 Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-12-10  2:09 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

v2: igt_assert_f and more commentary; exclude vip from client stats,
include range of frame intervals from each individual client
v3: Write down what the test actually does!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
 tests/i915/gem_exec_schedule.c | 954 +++++++++++++++++++++++++++++++++
 1 file changed, 954 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index f23d63ac3..67cf88e72 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -29,6 +29,7 @@
 #include <sys/poll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sched.h>
 #include <signal.h>
@@ -2516,6 +2517,926 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ctx_ticks(int i915, uint64_t ns)
+{
+	int f = read_timestamp_frequency(i915);
+	if (intel_gen(intel_get_drm_devid(i915)) == 11)
+		f = 12500000; /* icl!!! are you feeling alright? CTX vs CS */
+	return div64_u64_round_up(ns * f, NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define RUNTIME (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	/* Loop until CTX_TIMESTAMP - initial > @ns */
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(START_TS);
+
+	while (offset_in_page(cs) & 63)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(NOW_TS);
+
+	/* delta = now - start; inverted to match COND_BBE */
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	/* Save delta for reading by COND_BBE */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_IMM;
+		if (use_64b) {
+			*cs++ = addr + 4064;
+			*cs++ = addr >> 32;
+		} else {
+			*cs++ = 0;
+			*cs++ = addr + 4064;
+		}
+		*cs++ = 0;
+	}
+
+	/* Break if delta [time elapsed] > ns */
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ctx_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Otherwise back to recalculating delta */
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { INC, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	/* Load the address + inc & mask variables */
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	/* Increment the [ring] address for saving CS_TIMESTAMP */
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	/* Rewrite the batch buffer for the next execution */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e)
+{
+	const int gen = intel_gen(intel_get_drm_devid(i915));
+
+	if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO)
+		return false; /* looks fubar */
+
+	return true;
+}
+
+static struct intel_execution_engine2
+pick_random_engine(int i915, const struct intel_execution_engine2 *not)
+{
+	const struct intel_execution_engine2 *e;
+	unsigned int count = 0;
+
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		count++;
+	}
+	if (!count)
+		return *not;
+
+	count = rand() % count;
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		if (!count--)
+			break;
+	}
+
+	return *e;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeline,
+		       uint32_t common,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *median,
+		       unsigned long *iqr)
+#define F_SYNC		(1 << 0)
+#define F_PACE		(1 << 1)
+#define F_FLOW		(1 << 2)
+#define F_HALF		(1 << 3)
+#define F_SOLO		(1 << 4)
+#define F_SPARE		(1 << 5)
+#define F_NEXT		(1 << 6)
+#define F_VIP		(1 << 7)
+#define F_RRUL		(1 << 8)
+#define F_SHARE		(1 << 9)
+#define F_PING		(1 << 10)
+#define F_THROTTLE	(1 << 11)
+#define F_ISOLATE	(1 << 12)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 obj[4] = {
+		{},
+		{
+			.handle = common ?: gem_create(i915, 4096),
+		},
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+	};
+	struct intel_execution_engine2 ping = *e;
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	int n;
+
+	srandom(getpid());
+	if (flags & F_PING)
+		ping = pick_random_engine(i915, e);
+	obj[0] = tslog_create(i915, ctx, &ping);
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(obj),
+			.buffer_count = 3,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			unsigned int seq;
+
+			seq = count;
+			if (flags & F_NEXT)
+				seq++;
+
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, seq);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+		close(execbuf.rsvd2);
+
+		execbuf.buffer_count = 1;
+		execbuf.batch_start_offset = 2048;
+		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
+		execbuf.rsvd2 = n_fence;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		if (flags & F_THROTTLE)
+			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
+
+		igt_swap(obj[2], obj[3]);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, obj[3].handle);
+	gem_close(i915, obj[2].handle);
+	if (obj[1].handle != common)
+		gem_close(i915, obj[1].handle);
+
+	gem_sync(i915, obj[0].handle);
+	if (median) {
+		uint32_t *map;
+
+		/*
+		 * We recorded the CS_TIMESTAMP of each frame, and if
+		 * the GPU is being shared completely fairly, we expect
+		 * each frame to be at the same interval from the last.
+		 *
+		 * Compute the interval between frames and report back
+		 * both the median interval and the range for this client.
+		 */
+
+		map = gem_mmap__device_coherent(i915, obj[0].handle,
+						0, 4096, PROT_WRITE);
+		for (n = 1; n < min(count, 512); n++) {
+			igt_assert(map[n]);
+			map[n - 1] = map[n] - map[n - 1];
+		}
+		qsort(map, --n, sizeof(*map), cmp_u32);
+		*iqr = ticks_to_ns(i915, map[(3 * n + 3) / 4] - map[n / 4]);
+		*median = ticks_to_ns(i915, map[n / 2]);
+		munmap(map, 4096);
+	}
+	gem_close(i915, obj[0].handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
+{
+	uint64_t cpu_time = 0;
+
+	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
+
+	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
+
+	return cpu_time;
+}
+
+static void timeline_advance(int timeline, int delay_ns)
+{
+	struct timespec tv = { .tv_nsec = delay_ns };
+	nanosleep(&tv, NULL);
+	sw_sync_timeline_inc(timeline, 1);
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result, *iqr;
+	uint32_t common = 0;
+
+	igt_require(has_ctx_timestamp(i915, e));
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	if (flags & F_SHARE)
+		common = gem_create(i915, 4095);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	igt_assert(result != MAP_FAILED);
+	iqr = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	igt_assert(iqr != MAP_FAILED);
+
+	/*
+	 * The combined workload always runs at a 60fps target (unless F_HALF!).
+	 * This gives a frame of interval of 16ms that is evenly split across
+	 * all the clients, so simulating a system with a bunch of clients that
+	 * are perfectly balanced and can sustain 60fps. Our job is to ensure
+	 * that each client does run at a smooth 60fps.
+	 *
+	 * Each client runs a fixed length delay loop (as a single request,
+	 * or split into 3) and then records the CS_TIMESTAMP after completing
+	 * its delay. Given a fair allotment of GPU time to each client,
+	 * that timestamp will [ideally] be at precise 16ms intervals.
+	 * In practice, time is wasted on context switches, so as the number
+	 * of clients increases, the proportion of time spent on context
+	 * switches grows. As we get to 64 render clients, we will be spending
+	 * as much time in context switches as executing the client workloads.
+	 *
+	 * Each client frame may be paced by some throttling technique found
+	 * in the wild, e.g. each client may wait until a simulated vblank
+	 * to indicate the start of a new frame, or it may wait until the
+	 * completion of a previous frame. This causes submission from each
+	 * client and across the system to be chunky and uneven.
+	 *
+	 * We look at the variation of frame intervals within each client, and
+	 * the variation of the medians across the clients to see if the
+	 * distribution (budget) of GPU time was fair enough.
+	 *
+	 * Alternative (and important) metrics will be more latency centric,
+	 * looking at how well we can sustain meeting deadlines given the
+	 * competition between clients for the GPU.
+	 */
+
+	for (int n = 2; n <= 256; n <<= 1) { /* 32 == 500us per client */
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		int nchild = n - 1; /* odd for easy medians */
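+		/* F_SPARE: budget for one extra client, leaving a slice unused */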
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct rusage old_usage, usage;
+		uint64_t cpu_time, d_time;
+		struct timespec tv;
+		struct igt_mean m;
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+
+		if (flags & F_PING) { /* fill the others with light bg load */
+			struct intel_execution_engine2 *ping;
+
+			__for_each_physical_engine(i915, ping) {
+				if (ping->flags == e->flags)
+					continue;
+
+				igt_fork(child, 1) {
+					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+					fair_child(i915, ctx, ping,
+						   child_ns / 8,
+						   -1, common,
+						   F_SOLO | F_PACE | F_SHARE,
+						   &result[nchild],
+						   NULL, NULL);
+
+					gem_context_destroy(i915, ctx);
+				}
+			}
+		}
+
+		getrusage(RUSAGE_CHILDREN, &old_usage);
+		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
+		igt_fork(child, nchild) {
+			uint32_t ctx;
+
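+			/*
+			 * F_ISOLATE: give this child its own fd, so that any
+			 * per-file throttling is tracked independently, and
+			 * re-import the shared buffer via dmabuf.
+			 */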
+			if (flags & F_ISOLATE) {
+				int clone, dmabuf = -1;
+
+				if (common)
+					dmabuf = prime_handle_to_fd(i915, common);
+
+				clone = gem_reopen_driver(i915);
+				gem_context_copy_engines(i915, 0, clone, 0);
+				i915 = clone;
+
+				if (dmabuf != -1)
+					common = prime_fd_to_handle(i915, dmabuf);
+			}
+
+			ctx = gem_context_clone_with_engines(i915, 0);
+
+			if (flags & F_VIP && child == 0) {
+				gem_context_set_priority(i915, ctx, MAX_PRIO);
+				flags |= F_FLOW;
+			}
+			if (flags & F_RRUL && child == 0)
+				flags |= F_SOLO | F_FLOW | F_SYNC;
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeline, common, flags,
+				   &result[nchild],
+				   &result[child], &iqr[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--)
+			timeline_advance(timeline, fence_ns);
+
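+		/* result[nchild] is the stop flag each fair_child() polls as *ctl */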
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child]))
+				timeline_advance(timeline, fence_ns);
+		}
+
+		igt_waitchildren();
+		close(timeline);
+
+		/*
+		 * Are we running out of CPU time, and failing to submit frames?
+		 *
+		 * We try to rule out any undue impact on the GPU scheduling
+		 * from the CPU scheduler by looking for core saturation. If
+		 * the clients + kernel are saturating a whole core (think
+		 * lockdep), then it is increasingly
+		 * likely that our measurements include delays from the CPU
+		 * scheduler. Err on the side of caution.
+		 */
+		d_time = igt_nsec_elapsed(&tv);
+		getrusage(RUSAGE_CHILDREN, &usage);
+		cpu_time = d_cpu_time(&usage, &old_usage);
+		igt_debug("CPU usage: %.0f%%\n", 100. * cpu_time / d_time);
+		if (4 * cpu_time > 3 * d_time) {
+			if (nchild > 7) /* good enough to judge pass/fail */
+				break;
+
+			igt_skip_on_f(4 * cpu_time > 3 * d_time,
+				      "%.0f%% CPU usage, presuming capacity exceeded\n",
+				      100. * cpu_time / d_time);
+		}
+
+		/* With no contention, we should match our target frametime */
+		if (nchild == 1) {
+			igt_assert(4 * result[0] > 3 * fence_ns &&
+				   3 * result[0] < 4 * fence_ns);
+			continue;
+		}
+
+		/*
+		 * The VIP should always be able to hit the target frame rate,
+		 * regardless of budget contention from lesser clients.
+		 */
+		if (flags & (F_VIP | F_RRUL)) {
+			igt_info("VIP interval %.2fms, range %.2fms\n",
+				 1e-6 * result[0], 1e-6 * iqr[0]);
+			igt_assert_f(4 * result[0] > 3 * fence_ns &&
+				     3 * result[0] < 4 * fence_ns,
+				     "VIP expects to run exactly when it wants, an interval of %.2fms, was %.2fms\n",
+				     1e-6 * fence_ns, 1e-6 * result[0]);
+			igt_assert_f(2 * iqr[0] < result[0],
+				     "VIP frame IQR %.2fms exceeded median threshold %.2fms\n",
+				     1e-6 * iqr[0],
+				     1e-6 * result[0] / 2);
+			if (!--nchild)
+				continue;
+
+			/* Exclude the VIP result from the plebeian statistics */
+			memmove(result, result + 1, nchild * sizeof(*result));
+			memmove(iqr, iqr + 1, nchild * sizeof(*iqr));
+		}
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		qsort(iqr, nchild, sizeof(*iqr), cmp_ul);
+
+		/*
+		 * The target interval for median/mean is 16ms (fence_ns).
+		 * However, this work is evenly split across the clients so
+		 * the range (and median) of client medians may be much less
+		 * than 16ms [16/3N]. We present median of medians to try
+		 * and avoid any instability while running in CI; at the cost
+		 * of insensitivity!
+		 */
+		igt_info("%3d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f [%.1f, %.1f], mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * iqr[lo], 1e-6 * iqr[hi],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+		igt_assert_f(iqr[nchild / 2] < 2 * result[nchild / 2],
+			     "Child frame IQR %.2fms exceeded median threshold %.2fms\n",
+			     1e-6 * iqr[nchild / 2],
+			     1e-6 * result[nchild / 2] * 2);
+
+		igt_assert_f(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
+			     3 * igt_mean_get(&m) < 4 * result[nchild / 2],
+			     "Mean of client interval %.2fms differs from median %.2fms, distribution is skewed\n",
+			     1e-6 * igt_mean_get(&m), 1e-6 * result[nchild / 2]);
+
+		igt_assert_f(2 * (result[hi] - result[lo]) < result[nchild / 2],
+			     "Interquartile range of client intervals %.2fms is as large as the median threshold %.2fms, clients are not evenly distributed!\n",
+			     1e-6 * (result[hi] - result[lo]),
+			     1e-6 * result[nchild / 2] / 2);
+
+		/* May be slowed due to sheer volume of context switches */
+		if (result[0] > 2 * fence_ns)
+			break;
+	}
+
+	munmap(iqr, 4096);
+	munmap(result, 4096);
+	if (common)
+		gem_close(i915, common);
+}
+
+static void test_fairness(int i915, int timeout)
+{
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} fair[] = {
+		/*
+		 * none - maximal greed in each client
+		 *
+		 * Push as many frames from each client as fast as possible
+		 */
+		{ "none",       0 },
+		{ "none-vip",   F_VIP }, /* one vip client must meet deadlines */
+		{ "none-solo",  F_SOLO }, /* 1 batch per frame per client */
+		{ "none-share", F_SHARE }, /* read from a common buffer */
+		{ "none-rrul",  F_RRUL }, /* "realtime-response under load" */
+		{ "none-ping",  F_PING }, /* measure inter-engine fairness */
+
+		/*
+		 * throttle - original per client throttling
+		 *
+		 * Used for front-buffer rendering where there is no
+		 * external frame marker. Each client tries to only keep
+		 * 20ms of work submitted, though that measurement is
+		 * flawed...
+		 *
+		 * This is used by Xorg to try to maintain some resemblance
+		 * of input/output consistency when being fed a continuous
+		 * stream of X11 draw requests straight into scanout, where
+		 * the clients may submit work faster than it can be drawn.
+		 *
+		 * Throttling tracks requests per-file (and assumes that
+		 * all requests are in submission order across the whole file),
+		 * so we split each child to its own fd.
+		 */
+		{ "throttle",       F_THROTTLE | F_ISOLATE },
+		{ "throttle-vip",   F_THROTTLE | F_ISOLATE | F_VIP },
+		{ "throttle-solo",  F_THROTTLE | F_ISOLATE | F_SOLO },
+		{ "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE },
+		{ "throttle-rrul",  F_THROTTLE | F_ISOLATE | F_RRUL },
+
+		/*
+		 * pace - mesa "submit double buffering"
+		 *
+		 * Submit a frame, wait for previous frame to start. This
+		 * prevents each client from getting too far ahead of its
+		 * rendering, maintaining a consistent input/output latency.
+		 */
+		{ "pace",       F_PACE },
+		{ "pace-solo",  F_PACE | F_SOLO },
+		{ "pace-share", F_PACE | F_SOLO | F_SHARE },
+		{ "pace-ping",  F_PACE | F_SOLO | F_SHARE | F_PING},
+
+		/* sync - only submit a frame at a time */
+		{ "sync",      F_SYNC },
+		{ "sync-vip",  F_SYNC | F_VIP },
+		{ "sync-solo", F_SYNC | F_SOLO },
+
+		/* flow - synchronise execution against the clock (vblank) */
+		{ "flow",       F_PACE | F_FLOW },
+		{ "flow-solo",  F_PACE | F_FLOW | F_SOLO },
+		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
+		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },
+
+		/* next - submit ahead of the clock (vblank double buffering) */
+		{ "next",       F_PACE | F_FLOW | F_NEXT },
+		{ "next-solo",  F_PACE | F_FLOW | F_NEXT | F_SOLO },
+		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
+		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },
+
+		/* spare - underutilise by a single client timeslice */
+		{ "spare",      F_PACE | F_FLOW | F_SPARE },
+		{ "spare-solo", F_PACE | F_FLOW | F_SPARE | F_SOLO },
+
+		/* half - run at half pace (submit 16ms of work every 32ms) */
+		{ "half",       F_PACE | F_FLOW | F_HALF },
+		{ "half-solo",  F_PACE | F_FLOW | F_HALF | F_SOLO },
+
+		{}
+	};
+
+	igt_fixture {
+		igt_info("CS timestamp frequency: %d\n",
+			 read_timestamp_frequency(i915));
+
+		igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	}
+
+	for (typeof(*fair) *f = fair; f->name; f++) {
+		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
+			const struct intel_execution_engine2 *e;
+
+			__for_each_physical_engine(i915, e) {
+				if (!gem_class_can_store_dword(i915, e->class))
+					continue;
+
+				igt_dynamic_f("%s", e->name)
+					fairness(i915, e, timeout, f->flags);
+			}
+		}
+	}
+}
+
+static uint32_t read_ctx_timestamp(int i915,
+				   uint32_t ctx,
+				   const struct intel_execution_engine2 *e)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = gem_create(i915, 4096),
+		.offset = 32 << 20,
+		.relocs_ptr = to_user_pointer(&reloc),
+		.relocation_count = 1,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.flags = e->flags,
+		.rsvd1 = ctx,
+	};
+#define RUNTIME (base + 0x3a8)
+	uint32_t *map, *cs;
+	uint32_t ts;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, obj.handle,
+					     0, 4096, PROT_WRITE);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = RUNTIME;
+	memset(&reloc, 0, sizeof(reloc));
+	reloc.target_handle = obj.handle;
+	reloc.presumed_offset = obj.offset;
+	reloc.offset = offset_in_page(cs);
+	reloc.delta = 4000;
+	*cs++ = obj.offset + 4000;
+	*cs++ = obj.offset >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+	gem_close(i915, obj.handle);
+
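+	/* the SRM stored CTX_TIMESTAMP at byte 4000, i.e. dword 1000 */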
+	ts = map[1000];
+	munmap(map, 4096);
+
+	return ts;
+}
+
+static void fairslice(int i915,
+		      const struct intel_execution_engine2 *e,
+		      unsigned long flags)
+{
+	igt_spin_t *spin = NULL;
+	uint32_t ctx[3];
+	uint32_t ts[3];
+
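+	/*
+	 * Submit the same spinning batch from three contexts so that all
+	 * three must share the engine through timeslicing.
+	 */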
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		ctx[i] = gem_context_clone_with_engines(i915, 0);
+		if (spin == NULL) {
+			spin = __igt_spin_new(i915,
+					      .ctx = ctx[i],
+					      .engine = e->flags,
+					      .flags = flags);
+		} else {
+			struct drm_i915_gem_execbuffer2 eb = {
+				.buffer_count = 1,
+				.buffers_ptr = to_user_pointer(&spin->obj[IGT_SPIN_BATCH]),
+				.flags = e->flags,
+				.rsvd1 = ctx[i],
+			};
+			gem_execbuf(i915, &eb);
+		}
+	}
+
+	sleep(2); /* over the course of many timeslices */
+
+	igt_assert(gem_bo_busy(i915, spin->handle));
+	igt_spin_end(spin);
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
+		ts[i] = read_ctx_timestamp(i915, ctx[i], e);
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
+		gem_context_destroy(i915, ctx[i]);
+	igt_spin_free(i915, spin);
+
+	qsort(ts, 3, sizeof(*ts), cmp_u32);
+	igt_info("%s: [%.1f, %.1f, %.1f] ms\n", e->name,
+		 1e-6 * ticks_to_ns(i915, ts[0]),
+		 1e-6 * ticks_to_ns(i915, ts[1]),
+		 1e-6 * ticks_to_ns(i915, ts[2]));
+
+	igt_assert_f(ts[2], "CTX_TIMESTAMP not reported!\n");
+	igt_assert_f((ts[2] - ts[0]) * 6 < ts[1],
+		     "Range of timeslices greater than tolerable: %.2fms > %.2fms; unfair!\n",
+		     1e-6 * ticks_to_ns(i915, ts[2] - ts[0]),
+		     1e-6 * ticks_to_ns(i915, ts[1]) / 6);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2582,6 +3503,35 @@ igt_main
 		test_each_engine("lateslice", fd, e)
 			lateslice(fd, e->flags);
 
+		igt_subtest_group {
+			igt_fixture {
+				igt_require(gem_scheduler_has_semaphores(fd));
+				igt_require(gem_scheduler_has_preemption(fd));
+				igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8);
+			}
+
+			test_each_engine("fairslice", fd, e)
+				fairslice(fd, e, 0);
+
+			test_each_engine("u-fairslice", fd, e)
+				fairslice(fd, e, IGT_SPIN_USERPTR);
+
+			igt_subtest("fairslice-all")  {
+				__for_each_physical_engine(fd, e) {
+					igt_fork(child, 1)
+						fairslice(fd, e, 0);
+				}
+				igt_waitchildren();
+			}
+			igt_subtest("u-fairslice-all")  {
+				__for_each_physical_engine(fd, e) {
+					igt_fork(child, 1)
+						fairslice(fd, e, IGT_SPIN_USERPTR);
+				}
+				igt_waitchildren();
+			}
+		}
+
 		test_each_engine("submit-early-slice", fd, e)
 			submit_slice(fd, e, EARLY_SUBMIT);
 		test_each_engine("submit-golden-slice", fd, e)
@@ -2610,6 +3560,10 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		igt_subtest_group {
+			test_fairness(fd, 2);
+		}
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.29.2


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-08-03 13:57 Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-08-03 13:57 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in each client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
 tests/i915/gem_exec_schedule.c | 816 +++++++++++++++++++++++++++++++++
 1 file changed, 816 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 488d93511..7c8ea6d70 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -29,6 +29,7 @@
 #include <sys/poll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sched.h>
 #include <signal.h>
@@ -2503,6 +2504,800 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define RUNTIME (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	/* Loop until CTX_TIMESTAMP - initial > @ns */
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(START_TS);
+
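+	/* 64B-align the loop entry, the MI_BATCH_BUFFER_START target below */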
+	while (offset_in_page(cs) & 63)
+		*cs++ = 0;
+	jmp = cs;
+
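+	/* an arbitration point, keeping the delay loop preemptible */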
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(NOW_TS);
+
+	/* delta = now - start; inverted to match COND_BBE */
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	/* Save delta for reading by COND_BBE */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_IMM;
+		if (use_64b) {
+			*cs++ = addr + 4064;
+			*cs++ = addr >> 32;
+		} else {
+			*cs++ = 0;
+			*cs++ = addr + 4064;
+		}
+		*cs++ = 0;
+	}
+
+	/* Break if delta > ns */
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Otherwise back to recalculating delta */
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { INC, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	/* Load the address + inc & mask variables */
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC) + 4;
+	*cs++ = 0;
+
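+	/* clearing bit 11 wraps the write address inside the 2KiB journal */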
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	/* Increment the [ring] address for saving CS_TIMESTAMP */
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	/* Rewrite the batch buffer for the next execution */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e)
+{
+	const int gen = intel_gen(intel_get_drm_devid(i915));
+
+	if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO)
+		return false; /* looks fubar */
+
+	return true;
+}
+
+static struct intel_execution_engine2
+pick_random_engine(int i915, const struct intel_execution_engine2 *not)
+{
+	const struct intel_execution_engine2 *e;
+	unsigned int count = 0;
+
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		count++;
+	}
+	if (!count)
+		return *not;
+
+	count = rand() % count;
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		if (!count--)
+			break;
+	}
+
+	return *e;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeline,
+		       uint32_t common,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC		(1 << 0)
+#define F_PACE		(1 << 1)
+#define F_FLOW		(1 << 2)
+#define F_HALF		(1 << 3)
+#define F_SOLO		(1 << 4)
+#define F_SPARE		(1 << 5)
+#define F_NEXT		(1 << 6)
+#define F_VIP		(1 << 7)
+#define F_RRUL		(1 << 8)
+#define F_SHARE		(1 << 9)
+#define F_PING		(1 << 10)
+#define F_THROTTLE	(1 << 11)
+#define F_ISOLATE	(1 << 12)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
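+	/*
+	 * obj[0]: CS_TIMESTAMP journal (filled in below);
+	 * obj[1]: scratch buffer, optionally shared between clients;
+	 * obj[2], obj[3]: a pair of delay batches, swapped every frame.
+	 */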
+	struct drm_i915_gem_exec_object2 obj[4] = {
+		{},
+		{
+			.handle = common ?: gem_create(i915, 4096),
+		},
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+	};
+	struct intel_execution_engine2 ping = *e;
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	int n;
+
+	srandom(getpid());
+	if (flags & F_PING)
+		ping = pick_random_engine(i915, e);
+	obj[0] = tslog_create(i915, ctx, &ping);
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(obj),
+			.buffer_count = 4,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
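+		/*
+		 * F_FLOW: gate each frame on the shared sw_sync timeline,
+		 * our simulated vblank; with F_NEXT the frame targets the
+		 * following tick instead (vblank double buffering).
+		 */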
+		if (flags & F_FLOW) {
+			unsigned int seq;
+
+			seq = count;
+			if (flags & F_NEXT)
+				seq++;
+
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, seq);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+		close(execbuf.rsvd2);
+
+		execbuf.buffer_count = 1;
+		execbuf.batch_start_offset = 2048;
+		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
+		execbuf.rsvd2 = n_fence;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		if (flags & F_THROTTLE)
+			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
+
+		igt_swap(obj[2], obj[3]);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, obj[3].handle);
+	gem_close(i915, obj[2].handle);
+	if (obj[1].handle != common)
+		gem_close(i915, obj[1].handle);
+
+	gem_sync(i915, obj[0].handle);
+	if (out) {
+		uint32_t *map;
+
+		map = gem_mmap__device_coherent(i915, obj[0].handle,
+						0, 4096, PROT_WRITE);
+		for (n = 1; n < min(count, 512); n++) {
+			igt_assert(map[n]);
+			map[n - 1] = map[n] - map[n - 1];
+		}
+		qsort(map, --n, sizeof(*map), cmp_u32);
+		*out = ticks_to_ns(i915, map[n / 2]);
+		munmap(map, 4096);
+	}
+	gem_close(i915, obj[0].handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
+{
+	uint64_t cpu_time = 0;
+
+	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
+
+	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
+
+	return cpu_time;
+}
+
+static void timeline_advance(int timeline, int delay_ns)
+{
+	struct timespec tv = { .tv_nsec = delay_ns };
+	nanosleep(&tv, NULL);
+	sw_sync_timeline_inc(timeline, 1);
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+	uint32_t common = 0;
+
+	igt_require(has_ctx_timestamp(i915, e));
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	if (flags & F_SHARE)
+		common = gem_create(i915, 4095);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct rusage old_usage, usage;
+		uint64_t cpu_time, d_time;
+		unsigned long vip = -1;
+		struct timespec tv;
+		struct igt_mean m;
+
+		if (flags & F_PING) {
+			struct intel_execution_engine2 *ping;
+
+			__for_each_physical_engine(i915, ping) {
+				if (ping->flags == e->flags)
+					continue;
+
+				igt_fork(child, 1) {
+					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+					fair_child(i915, ctx, ping,
+						   child_ns / 8,
+						   -1, common,
+						   F_SOLO | F_PACE | F_SHARE,
+						   &result[nchild],
+						   NULL);
+
+					gem_context_destroy(i915, ctx);
+				}
+			}
+		}
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		getrusage(RUSAGE_CHILDREN, &old_usage);
+		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
+		igt_fork(child, nchild) {
+			uint32_t ctx;
+
+			if (flags & F_ISOLATE) {
+				int clone, dmabuf = -1;
+
+				if (common)
+					dmabuf = prime_handle_to_fd(i915, common);
+
+				clone = gem_reopen_driver(i915);
+				gem_context_copy_engines(i915, 0, clone, 0);
+				i915 = clone;
+
+				if (dmabuf != -1)
+					common = prime_fd_to_handle(i915, dmabuf);
+			}
+
+			ctx = gem_context_clone_with_engines(i915, 0);
+
+			if (flags & F_VIP && child == 0) {
+				gem_context_set_priority(i915, ctx, MAX_PRIO);
+				flags |= F_FLOW;
+			}
+			if (flags & F_RRUL && child == 0)
+				flags |= F_SOLO | F_FLOW | F_SYNC;
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeline, common, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--)
+			timeline_advance(timeline, fence_ns);
+
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child]))
+				timeline_advance(timeline, fence_ns);
+		}
+
+		igt_waitchildren();
+		close(timeline);
+
+		/* Are we running out of CPU time, and failing to submit frames? */
+		d_time = igt_nsec_elapsed(&tv);
+		getrusage(RUSAGE_CHILDREN, &usage);
+		cpu_time = d_cpu_time(&usage, &old_usage);
+		if (10 * cpu_time > 9 * d_time) {
+			if (nchild > 7)
+				break;
+
+			igt_skip_on_f(10 * cpu_time > 9 * d_time,
+				      "%.0f%% CPU usage, presuming capacity exceeded\n",
+				      100. * cpu_time / d_time);
+		}
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		if (flags & (F_VIP | F_RRUL))
+			vip = result[0];
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%2d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+		if (vip != -1) {
+			igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
+			igt_assert(4 * vip > 3 * fence_ns &&
+				   3 * vip < 4 * fence_ns);
+		}
+
+		/* May be slowed due to sheer volume of context switches */
+		igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
+			       igt_mean_get(&m) < 3 * fence_ns);
+
+		igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
+			   3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
+
+		igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);
+	}
+
+	munmap(result, 4096);
+	if (common)
+		gem_close(i915, common);
+}
+
+static void test_fairness(int i915, int timeout)
+{
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} fair[] = {
+		/*
+		 * none - maximal greed in each client
+		 *
+		 * Push as many frames from each client as fast as possible
+		 */
+		{ "none",       0 },
+		{ "none-vip",   F_VIP }, /* one vip client must meet deadlines */
+		{ "none-solo",  F_SOLO }, /* 1 batch per frame per client */
+		{ "none-share", F_SHARE }, /* read from a common buffer */
+		{ "none-rrul",  F_RRUL }, /* "realtime-response under load" */
+		{ "none-ping",  F_PING }, /* measure inter-engine fairness */
+
+		/*
+		 * throttle - original per client throttling
+		 *
+		 * Used for front-buffer rendering where there is no
+		 * external frame marker. Each client tries to only keep
+		 * 20ms of work submitted, though that measurement is
+		 * flawed...
+		 *
+		 * This is used by Xorg to try to maintain some resemblance
+		 * of input/output consistency when being fed a continuous
+		 * stream of X11 draw requests straight into scanout, where
+		 * the clients may submit work faster than it can be drawn.
+		 *
+		 * Throttling tracks requests per-file (and assumes that
+		 * all requests are in submission order across the whole file),
+		 * so we split each child to its own fd.
+		 */
+		{ "throttle",       F_THROTTLE | F_ISOLATE },
+		{ "throttle-vip",   F_THROTTLE | F_ISOLATE | F_VIP },
+		{ "throttle-solo",  F_THROTTLE | F_ISOLATE | F_SOLO },
+		{ "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE },
+		{ "throttle-rrul",  F_THROTTLE | F_ISOLATE | F_RRUL },
+
+		/*
+		 * pace - mesa "submit double buffering"
+		 *
+		 * Submit a frame, wait for previous frame to start. This
+		 * prevents each client from getting too far ahead of its
+		 * rendering, maintaining a consistent input/output latency.
+		 */
+		{ "pace",       F_PACE },
+		{ "pace-solo",  F_PACE | F_SOLO},
+		{ "pace-share", F_PACE | F_SHARE},
+		{ "pace-ping",  F_PACE | F_SHARE | F_PING},
+
+		/* sync - only submit a frame at a time */
+		{ "sync",      F_SYNC },
+		{ "sync-vip",  F_SYNC | F_VIP },
+		{ "sync-solo", F_SYNC | F_SOLO },
+
+		/* flow - synchronise execution against the clock (vblank) */
+		{ "flow",       F_PACE | F_FLOW },
+		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
+		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },
+
+		/* next - submit ahead of the clock (vblank double buffering) */
+		{ "next",       F_PACE | F_FLOW | F_NEXT },
+		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
+		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },
+
+		/* spare - underutilise by a single client timeslice */
+		{ "spare", F_PACE | F_FLOW | F_SPARE },
+
+		/* half - run at half pace (submit 16ms of work every 32ms) */
+		{ "half",  F_PACE | F_FLOW | F_HALF },
+
+		{}
+	};
+
+	for (typeof(*fair) *f = fair; f->name; f++) {
+		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
+			const struct intel_execution_engine2 *e;
+
+			igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+
+			__for_each_physical_engine(i915, e) {
+				if (!gem_class_can_store_dword(i915, e->class))
+					continue;
+
+				igt_dynamic_f("%s", e->name)
+					fairness(i915, e, timeout, f->flags);
+			}
+		}
+	}
+}
+
+static uint32_t read_ctx_timestamp(int i915,
+				   uint32_t ctx,
+				   const struct intel_execution_engine2 *e)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = gem_create(i915, 4096),
+		.offset = 32 << 20,
+		.relocs_ptr = to_user_pointer(&reloc),
+		.relocation_count = 1,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.flags = e->flags,
+		.rsvd1 = ctx,
+	};
+#define RUNTIME (base + 0x3a8)
+	uint32_t *map, *cs;
+	uint32_t ts;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, obj.handle,
+					     0, 4096, PROT_WRITE);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = RUNTIME;
+	memset(&reloc, 0, sizeof(reloc));
+	reloc.target_handle = obj.handle;
+	reloc.presumed_offset = obj.offset;
+	reloc.offset = offset_in_page(cs);
+	reloc.delta = 4000;
+	*cs++ = obj.offset + 4000;
+	*cs++ = obj.offset >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+	gem_close(i915, obj.handle);
+
+	ts = map[1000];
+	munmap(map, 4096);
+
+	return ts;
+}
+
+static void fairslice(int i915, const struct intel_execution_engine2 *e)
+{
+	igt_spin_t *spin[3];
+	uint32_t ctx[3];
+	uint32_t ts[3];
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		ctx[i] = gem_context_clone_with_engines(i915, 0);
+		spin[i] = igt_spin_new(i915, .ctx = ctx[i], .engine = e->flags);
+	}
+
+	sleep(2); /* over the course of many timeslices */
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		igt_assert(gem_bo_busy(i915, spin[i]->handle));
+		igt_spin_end(spin[i]);
+
+		ts[i] = read_ctx_timestamp(i915, ctx[i], e);
+	}
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		igt_spin_free(i915, spin[i]);
+		gem_context_destroy(i915, ctx[i]);
+	}
+
+	qsort(ts, 3, sizeof(*ts), cmp_u32);
+	igt_info("%s: [%.1f, %.1f] ms\n", e->name,
+		 1e-6 * ticks_to_ns(i915, ts[0]),
+		 1e-6 * ticks_to_ns(i915, ts[2]));
+
+	igt_assert(ts[0] && ts[2] > ts[0]);
+	igt_assert(4 * ts[0] > 3 * ts[2]);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2569,6 +3364,25 @@ igt_main
 		test_each_engine("lateslice", fd, e)
 			lateslice(fd, e->flags);
 
+		igt_subtest_group {
+			igt_fixture {
+				igt_require(gem_scheduler_has_semaphores(fd));
+				igt_require(gem_scheduler_has_preemption(fd));
+				igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8);
+			}
+
+			test_each_engine("fairslice", fd, e)
+				fairslice(fd, e);
+
+			igt_subtest("fairslice-all")  {
+				__for_each_physical_engine(fd, e) {
+					igt_fork(child, 1)
+						fairslice(fd, e);
+				}
+				igt_waitchildren();
+			}
+		}
+
 		test_each_engine("submit-early-slice", fd, e)
 			submit_slice(fd, e, EARLY_SUBMIT);
 		test_each_engine("submit-golden-slice", fd, e)
@@ -2597,6 +3411,8 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_fairness(fd, 2);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.28.0


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-06-22 19:08 Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-22 19:08 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in each client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
 tests/i915/gem_exec_schedule.c | 782 +++++++++++++++++++++++++++++++++
 1 file changed, 782 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 931b1245f..fae04536c 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -29,6 +29,7 @@
 #include <sys/poll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sched.h>
 #include <signal.h>
@@ -2501,6 +2502,766 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define RUNTIME (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	/* Loop until CTX_TIMESTAMP - initial > @ns */
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(START_TS);
+
+	while (offset_in_page(cs) & 63)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(NOW_TS);
+
+	/* delta = now - start; inverted to match COND_BBE */
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	/* Save delta for reading by COND_BBE */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_IMM;
+		if (use_64b) {
+			*cs++ = addr + 4064;
+			*cs++ = addr >> 32;
+		} else {
+			*cs++ = 0;
+			*cs++ = addr + 4064;
+		}
+		*cs++ = 0;
+	}
+
+	/* Break if delta > ns */
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Otherwise back to recalculating delta */
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { INC, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	/* Load the address + inc & mask variables */
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	/* Increment the [ring] address for saving CS_TIMESTAMP */
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	/* Rewrite the batch buffer for the next execution */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static struct intel_execution_engine2
+pick_random_engine(int i915, const struct intel_execution_engine2 *not)
+{
+	const struct intel_execution_engine2 *e;
+	unsigned int count = 0;
+
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		count++;
+	}
+	if (!count)
+		return *not;
+
+	count = rand() % count;
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		if (!count--)
+			break;
+	}
+
+	return *e;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeline,
+		       uint32_t common,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC		(1 << 0)
+#define F_PACE		(1 << 1)
+#define F_FLOW		(1 << 2)
+#define F_HALF		(1 << 3)
+#define F_SOLO		(1 << 4)
+#define F_SPARE		(1 << 5)
+#define F_NEXT		(1 << 6)
+#define F_VIP		(1 << 7)
+#define F_RRUL		(1 << 8)
+#define F_SHARE		(1 << 9)
+#define F_PING		(1 << 10)
+#define F_THROTTLE	(1 << 11)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 obj[4] = {
+		{},
+		{
+			.handle = common ?: gem_create(i915, 4096),
+		},
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+	};
+	struct intel_execution_engine2 ping = *e;
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	int n;
+
+	srandom(getpid());
+	if (flags & F_PING)
+		ping = pick_random_engine(i915, e);
+	obj[0] = tslog_create(i915, ctx, &ping);
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(obj),
+			.buffer_count = 4,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			unsigned int seq;
+
+			seq = count;
+			if (flags & F_NEXT)
+				seq++;
+
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, seq);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+		close(execbuf.rsvd2);
+
+		execbuf.buffer_count = 1;
+		execbuf.batch_start_offset = 2048;
+		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
+		execbuf.rsvd2 = n_fence;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		if (flags & F_THROTTLE)
+			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
+
+		igt_swap(obj[2], obj[3]);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, obj[3].handle);
+	gem_close(i915, obj[2].handle);
+	if (obj[1].handle != common)
+		gem_close(i915, obj[1].handle);
+
+	gem_sync(i915, obj[0].handle);
+	if (out) {
+		uint32_t *map;
+
+		map = gem_mmap__device_coherent(i915, obj[0].handle,
+						0, 4096, PROT_WRITE);
+		for (n = 1; n < min(count, 512); n++) {
+			igt_assert(map[n]);
+			map[n - 1] = map[n] - map[n - 1];
+		}
+		qsort(map, --n, sizeof(*map), cmp_u32);
+		*out = ticks_to_ns(i915, map[n / 2]);
+		munmap(map, 4096);
+	}
+	gem_close(i915, obj[0].handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
+{
+	uint64_t cpu_time = 0;
+
+	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
+
+	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
+
+	return cpu_time;
+}
+
+static void timeline_advance(int timeline, int delay_ns)
+{
+	struct timespec tv = { .tv_nsec = delay_ns };
+	nanosleep(&tv, NULL);
+	sw_sync_timeline_inc(timeline, 1);
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+	uint32_t common = 0;
+
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	if (flags & F_SHARE)
+		common = gem_create(i915, 4095);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct rusage old_usage, usage;
+		uint64_t cpu_time, d_time;
+		unsigned long vip = -1;
+		struct timespec tv;
+		struct igt_mean m;
+
+		if (flags & F_PING) {
+			struct intel_execution_engine2 *ping;
+
+			__for_each_physical_engine(i915, ping) {
+				if (ping->flags == e->flags)
+					continue;
+
+				igt_fork(child, 1) {
+					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+					fair_child(i915, ctx, ping,
+						   child_ns / 8,
+						   -1, common,
+						   F_SOLO | F_PACE | F_SHARE,
+						   &result[nchild],
+						   NULL);
+
+					gem_context_destroy(i915, ctx);
+				}
+			}
+		}
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		getrusage(RUSAGE_CHILDREN, &old_usage);
+		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			if (flags & F_VIP && child == 0) {
+				gem_context_set_priority(i915, ctx, MAX_PRIO);
+				flags |= F_FLOW;
+			}
+			if (flags & F_RRUL && child == 0)
+				flags |= F_SOLO | F_FLOW | F_SYNC;
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeline, common, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--)
+			timeline_advance(timeline, fence_ns);
+
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child]))
+				timeline_advance(timeline, fence_ns);
+		}
+
+		igt_waitchildren();
+		close(timeline);
+
+		/* Are we running out of CPU time, and failing to submit frames? */
+		d_time = igt_nsec_elapsed(&tv);
+		getrusage(RUSAGE_CHILDREN, &usage);
+		cpu_time = d_cpu_time(&usage, &old_usage);
+		if (10 * cpu_time > 9 * d_time) {
+			if (nchild > 7)
+				break;
+
+			igt_skip_on_f(10 * cpu_time > 9 * d_time,
+				      "%.0f%% CPU usage, presuming capacity exceeded\n",
+				      100. * cpu_time / d_time);
+		}
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		if (flags & (F_VIP | F_RRUL))
+			vip = result[0];
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%2d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+		if (vip != -1) {
+			igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
+			igt_assert(4 * vip > 3 * fence_ns &&
+				   3 * vip < 4 * fence_ns);
+		}
+
+		/* May be slowed due to sheer volume of context switches */
+		igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
+			       igt_mean_get(&m) < 3 * fence_ns);
+
+		igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
+			   3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
+
+		igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);
+	}
+
+	munmap(result, 4096);
+	if (common)
+		gem_close(i915, common);
+}
+
+static void test_fairness(int i915, int timeout)
+{
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} fair[] = {
+		/*
+		 * none - maximal greed in each client
+		 *
+		 * Push as many frames from each client as fast as possible
+		 */
+		{ "none",       0 },
+		{ "none-vip",   F_VIP }, /* one vip client must meet deadlines */
+		{ "none-solo",  F_SOLO }, /* 1 batch per frame per client */
+		{ "none-share", F_SHARE }, /* read from a common buffer */
+		{ "none-rrul",  F_RRUL }, /* "realtime-response under load" */
+		{ "none-ping",  F_PING }, /* measure inter-engine fairness */
+
+		/*
+		 * throttle - original per client throttling
+		 *
+		 * Used for front-buffer rendering where there is no
+		 * external frame marker. Each client tries to keep only
+		 * 20ms of work submitted, though that measurement is
+		 * flawed...
+		 *
+		 * This is used by Xorg to try to maintain some resemblance
+		 * of input/output consistency when being fed a continuous
+		 * stream of X11 draw requests straight into scanout, where
+		 * the clients may submit the work faster than it can be drawn.
+		 */
+		{ "throttle",       F_THROTTLE },
+		{ "throttle-vip",   F_THROTTLE | F_VIP },
+		{ "throttle-solo",  F_THROTTLE | F_SOLO },
+		{ "throttle-share", F_THROTTLE | F_SHARE },
+		{ "throttle-rrul",  F_THROTTLE | F_RRUL },
+
+		/*
+		 * pace - mesa "submit double buffering"
+		 *
+		 * Submit a frame, wait for previous frame to start. This
+		 * prevents each client from getting too far ahead of its
+		 * rendering, maintaining a consistent input/output latency.
+		 */
+		{ "pace",       F_PACE },
+		{ "pace-solo",  F_PACE | F_SOLO},
+		{ "pace-share", F_PACE | F_SHARE},
+		{ "pace-ping",  F_PACE | F_SHARE | F_PING},
+
+		/* sync - only submit a frame at a time */
+		{ "sync",      F_SYNC },
+		{ "sync-vip",  F_SYNC | F_VIP },
+		{ "sync-solo", F_SYNC | F_SOLO },
+
+		/* flow - synchronise execution against the clock (vblank) */
+		{ "flow",       F_PACE | F_FLOW },
+		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
+		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },
+
+		/* next - submit ahead of the clock (vblank double buffering) */
+		{ "next",       F_PACE | F_FLOW | F_NEXT },
+		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
+		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },
+
+		/* spare - underutilise by a single client timeslice */
+		{ "spare", F_PACE | F_FLOW | F_SPARE },
+
+		/* half - run at half pace (submit 16ms of work every 32ms) */
+		{ "half",  F_PACE | F_FLOW | F_HALF },
+
+		{}
+	};
+
+	for (typeof(*fair) *f = fair; f->name; f++) {
+		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
+			const struct intel_execution_engine2 *e;
+
+			igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+
+			__for_each_physical_engine(i915, e) {
+				if (!gem_class_can_store_dword(i915, e->class))
+					continue;
+
+				igt_dynamic_f("%s", e->name)
+					fairness(i915, e, timeout, f->flags);
+			}
+		}
+	}
+}
+
+static uint32_t read_ctx_timestamp(int i915,
+				   uint32_t ctx,
+				   const struct intel_execution_engine2 *e)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = gem_create(i915, 4096),
+		.offset = 32 << 20,
+		.relocs_ptr = to_user_pointer(&reloc),
+		.relocation_count = 1,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.flags = e->flags,
+		.rsvd1 = ctx,
+	};
+#define RUNTIME (base + 0x3a8)
+	uint32_t *map, *cs;
+	uint32_t ts;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, obj.handle,
+					     0, 4096, PROT_WRITE);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = RUNTIME;
+	memset(&reloc, 0, sizeof(reloc));
+	reloc.target_handle = obj.handle;
+	reloc.presumed_offset = obj.offset;
+	reloc.offset = offset_in_page(cs);
+	reloc.delta = 4000;
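+	/* the SRM write lands at byte 4000, i.e. dword 1000; see map[1000] below */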
+	*cs++ = obj.offset + 4000;
+	*cs++ = obj.offset >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+	gem_close(i915, obj.handle);
+
+	ts = map[1000];
+	munmap(map, 4096);
+
+	return ts;
+}
+
+static void fairslice(int i915, const struct intel_execution_engine2 *e)
+{
+	igt_spin_t *spin[3];
+	uint32_t ctx[3];
+	uint32_t ts[3];
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		ctx[i] = gem_context_clone_with_engines(i915, 0);
+		spin[i] = igt_spin_new(i915, .ctx = ctx[i], .engine = e->flags);
+	}
+
+	sleep(2); /* over the course of many timeslices */
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		igt_assert(gem_bo_busy(i915, spin[i]->handle));
+		igt_spin_end(spin[i]);
+
+		ts[i] = read_ctx_timestamp(i915, ctx[i], e);
+	}
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		igt_spin_free(i915, spin[i]);
+		gem_context_destroy(i915, ctx[i]);
+	}
+
+	qsort(ts, 3, sizeof(*ts), cmp_u32);
+	igt_info("%s: [%.1f, %.1f] ms\n", e->name,
+		 1e-6 * ticks_to_ns(i915, ts[0]),
+		 1e-6 * ticks_to_ns(i915, ts[2]));
+
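+	/* fair timeslicing: the smallest runtime share must be at least 3/4 of the largest */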
+	igt_assert(ts[0] && ts[2] > ts[0]);
+	igt_assert(4 * ts[0] > 3 * ts[2]);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2567,6 +3328,25 @@ igt_main
 		test_each_engine("lateslice", fd, e)
 			lateslice(fd, e->flags);
 
+		igt_subtest_group {
+			igt_fixture {
+				igt_require(gem_scheduler_has_semaphores(fd));
+				igt_require(gem_scheduler_has_preemption(fd));
+				igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8);
+			}
+
+			test_each_engine("fairslice", fd, e)
+				fairslice(fd, e);
+
+			igt_subtest("fairslice-all")  {
+				__for_each_physical_engine(fd, e) {
+					igt_fork(child, 1)
+						fairslice(fd, e);
+				}
+				igt_waitchildren();
+			}
+		}
+
 		test_each_engine("submit-early-slice", fd, e)
 			submit_slice(fd, e, EARLY_SUBMIT);
 		test_each_engine("submit-golden-slice", fd, e)
@@ -2595,6 +3375,8 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_fairness(fd, 2);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-06-09 12:45 Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-09 12:45 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
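
A note for reviewers: the per-child medians are reduced with simple
quartile arithmetic before asserting. A self-contained sketch along
the lines of that reduction (sample values purely hypothetical,
reusing the lo/hi indexing from fairness()):

	/* Sketch: reduce per-child medians into median and iqr. */
	#include <stdio.h>
	#include <stdlib.h>

	static int cmp_ul(const void *A, const void *B)
	{
		const unsigned long *a = A, *b = B;
		return (*a > *b) - (*a < *b);
	}

	int main(void)
	{
		/* hypothetical per-child median frame intervals, in ns */
		unsigned long result[] = { 16500000, 16700000, 17100000 };
		const int nchild = 3;
		const int lo = nchild / 4;
		const int hi = (3 * nchild + 3) / 4 - 1;

		qsort(result, nchild, sizeof(*result), cmp_ul);

		/* fair if twice the iqr spread stays under the median */
		printf("median=%.1fms, iqr=[%.1f, %.1f]ms: %s\n",
		       1e-6 * result[nchild / 2],
		       1e-6 * result[lo], 1e-6 * result[hi],
		       2 * (result[hi] - result[lo]) < result[nchild / 2] ?
		       "fair" : "unfair");
		return 0;
	}

With an odd nchild the median is simply the middle element, which is
why the test forks n - 1 children per pass.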
 tests/i915/gem_exec_schedule.c | 699 +++++++++++++++++++++++++++++++++
 1 file changed, 699 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..b3a1fedaa 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -29,6 +29,7 @@
 #include <sys/poll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sched.h>
 #include <signal.h>
@@ -2495,6 +2496,666 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define RUNTIME (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(START_TS);
+
+	while (offset_in_page(cs) & 63)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_IMM;
+		if (use_64b) {
+			*cs++ = addr + 4064;
+			*cs++ = addr >> 32;
+		} else {
+			*cs++ = 0;
+			*cs++ = addr + 4064;
+		}
+		*cs++ = 0;
+	}
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
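+	/* not yet expired: jump back to the MI_ARB_CHECK and test again */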
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { ONE, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ONE));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
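+	/*
+	 * Self-modifying batch: write the incremented pointer back over the
+	 * timestamp SRM's target address and over the LRI immediate above,
+	 * so each run logs CS_TIMESTAMP into the next dword. The mask wraps
+	 * the log within the first 2KiB of the page (512 entries).
+	 */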
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static struct intel_execution_engine2
+pick_random_engine(int i915, const struct intel_execution_engine2 *not)
+{
+	const struct intel_execution_engine2 *e;
+	unsigned int count = 0;
+
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		count++;
+	}
+	if (!count)
+		return *not;
+
+	count = rand() % count;
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		if (!count--)
+			break;
+	}
+
+	return *e;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeline,
+		       uint32_t common,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC  (1 << 0)
+#define F_PACE  (1 << 1)
+#define F_FLOW  (1 << 2)
+#define F_HALF  (1 << 3)
+#define F_SOLO  (1 << 4)
+#define F_SPARE (1 << 5)
+#define F_NEXT  (1 << 6)
+#define F_VIP   (1 << 7)
+#define F_RRUL  (1 << 8)
+#define F_SHARE (1 << 9)
+#define F_PING  (1 << 10)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 obj[4] = {
+		{},
+		{
+			.handle = common ?: gem_create(i915, 4096),
+		},
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+	};
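+	/*
+	 * The last object in the array is the executed batch: obj[3] runs
+	 * this frame, and obj[2]/obj[3] are swapped each frame to
+	 * double-buffer the delay batches.
+	 */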
+	struct intel_execution_engine2 ping = *e;
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	int n;
+
+	srandom(getpid());
+	if (flags & F_PING)
+		ping = pick_random_engine(i915, e);
+	obj[0] = tslog_create(i915, ctx, &ping);
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(obj),
+			.buffer_count = 4,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			unsigned int seq;
+
+			seq = count;
+			if (flags & F_NEXT)
+				seq++;
+
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, seq);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+		close(execbuf.rsvd2);
+
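+		/*
+		 * Log a CS_TIMESTAMP on the ping engine once this frame's
+		 * out-fence signals; the tslog batch lives at offset 2048
+		 * within obj[0].
+		 */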
+		execbuf.buffer_count = 1;
+		execbuf.batch_start_offset = 2048;
+		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
+		execbuf.rsvd2 = n_fence;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		igt_swap(obj[2], obj[3]);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, obj[3].handle);
+	gem_close(i915, obj[2].handle);
+	if (obj[1].handle != common)
+		gem_close(i915, obj[1].handle);
+
+	gem_sync(i915, obj[0].handle);
+	if (out) {
+		uint32_t *map;
+
+		map = gem_mmap__device_coherent(i915, obj[0].handle,
+						0, 4096, PROT_WRITE);
+		for (n = 1; n < min(count, 512); n++) {
+			igt_assert(map[n]);
+			map[n - 1] = map[n] - map[n - 1];
+		}
+		qsort(map, --n, sizeof(*map), cmp_u32);
+		*out = ticks_to_ns(i915, map[n / 2]);
+		munmap(map, 4096);
+	}
+	gem_close(i915, obj[0].handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
+{
+	uint64_t cpu_time = 0;
+
+	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
+
+	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
+
+	return cpu_time;
+}
+
+static void timeline_advance(int timeline, int delay_ns)
+{
+	struct timespec tv = { .tv_nsec = delay_ns };
+	nanosleep(&tv, NULL);
+	sw_sync_timeline_inc(timeline, 1);
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+	uint32_t common = 0;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	if (flags & F_SHARE)
+		common = gem_create(i915, 4095);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct rusage old_usage, usage;
+		uint64_t cpu_time, d_time;
+		unsigned long vip = -1;
+		struct timespec tv;
+		struct igt_mean m;
+
+		if (flags & F_PING) {
+			struct intel_execution_engine2 *ping;
+
+			__for_each_physical_engine(i915, ping) {
+				if (ping->flags == e->flags)
+					continue;
+
+				igt_fork(child, 1) {
+					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+					fair_child(i915, ctx, ping,
+						   child_ns / 8,
+						   -1, common,
+						   F_SOLO | F_PACE | F_SHARE,
+						   &result[nchild],
+						   NULL);
+
+					gem_context_destroy(i915, ctx);
+				}
+			}
+		}
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		getrusage(RUSAGE_CHILDREN, &old_usage);
+		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			if (flags & F_VIP && child == 0) {
+				gem_context_set_priority(i915, ctx, MAX_PRIO);
+				flags |= F_FLOW;
+			}
+			if (flags & F_RRUL && child == 0)
+				flags |= F_SOLO | F_FLOW | F_SYNC;
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeline, common, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--)
+			timeline_advance(timeline, fence_ns);
+
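+		/*
+		 * Signal the children to stop, then keep advancing the
+		 * timeline so that anyone blocked on a fence can drain.
+		 */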
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child]))
+				timeline_advance(timeline, fence_ns);
+		}
+
+		igt_waitchildren();
+		close(timeline);
+
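+		/* Are we running out of CPU time, and failing to submit frames? */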
+		d_time = igt_nsec_elapsed(&tv);
+		getrusage(RUSAGE_CHILDREN, &usage);
+		cpu_time = d_cpu_time(&usage, &old_usage);
+		if (10 * cpu_time > 9 * d_time) {
+			if (nchild > 7)
+				break;
+
+			igt_skip_on_f(10 * cpu_time > 9 * d_time,
+				      "%.0f%% CPU usage, presuming capacity exceeded\n",
+				      100. * cpu_time / d_time);
+		}
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		if (flags & (F_VIP | F_RRUL))
+			vip = result[0];
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+		if (vip != -1) {
+			igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
+			igt_assert(4 * vip > 3 * fence_ns &&
+				   3 * vip < 4 * fence_ns);
+		}
+
+		/* May be slowed due to sheer volume of context switches */
+		igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
+			       igt_mean_get(&m) < 3 * fence_ns);
+
+		igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
+			   3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
+
+		igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);
+	}
+
+	munmap(result, 4096);
+	if (common)
+		gem_close(i915, common);
+}
+
+static uint32_t read_ctx_timestamp(int i915,
+				   uint32_t ctx,
+				   const struct intel_execution_engine2 *e)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = gem_create(i915, 4096),
+		.offset = 32 << 20,
+		.relocs_ptr = to_user_pointer(&reloc),
+		.relocation_count = 1,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.flags = e->flags,
+		.rsvd1 = ctx,
+	};
+#define RUNTIME (base + 0x3a8)
+	uint32_t *map, *cs;
+	uint32_t ts;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, obj.handle,
+					     0, 4096, PROT_WRITE);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = RUNTIME;
+	memset(&reloc, 0, sizeof(reloc));
+	reloc.target_handle = obj.handle;
+	reloc.presumed_offset = obj.offset;
+	reloc.offset = offset_in_page(cs);
+	reloc.delta = 4000;
+	*cs++ = obj.offset + 4000;
+	*cs++ = obj.offset >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+	gem_close(i915, obj.handle);
+
+	ts = map[1000];
+	munmap(map, 4096);
+
+	return ts;
+}
+
+static void fairslice(int i915, const struct intel_execution_engine2 *e)
+{
+	igt_spin_t *spin[3];
+	uint32_t ctx[3];
+	uint32_t ts[3];
+
+	igt_require(gem_scheduler_has_semaphores(i915));
+	igt_require(gem_scheduler_has_preemption(i915));
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		ctx[i] = gem_context_clone_with_engines(i915, 0);
+		spin[i] = igt_spin_new(i915, .ctx = ctx[i], .engine = e->flags);
+	}
+
+	sleep(2); /* over the course of many timeslices */
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		igt_assert(gem_bo_busy(i915, spin[i]->handle));
+		igt_spin_end(spin[i]);
+
+		ts[i] = read_ctx_timestamp(i915, ctx[i], e);
+	}
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		igt_spin_free(i915, spin[i]);
+		gem_context_destroy(i915, ctx[i]);
+	}
+
+	qsort(ts, 3, sizeof(*ts), cmp_u32);
+	igt_info("%s: [%.1f, %.1f] ms\n", e->name,
+		 1e-6 * ticks_to_ns(i915, ts[0]),
+		 1e-6 * ticks_to_ns(i915, ts[2]));
+
+	igt_assert(ts[0] && ts[2] > ts[0]);
+	igt_assert(4 * ts[0] > 3 * ts[2]);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2561,6 +3222,9 @@ igt_main
 		test_each_engine("lateslice", fd, e)
 			lateslice(fd, e->flags);
 
+		test_each_engine("fairslice", fd, e)
+			fairslice(fd, e);
+
 		test_each_engine("submit-early-slice", fd, e)
 			submit_slice(fd, e, EARLY_SUBMIT);
 		test_each_engine("submit-golden-slice", fd, e)
@@ -2589,6 +3253,41 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fair-none", fd, e)
+			fairness(fd, e, 2, 0);
+		test_each_engine_store("fair-none-vip", fd, e)
+			fairness(fd, e, 2, F_VIP);
+		test_each_engine_store("fair-none-share", fd, e)
+			fairness(fd, e, 2, F_SHARE);
+		test_each_engine_store("fair-none-rrul", fd, e)
+			fairness(fd, e, 2, F_RRUL);
+		test_each_engine_store("fair-none-ping", fd, e)
+			fairness(fd, e, 2, F_PING);
+		test_each_engine_store("fair-pace", fd, e)
+			fairness(fd, e, 2, F_PACE);
+		test_each_engine_store("fair-pace-share", fd, e)
+			fairness(fd, e, 2, F_PACE | F_SHARE);
+		test_each_engine_store("fair-pace-ping", fd, e)
+			fairness(fd, e, 2, F_PACE | F_SHARE | F_PING);
+		test_each_engine_store("fair-sync", fd, e)
+			fairness(fd, e, 2, F_SYNC);
+		test_each_engine_store("fair-sync-vip", fd, e)
+			fairness(fd, e, 2, F_SYNC | F_VIP);
+		test_each_engine_store("fair-solo", fd, e)
+			fairness(fd, e, 2, F_SYNC | F_SOLO);
+		test_each_engine_store("fair-flow", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW);
+		test_each_engine_store("fair-flow-ping", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_PING);
+		test_each_engine_store("fair-next", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_NEXT);
+		test_each_engine_store("fair-next-share", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_NEXT | F_SHARE);
+		test_each_engine_store("fair-spare", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_SPARE);
+		test_each_engine_store("fair-half", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_HALF);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
  2020-06-02  8:22 Chris Wilson
  2020-06-02  8:32 ` Chris Wilson
@ 2020-06-02  8:50 ` Chris Wilson
  1 sibling, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-02  8:50 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
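
For anyone unfamiliar with sw_sync: F_FLOW paces each child against a
software timeline that the parent bumps once per frame, standing in
for vblank. A minimal sketch of the producer side, using the helpers
from igt's lib/sw_sync.h (error handling elided; a consumer would
pass the fence as I915_EXEC_FENCE_IN):

	/* Sketch: pacing consumers with a sw_sync timeline, one tick per frame. */
	#include <time.h>
	#include <unistd.h>
	#include "sw_sync.h"

	static void software_vblank(int nframes, long frame_ns)
	{
		int timeline = sw_sync_timeline_create();

		for (int seq = 1; seq <= nframes; seq++) {
			/* consumers wait on fences created at this seq */
			int fence = sw_sync_timeline_create_fence(timeline, seq);
			close(fence); /* would be handed to an execbuf instead */

			struct timespec tv = { .tv_nsec = frame_ns };
			nanosleep(&tv, NULL);
			sw_sync_timeline_inc(timeline, 1); /* signals fences <= seq */
		}

		close(timeline);
	}

Bumping by one per sleep means fences signal strictly in order, so a
child that misses a frame simply queues behind the next tick.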
 tests/i915/gem_exec_schedule.c | 442 +++++++++++++++++++++++++++++++++
 1 file changed, 442 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..ced9ee571 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -2495,6 +2495,433 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
+
+	if (offset_in_page(cs) & 4)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { ONE, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ONE));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeout,
+		       int timeline,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC  (1 << 0)
+#define F_PACE  (1 << 1)
+#define F_FLOW  (1 << 2)
+#define F_HALF  (1 << 3)
+#define F_SOLO  (1 << 4)
+#define F_SPARE (1 << 8)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 prev =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 next =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 ts = tslog_create(i915, ctx, e);
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	uint32_t *map;
+	int n;
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(&next),
+			.buffer_count = 1,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, count);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+
+		execbuf.buffers_ptr = to_user_pointer(&ts);
+		execbuf.batch_start_offset = 2048;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+		close(execbuf.rsvd2);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		igt_swap(prev, next);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, next.handle);
+	gem_close(i915, prev.handle);
+
+	gem_sync(i915, ts.handle);
+	map = gem_mmap__device_coherent(i915, ts.handle, 0, 4096, PROT_WRITE);
+	for (n = 1; n < min(count, 512); n++) {
+		igt_assert(map[n]);
+		map[n - 1] = map[n] - map[n - 1];
+	}
+	qsort(map, --n, sizeof(*map), cmp_u32);
+	*out = ticks_to_ns(i915, map[n / 2]);
+	munmap(map, 4096);
+
+	gem_close(i915, ts.handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+	igt_require(e->class == I915_ENGINE_CLASS_RENDER || /* XXX excuse me? */
+		    intel_gen(intel_get_drm_devid(i915)) < 11);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 16; n <<= 1) {
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct igt_mean m;
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeout, timeline, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--) {
+			struct timespec tv = { .tv_nsec = fence_ns };
+			nanosleep(&tv, NULL);
+			sw_sync_timeline_inc(timeline, 1);
+		}
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child])) {
+				struct timespec tv = { .tv_nsec = fence_ns };
+				nanosleep(&tv, NULL);
+				sw_sync_timeline_inc(timeline, 1);
+			}
+		}
+		igt_waitchildren();
+		close(timeline);
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+#if 0
+		/* Mean within 10% of target */
+		igt_assert( 9 * igt_mean_get(&m) > 10 * frame_ns &&
+			   10 * igt_mean_get(&m) <  9 * frame_ns);
+
+		/* Variance [inter-quartile range] is less than 33% of median */
+		igt_assert(3 * result[hi] - result[lo] < result[nchild / 2]);
+#endif
+	}
+
+	munmap(result, 4096);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2589,6 +3016,21 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fair-none", fd, e)
+			fairness(fd, e, 2, 0);
+		test_each_engine_store("fair-pace", fd, e)
+			fairness(fd, e, 2, F_PACE);
+		test_each_engine_store("fair-sync", fd, e)
+			fairness(fd, e, 2, F_SYNC);
+		test_each_engine_store("fair-solo", fd, e)
+			fairness(fd, e, 2, F_SYNC | F_SOLO);
+		test_each_engine_store("fair-flow", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW);
+		test_each_engine_store("fair-spare", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_SPARE);
+		test_each_engine_store("fair-half", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_HALF);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0.rc2


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
  2020-06-02  8:22 Chris Wilson
@ 2020-06-02  8:32 ` Chris Wilson
  2020-06-02  8:50 ` Chris Wilson
  1 sibling, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-02  8:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
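
A note on the STOREINV in delay(): MI_CONDITIONAL_BATCH_BUFFER_END
compares in a single direction, which is presumably why the batch
stores the bitwise inverse of the elapsed ticks and compares it
against the inverse of the target; inverting both sides flips the
comparison. The identity in plain C, as an illustration only:

	/* Sketch: ~x < ~y holds exactly when x > y, for unsigned x, y. */
	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		const uint32_t target = 500;

		for (uint32_t elapsed = 0; elapsed < 1000; elapsed++)
			assert((~elapsed < ~target) == (elapsed > target));

		return 0;
	}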
 tests/i915/gem_exec_schedule.c | 440 +++++++++++++++++++++++++++++++++
 1 file changed, 440 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..911379cad 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -2495,6 +2495,431 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
+
+	if (offset_in_page(cs) & 4)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { ONE, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ONE));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeout,
+		       int timeline,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC  (1 << 0)
+#define F_PACE  (1 << 1)
+#define F_FLOW  (1 << 2)
+#define F_HALF  (1 << 3)
+#define F_SOLO  (1 << 4)
+#define F_SPARE (1 << 8)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 prev =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 next =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 ts = tslog_create(i915, ctx, e);
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	uint32_t *map;
+	int n;
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(&next),
+			.buffer_count = 1,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, count);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+
+		execbuf.buffers_ptr = to_user_pointer(&ts);
+		execbuf.batch_start_offset = 2048;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+		close(execbuf.rsvd2);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		igt_swap(prev, next);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, next.handle);
+	gem_close(i915, prev.handle);
+
+	gem_sync(i915, ts.handle);
+	map = gem_mmap__device_coherent(i915, ts.handle, 0, 4096, PROT_WRITE);
+	for (n = 1; n < min(count, 512); n++) {
+		igt_assert(map[n]);
+		map[n - 1] = map[n] - map[n - 1];
+	}
+	qsort(map, --n, sizeof(*map), cmp_u32);
+	*out = ticks_to_ns(i915, map[n / 2]);
+	munmap(map, 4096);
+
+	gem_close(i915, ts.handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 16; n <<= 1) {
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct igt_mean m;
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeout, timeline, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--) {
+			struct timespec tv = { .tv_nsec = fence_ns };
+			nanosleep(&tv, NULL);
+			sw_sync_timeline_inc(timeline, 1);
+		}
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child])) {
+				struct timespec tv = { .tv_nsec = fence_ns };
+				nanosleep(&tv, NULL);
+				sw_sync_timeline_inc(timeline, 1);
+			}
+		}
+		igt_waitchildren();
+		close(timeline);
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+#if 0
+		/* Mean within 10% of target */
+		igt_assert( 9 * igt_mean_get(&m) > 10 * frame_ns &&
+			   10 * igt_mean_get(&m) <  9 * frame_ns);
+
+		/* Variance [inter-quartile range] is less than 33% of median */
+		igt_assert(3 * result[hi] - result[lo] < result[nchild / 2]);
+#endif
+	}
+
+	munmap(result, 4096);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2589,6 +3014,21 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fair-none", fd, e)
+			fairness(fd, e, 2, 0);
+		test_each_engine_store("fair-pace", fd, e)
+			fairness(fd, e, 2, F_PACE);
+		test_each_engine_store("fair-sync", fd, e)
+			fairness(fd, e, 2, F_SYNC);
+		test_each_engine_store("fair-solo", fd, e)
+			fairness(fd, e, 2, F_SYNC | F_SOLO);
+		test_each_engine_store("fair-flow", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW);
+		test_each_engine_store("fair-spare", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_SPARE);
+		test_each_engine_store("fair-half", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_HALF);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0.rc2


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-06-02  8:22 Chris Wilson
  2020-06-02  8:32 ` Chris Wilson
  2020-06-02  8:50 ` Chris Wilson
  0 siblings, 2 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-02  8:22 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in each client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
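For reference, the pass criteria sketched in the #if 0 block below
reduce to two order statistics over the per-child results. A minimal
standalone version of that check (plain C, illustrative only; the 10%
and 33% thresholds are the values used in the test):

#include <assert.h>
#include <stdlib.h>

static int cmp_ul(const void *A, const void *B)
{
	const unsigned long *a = A, *b = B;

	return *a < *b ? -1 : *a > *b;
}

/* fail unless the clients agree with each other and with the target */
static void check_fair(unsigned long *result, int nchild,
		       unsigned long frame_ns)
{
	const int lo = nchild / 4;
	const int hi = (3 * nchild + 3) / 4 - 1;
	unsigned long median, iqr;

	qsort(result, nchild, sizeof(*result), cmp_ul);
	median = result[nchild / 2];
	iqr = result[hi] - result[lo];

	/* median frame time within ~10% of the frame target */
	assert(10 * median > 9 * frame_ns && 9 * median < 10 * frame_ns);

	/* inter-client spread (IQR) below a third of the median */
	assert(3 * iqr < median);
}
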
 tests/i915/gem_exec_schedule.c | 436 +++++++++++++++++++++++++++++++++
 1 file changed, 436 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..3045eeb62 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -2495,6 +2495,429 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
+
+	if (offset_in_page(cs) & 4)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
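+	/* NOW_TS now holds ~(now - start), matching the inverted target below */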
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { ONE, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
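+	/* the mask wraps the log pointer within the first 2KiB (512 samples) */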
+
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ONE));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeout,
+		       int timeline,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC (1 << 0)
+#define F_PACE (1 << 1)
+#define F_FLOW (1 << 2)
+#define F_HALF (1 << 3)
+#define F_SOLO (1 << 4)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 prev =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 next =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 ts = tslog_create(i915, ctx, e);
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	uint32_t *map;
+	int n;
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(&next),
+			.buffer_count = 1,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
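+		/* gate each frame on the shared sw_sync timeline (F_FLOW / F_HALF) */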
+		if (flags & (F_FLOW | F_HALF)) {
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, count);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+
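+		/* append a CS_TIMESTAMP sample marking the end of this frame */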
+		execbuf.buffers_ptr = to_user_pointer(&ts);
+		execbuf.batch_start_offset = 2048;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+		close(execbuf.rsvd2);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		igt_swap(prev, next);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, next.handle);
+	gem_close(i915, prev.handle);
+
+	gem_sync(i915, ts.handle);
+	map = gem_mmap__device_coherent(i915, ts.handle, 0, 4096, PROT_WRITE);
+	for (n = 1; n < min(count, 512); n++) {
+		igt_assert(map[n]);
+		map[n - 1] = map[n] - map[n - 1];
+	}
+	qsort(map, --n, sizeof(*map), cmp_u32);
+	*out = ticks_to_ns(i915, map[n / 2]);
+	munmap(map, 4096);
+
+	gem_close(i915, ts.handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 16; n <<= 1) {
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct igt_mean m;
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			fair_child(i915, ctx, e, frame_ns / nchild,
+				   timeout, timeline, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--) {
+			struct timespec tv = { .tv_nsec = fence_ns };
+			nanosleep(&tv, NULL);
+			sw_sync_timeline_inc(timeline, 1);
+		}
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child])) {
+				struct timespec tv = { .tv_nsec = fence_ns };
+				nanosleep(&tv, NULL);
+				sw_sync_timeline_inc(timeline, 1);
+			}
+		}
+		igt_waitchildren();
+		close(timeline);
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+#if 0
+		/* Mean within 10% of target */
+		igt_assert(10 * igt_mean_get(&m) >  9 * frame_ns &&
+			    9 * igt_mean_get(&m) < 10 * frame_ns);
+
+		/* Variance [inter-quartile range] is less than 33% of median */
+		igt_assert(3 * (result[hi] - result[lo]) < result[nchild / 2]);
+#endif
+	}
+
+	munmap(result, 4096);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2589,6 +3012,19 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fair-none", fd, e)
+			fairness(fd, e, 2, 0);
+		test_each_engine_store("fair-pace", fd, e)
+			fairness(fd, e, 2, F_PACE);
+		test_each_engine_store("fair-sync", fd, e)
+			fairness(fd, e, 2, F_SYNC);
+		test_each_engine_store("fair-flow", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW);
+		test_each_engine_store("fair-half", fd, e)
+			fairness(fd, e, 2, F_PACE | F_HALF);
+		test_each_engine_store("fair-solo", fd, e)
+			fairness(fd, e, 2, F_SYNC | F_SOLO);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0.rc2


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-06-02  0:26 Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-02  0:26 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in each client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
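Each child now reports the median interval between frames as sampled
by the GPU itself, from the CS_TIMESTAMP ring logged after every
frame. In outline (an illustrative sketch, not the test code; hz
stands for the timestamp frequency returned by the GETPARAM below):

#include <stdint.h>
#include <stdlib.h>

static int cmp_u32(const void *A, const void *B)
{
	const uint32_t *a = A, *b = B;

	return *a < *b ? -1 : *a > *b;
}

/* median delta of n consecutive 32b timestamp samples, in ns */
static uint64_t median_frame_ns(uint32_t *ts, int n, uint64_t hz)
{
	for (int i = 1; i < n; i++)
		ts[i - 1] = ts[i] - ts[i - 1]; /* u32 wraparound is benign */

	qsort(ts, --n, sizeof(*ts), cmp_u32);
	return (ts[n / 2] * 1000000000ull + hz - 1) / hz;
}
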
 tests/i915/gem_exec_schedule.c | 418 +++++++++++++++++++++++++++++++++
 1 file changed, 418 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..d1121ecd2 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -2495,6 +2495,417 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
+
+	if (offset_in_page(cs) & 4)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { ONE, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ONE));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeout,
+		       int timeline,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_PACING 0x1
+#define F_EXTERNAL 0x2
+{
+	const int batches_per_frame = 3;
+	struct drm_i915_gem_exec_object2 prev =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 next =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 ts = tslog_create(i915, ctx, e);
+	struct timespec tv = {};
+	unsigned long count = 0;
+	int p_fence = -1, n_fence = -1;
+	uint32_t *map;
+	int n;
+
+	igt_nsec_elapsed(&tv);
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(&next),
+			.buffer_count = 1,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_EXTERNAL) {
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, count);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+
+		execbuf.buffers_ptr = to_user_pointer(&ts);
+		execbuf.batch_start_offset = 2048;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACING && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+		close(execbuf.rsvd2);
+
+		igt_swap(prev, next);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	gem_sync(i915, prev.handle);
+	close(p_fence);
+
+	gem_close(i915, next.handle);
+	gem_close(i915, prev.handle);
+
+	map = gem_mmap__device_coherent(i915, ts.handle, 0, 4096, PROT_WRITE);
+	for (n = 1; n < min(count, 512); n++)
+		map[n - 1] = map[n] - map[n - 1];
+	qsort(map, --n, sizeof(*map), cmp_u32);
+	*out = ticks_to_ns(i915, map[n / 2]);
+	munmap(map, 4096);
+
+	gem_close(i915, ts.handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	unsigned long *result;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 16; n <<= 1) {
+		int timeline = sw_sync_timeline_create();
+		int nframes = timeout * NSEC_PER_SEC / frame_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct igt_mean m;
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			fair_child(i915, ctx, e, frame_ns / nchild,
+				   timeout, timeline, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nframes--) {
+			struct timespec tv = { .tv_nsec = frame_ns };
+			nanosleep(&tv, NULL);
+			sw_sync_timeline_inc(timeline, 1);
+		}
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child])) {
+				struct timespec tv = { .tv_nsec = frame_ns };
+				nanosleep(&tv, NULL);
+				sw_sync_timeline_inc(timeline, 1);
+			}
+		}
+		igt_waitchildren();
+		close(timeline);
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+#if 0
+		/* Mean within 10% of target */
+		igt_assert(10 * igt_mean_get(&m) >  9 * frame_ns &&
+			    9 * igt_mean_get(&m) < 10 * frame_ns);
+
+		/* Variance [inter-quartile range] is less than 33% of median */
+		igt_assert(3 * (result[hi] - result[lo]) < result[nchild / 2]);
+#endif
+	}
+
+	munmap(result, 4096);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2589,6 +3000,13 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fair-none", fd, e)
+			fairness(fd, e, 2, 0);
+		test_each_engine_store("fair-pace", fd, e)
+			fairness(fd, e, 2, F_PACING);
+		test_each_engine_store("fair-sync", fd, e)
+			fairness(fd, e, 2, F_PACING | F_EXTERNAL);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0.rc2


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
  2020-06-01 19:08 Chris Wilson
  2020-06-01 19:53 ` Chris Wilson
@ 2020-06-01 21:17 ` Chris Wilson
  1 sibling, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-01 21:17 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in each client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
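The new iqr bounds pick out the first and third quartiles of the
sorted per-child results. Worked through for the client counts used
here (a sanity check of the index arithmetic, not test code):

/* iqr_lo = nchild / 4, iqr_hi = (3 * nchild + 3) / 4 - 1 */
/* nchild =  1: lo = 0, hi =  0 (degenerate, iqr == 0) */
/* nchild =  3: lo = 0, hi =  2 (the whole range)      */
/* nchild =  7: lo = 1, hi =  5                        */
/* nchild = 15: lo = 3, hi = 11                        */
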
 tests/i915/gem_exec_schedule.c | 253 +++++++++++++++++++++++++++++++++
 1 file changed, 253 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..d58d926b1 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -2495,6 +2495,254 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  1000000000);
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void async_delay(int i915,
+			const struct intel_execution_engine2 *e,
+			uint32_t handle,
+			uint64_t addr,
+			uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
+
+	if (offset_in_page(cs) & 4)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+timed_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	async_delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeout,
+		       unsigned int flags,
+		       unsigned long *out)
+#define F_PACING 0x1
+{
+	const int batches_per_frame = 3;
+	struct drm_i915_gem_exec_object2 prev =
+		timed_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 next =
+		timed_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct timespec tv = {};
+	unsigned long count = 0;
+	int p_fence = -1, n_fence = -1;
+
+	igt_nsec_elapsed(&tv);
+	igt_until_timeout(timeout) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(&next),
+			.buffer_count = 1,
+			.rsvd1 = ctx,
+			.flags = e->flags,
+		};
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~I915_EXEC_FENCE_OUT;
+		for (int n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACING && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		igt_swap(prev, next);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	gem_sync(i915, prev.handle);
+	*out = igt_nsec_elapsed(&tv) / count;
+	close(p_fence);
+
+	gem_close(i915, next.handle);
+	gem_close(i915, prev.handle);
+}
+
+static int ul_cmp(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	unsigned long *result;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 16; n <<= 1) {
+		const int nchild = n - 1; /* odd for easy medians */
+		const int iqr_lo = nchild / 4;
+		const int iqr_hi = (3 * nchild + 3) / 4 - 1;
+		unsigned long iqr;
+
+		memset(result, 0, nchild * sizeof(result[0]));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			fair_child(i915, ctx, e, frame_ns / nchild,
+				   timeout, flags, &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+		igt_waitchildren();
+
+		qsort(result, nchild, sizeof(*result), ul_cmp);
+		igt_info("%d clients, range: [%lu, %lu], iqr: [%lu, %lu], median: %lu\n",
+			 nchild,
+			 result[0], result[nchild - 1],
+			 result[iqr_lo], result[iqr_hi],
+			 result[nchild / 2]);
+
+		/* Median within 10% of target */
+		igt_assert(10 * result[nchild / 2] > 9 * frame_ns &&
+			   9 * result[nchild / 2] < 10 * frame_ns);
+
+		/* Variance [inter-quartile range] is less than 33% of median */
+		iqr = result[iqr_hi] - result[iqr_lo];
+		igt_assert(3 * iqr < result[nchild / 2]);
+	}
+
+	munmap(result, 4096);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2589,6 +2837,11 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fairness", fd, e)
+			fairness(fd, e, 3, F_PACING);
+		test_each_engine_store("unfairness", fd, e)
+			fairness(fd, e, 3, 0);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0.rc2


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
  2020-06-01 19:08 Chris Wilson
@ 2020-06-01 19:53 ` Chris Wilson
  2020-06-01 21:17 ` Chris Wilson
  1 sibling, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-01 19:53 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in each client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
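Pacing keeps at most two frames in flight: queue frame N, then block
on the out-fence of frame N-1 before queuing N+1. The shape of that
loop, reduced to its fd handling (an illustrative sketch only;
submit_frame() is a stand-in for the execbuf calls in the patch):

#include <poll.h>
#include <unistd.h>

int submit_frame(void); /* stand-in: returns an out-fence fd */

static void pace(int nframes)
{
	int p_fence = -1, n_fence;

	for (int i = 0; i < nframes; i++) {
		n_fence = submit_frame();

		if (p_fence != -1) { /* throttle on frame i - 1 */
			struct pollfd pfd = {
				.fd = p_fence,
				.events = POLLIN,
			};
			poll(&pfd, 1, -1);
		}
		close(p_fence); /* close(-1) is harmless */

		p_fence = n_fence;
	}
	close(p_fence);
}
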
 tests/i915/gem_exec_schedule.c | 245 +++++++++++++++++++++++++++++++++
 1 file changed, 245 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..5d91e94a3 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -2495,6 +2495,246 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  1000000000);
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void async_delay(int i915,
+			const struct intel_execution_engine2 *e,
+			uint32_t handle,
+			uint64_t addr,
+			uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
+
+	if (offset_in_page(cs) & 4)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+timed_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	async_delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeout,
+		       unsigned int flags,
+		       unsigned long *out)
+#define F_PACING 0x1
+{
+	const int batches_per_frame = 3;
+	struct drm_i915_gem_exec_object2 prev =
+		timed_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 next =
+		timed_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct timespec tv = {};
+	unsigned long count = 0;
+	int p_fence = -1, n_fence = -1;
+
+	igt_nsec_elapsed(&tv);
+	igt_until_timeout(timeout) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(&next),
+			.buffer_count = 1,
+			.rsvd1 = ctx,
+			.flags = e->flags,
+		};
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~I915_EXEC_FENCE_OUT;
+		for (int n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACING && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		igt_swap(prev, next);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	gem_sync(i915, prev.handle);
+	*out = igt_nsec_elapsed(&tv) / count;
+	close(p_fence);
+
+	gem_close(i915, next.handle);
+	gem_close(i915, prev.handle);
+}
+
+static int ul_cmp(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	unsigned long *result;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 16; n <<= 1) {
+		int nchild = n - 1; /* odd for easy medians */
+
+		memset(result, 0, nchild * sizeof(result[0]));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			fair_child(i915, ctx, e, frame_ns / nchild,
+				   timeout, flags, &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+		igt_waitchildren();
+
+		qsort(result, nchild, sizeof(*result), ul_cmp);
+		igt_info("%d clients, range: [%lu, %lu], median: %lu\n",
+			 nchild, result[0], result[nchild-1], result[nchild/2]);
+
+		igt_assert(4 * result[0] > 3 * result[nchild-1]);
+		igt_assert(3 * result[0] < 4 * result[nchild-1]);
+
+		igt_assert(4 * result[nchild/2] > 3 * frame_ns);
+		igt_assert(3 * result[nchild/2] < 4 * frame_ns);
+	}
+
+	munmap(result, 4096);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2589,6 +2829,11 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fairness", fd, e)
+			fairness(fd, e, 3, F_PACING);
+		test_each_engine_store("unfairness", fd, e)
+			fairness(fd, e, 3, 0);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0.rc2


* [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness
@ 2020-06-01 19:08 Chris Wilson
  2020-06-01 19:53 ` Chris Wilson
  2020-06-01 21:17 ` Chris Wilson
  0 siblings, 2 replies; 20+ messages in thread
From: Chris Wilson @ 2020-06-01 19:08 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev, Chris Wilson

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in each client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
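The delay batch converts the target delay into CS timestamp ticks,
rounding up so that the spinner never exits early. A quick standalone
check of that conversion (illustrative only; 12.5MHz is merely an
example timestamp frequency):

#include <assert.h>
#include <stdint.h>

static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
{
	return (x + y - 1) / y;
}

static uint64_t ns_to_ticks(uint64_t ns, uint64_t hz)
{
	return div64_u64_round_up(ns * hz, 1000000000);
}

int main(void)
{
	/* one 16.666ms frame at 12.5MHz is exactly 208325 ticks */
	assert(ns_to_ticks(16666 * 1000, 12500000) == 208325);
	return 0;
}
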
 tests/i915/gem_exec_schedule.c | 224 +++++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..0ec21bf54 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -2495,6 +2495,227 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  1000000000);
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void async_delay(int i915,
+			const struct intel_execution_engine2 *e,
+			uint32_t handle,
+			uint64_t addr,
+			uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
+
+	if (offset_in_page(cs) & 4)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+timed_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	async_delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeout,
+		       unsigned long *out)
+{
+	const int batches_per_frame = 3;
+	struct drm_i915_gem_exec_object2 prev =
+		timed_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 next =
+		timed_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct timespec tv = {};
+	unsigned long count = 0;
+
+	igt_nsec_elapsed(&tv);
+	igt_until_timeout(timeout) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(&next),
+			.buffer_count = 1,
+			.rsvd1 = ctx,
+			.flags = e->flags,
+		};
+
+		for (int n = 0; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+
+		gem_sync(i915, prev.handle);
+		igt_swap(prev, next);
+		count++;
+	}
+	gem_sync(i915, prev.handle);
+	*out = igt_nsec_elapsed(&tv) / count;
+
+	gem_close(i915, next.handle);
+	gem_close(i915, prev.handle);
+}
+
+static int ul_cmp(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout)
+{
+	const int frame_ns = 16666 * 1000;
+	unsigned long *result;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 16; n <<= 1) {
+		int nchild = n - 1; /* odd for easy medians */
+
+		memset(result, 0, nchild * sizeof(result[0]));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			fair_child(i915, ctx, e, frame_ns / nchild,
+				   timeout, &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+		igt_waitchildren();
+
+		qsort(result, nchild, sizeof(*result), ul_cmp);
+		igt_info("%d clients, range: [%lu, %lu], median: %lu\n",
+			 nchild, result[0], result[nchild-1], result[nchild/2]);
+
+		igt_assert(4 * result[0] > 3 * result[nchild-1]);
+		igt_assert(3 * result[0] < 4 * result[nchild-1]);
+
+		igt_assert(4 * result[nchild/2] > 3 * frame_ns);
+		igt_assert(3 * result[nchild/2] < 4 * frame_ns);
+	}
+
+	munmap(result, 4096);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2589,6 +2810,9 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fairness", fd, e)
+			fairness(fd, e, 3);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));
-- 
2.27.0.rc2
