All of lore.kernel.org
 help / color / mirror / Atom feed
* [igt-dev] [PATCH] i915/gem_exec_balancer: Test parallel execbuf
@ 2021-10-22  0:18 Matthew Brost
  2021-10-22  0:59 ` [igt-dev] ✓ Fi.CI.BAT: success for " Patchwork
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Matthew Brost @ 2021-10-22  0:18 UTC (permalink / raw)
  To: igt-dev; +Cc: daniele.ceraolospurio, john.c.harrison

Add basic parallel execbuf submission test which more or less just
submits the same BB in a loop which does an atomic increment to a memory
location. The memory location is checked at the end for the correct
value. Different sections use various IOCTL options (e.g. fences,
location of BBs, etc...).

In addition to the above sections, an additional section ensures the ordering
of parallel submission by submitting a spinning batch to 1 individual
engine, submitting a parallel execbuf to all engine instances within the
class, verifying that none of the parallel execbufs make it to the hardware,
releasing the spinner, and finally verifying everything has completed.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 include/drm-uapi/i915_drm.h    | 136 ++++++++-
 lib/intel_ctx.c                |  28 +-
 lib/intel_ctx.h                |   2 +
 lib/intel_reg.h                |   5 +
 tests/i915/gem_exec_balancer.c | 487 +++++++++++++++++++++++++++++++++
 5 files changed, 656 insertions(+), 2 deletions(-)

diff --git a/include/drm-uapi/i915_drm.h b/include/drm-uapi/i915_drm.h
index c788a1ab4..b57f52623 100644
--- a/include/drm-uapi/i915_drm.h
+++ b/include/drm-uapi/i915_drm.h
@@ -1824,6 +1824,7 @@ struct drm_i915_gem_context_param {
  * Extensions:
  *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
  *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
+ *   i915_context_engines_parallel_submit (I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT)
  */
 #define I915_CONTEXT_PARAM_ENGINES	0xa
 
@@ -2104,10 +2105,137 @@ struct i915_context_engines_bond {
  * 	gem_execbuf(drm_fd, &execbuf);
  */
 
+/**
+ * struct i915_context_engines_parallel_submit - Configure engine for
+ * parallel submission.
+ *
+ * Setup a slot in the context engine map to allow multiple BBs to be submitted
+ * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the GPU
+ * in parallel. Multiple hardware contexts are created internally in the i915 to
+ * run these BBs. Once a slot is configured for N BBs only N BBs can be
+ * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user
+ * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL knows how
+ * many BBs there are based on the slot's configuration. The N BBs are the last
+ * N buffer objects or first N if I915_EXEC_BATCH_FIRST is set.
+ *
+ * The default placement behavior is to create implicit bonds between each
+ * context if each context maps to more than 1 physical engine (e.g. context is
+ * a virtual engine). Also we only allow contexts of same engine class and these
+ * contexts must be in logically contiguous order. Examples of the placement
+ * behavior described below. Lastly, the default is to not allow BBs to be
+ * preempted mid-BB; rather, coordinated preemption is inserted on all hardware
+ * contexts between each set of BBs. Flags may be added in the future to change
+ * both of these default behaviors.
+ *
+ * Returns -EINVAL if hardware context placement configuration is invalid or if
+ * the placement configuration isn't supported on the platform / submission
+ * interface.
+ * Returns -ENODEV if extension isn't supported on the platform / submission
+ * interface.
+ *
+ * .. code-block:: none
+ *
+ *	Example 1 pseudo code:
+ *	CS[X] = generic engine of same class, logical instance X
+ *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ *	set_engines(INVALID)
+ *	set_parallel(engine_index=0, width=2, num_siblings=1,
+ *		     engines=CS[0],CS[1])
+ *
+ *	Results in the following valid placement:
+ *	CS[0], CS[1]
+ *
+ *	Example 2 pseudo code:
+ *	CS[X] = generic engine of same class, logical instance X
+ *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ *	set_engines(INVALID)
+ *	set_parallel(engine_index=0, width=2, num_siblings=2,
+ *		     engines=CS[0],CS[2],CS[1],CS[3])
+ *
+ *	Results in the following valid placements:
+ *	CS[0], CS[1]
+ *	CS[2], CS[3]
+ *
+ *	This can also be thought of as 2 virtual engines described by a 2-D array
+ *	in the engines field with bonds placed between each index of the
+ *	virtual engines. e.g. CS[0] is bonded to CS[1], CS[2] is bonded to
+ *	CS[3].
+ *	VE[0] = CS[0], CS[2]
+ *	VE[1] = CS[1], CS[3]
+ *
+ *	Example 3 pseudo code:
+ *	CS[X] = generic engine of same class, logical instance X
+ *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ *	set_engines(INVALID)
+ *	set_parallel(engine_index=0, width=2, num_siblings=2,
+ *		     engines=CS[0],CS[1],CS[1],CS[3])
+ *
+ *	Results in the following valid and invalid placements:
+ *	CS[0], CS[1]
+ *	CS[1], CS[3] - Not logically contiguous, return -EINVAL
+ */
+struct i915_context_engines_parallel_submit {
+	/**
+	 * @base: base user extension.
+	 */
+	struct i915_user_extension base;
+
+	/**
+	 * @engine_index: slot for parallel engine
+	 */
+	__u16 engine_index;
+
+	/**
+	 * @width: number of contexts per parallel engine
+	 */
+	__u16 width;
+
+	/**
+	 * @num_siblings: number of siblings per context
+	 */
+	__u16 num_siblings;
+
+	/**
+	 * @mbz16: reserved for future use; must be zero
+	 */
+	__u16 mbz16;
+
+	/**
+	 * @flags: all undefined flags must be zero; currently no flags are defined
+	 */
+	__u64 flags;
+
+	/**
+	 * @mbz64: reserved for future use; must be zero
+	 */
+	__u64 mbz64[3];
+
+	/**
+	 * @engines: 2-d array of engine instances to configure parallel engine
+	 *
+	 * length = width (i) * num_siblings (j)
+	 * index = j + i * num_siblings
+	 */
+	struct i915_engine_class_instance engines[0];
+
+} __packed;
+
+#define I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(name__, N__) struct { \
+	struct i915_user_extension base; \
+	__u16 engine_index; \
+	__u16 width; \
+	__u16 num_siblings; \
+	__u16 mbz16; \
+	__u64 flags; \
+	__u64 mbz64[3]; \
+	struct i915_engine_class_instance engines[N__]; \
+} __attribute__((packed)) name__
+
 struct i915_context_param_engines {
 	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
 #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
 #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */
+#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */
 	struct i915_engine_class_instance engines[0];
 } __attribute__((packed));
 
@@ -2726,14 +2854,20 @@ struct drm_i915_engine_info {
 
 	/** @flags: Engine flags. */
 	__u64 flags;
+#define I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE		(1 << 0)
 
 	/** @capabilities: Capabilities of this engine. */
 	__u64 capabilities;
 #define I915_VIDEO_CLASS_CAPABILITY_HEVC		(1 << 0)
 #define I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC	(1 << 1)
 
+	/** @logical_instance: Logical instance of engine */
+	__u16 logical_instance;
+
 	/** @rsvd1: Reserved fields. */
-	__u64 rsvd1[4];
+	__u16 rsvd1[3];
+	/** @rsvd2: Reserved fields. */
+	__u64 rsvd2[3];
 };
 
 /**
diff --git a/lib/intel_ctx.c b/lib/intel_ctx.c
index f28c15544..11ec6fca4 100644
--- a/lib/intel_ctx.c
+++ b/lib/intel_ctx.c
@@ -83,6 +83,7 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
 {
 	uint64_t ext_root = 0;
 	I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(balance, GEM_MAX_ENGINES);
+	I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(parallel, GEM_MAX_ENGINES);
 	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, GEM_MAX_ENGINES);
 	struct drm_i915_gem_context_create_ext_setparam engines_param, vm_param;
 	struct drm_i915_gem_context_create_ext_setparam persist_param;
@@ -117,7 +118,29 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
 		unsigned num_logical_engines;
 		memset(&engines, 0, sizeof(engines));
 
-		if (cfg->load_balance) {
+		if (cfg->parallel) {
+			memset(&parallel, 0, sizeof(parallel));
+
+			num_logical_engines = 1;
+
+			parallel.base.name =
+				I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT;
+
+			engines.engines[0].engine_class =
+				I915_ENGINE_CLASS_INVALID;
+			engines.engines[0].engine_instance =
+				I915_ENGINE_CLASS_INVALID_NONE;
+
+			parallel.num_siblings = cfg->num_engines;
+			parallel.width = cfg->width;
+			for (i = 0; i < cfg->num_engines * cfg->width; i++) {
+				igt_assert_eq(cfg->engines[0].engine_class,
+					      cfg->engines[i].engine_class);
+				parallel.engines[i] = cfg->engines[i];
+			}
+
+			engines.extensions = to_user_pointer(&parallel);
+		} else if (cfg->load_balance) {
 			memset(&balance, 0, sizeof(balance));
 
 			/* In this case, the first engine is the virtual
@@ -127,6 +150,9 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
 			igt_assert(cfg->num_engines + 1 <= GEM_MAX_ENGINES);
 			num_logical_engines = cfg->num_engines + 1;
 
+			balance.base.name =
+				I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
+
 			engines.engines[0].engine_class =
 				I915_ENGINE_CLASS_INVALID;
 			engines.engines[0].engine_instance =
diff --git a/lib/intel_ctx.h b/lib/intel_ctx.h
index 9649f6d96..89c65fcd3 100644
--- a/lib/intel_ctx.h
+++ b/lib/intel_ctx.h
@@ -46,7 +46,9 @@ typedef struct intel_ctx_cfg {
 	uint32_t vm;
 	bool nopersist;
 	bool load_balance;
+	bool parallel;
 	unsigned int num_engines;
+	unsigned int width;
 	struct i915_engine_class_instance engines[GEM_MAX_ENGINES];
 } intel_ctx_cfg_t;
 
diff --git a/lib/intel_reg.h b/lib/intel_reg.h
index c447525a0..44b0d480f 100644
--- a/lib/intel_reg.h
+++ b/lib/intel_reg.h
@@ -2642,6 +2642,11 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define STATE3D_COLOR_FACTOR	((0x3<<29)|(0x1d<<24)|(0x01<<16))
 
+/* Atomics */
+#define MI_ATOMIC			((0x2f << 23) | 2)
+#define   MI_ATOMIC_INLINE_DATA         (1 << 18)
+#define   MI_ATOMIC_ADD                 (0x7 << 8)
+
 /* Batch */
 #define MI_BATCH_BUFFER		((0x30 << 23) | 1)
 #define MI_BATCH_BUFFER_START	(0x31 << 23)
diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
index e4e5cda4a..171295777 100644
--- a/tests/i915/gem_exec_balancer.c
+++ b/tests/i915/gem_exec_balancer.c
@@ -25,6 +25,7 @@
 #include <sched.h>
 #include <sys/ioctl.h>
 #include <sys/signal.h>
+#include <poll.h>
 
 #include "i915/gem.h"
 #include "i915/gem_create.h"
@@ -56,6 +57,31 @@ static size_t sizeof_load_balance(int count)
 
 #define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, sz__); })
 
+static int
+__i915_query(int fd, struct drm_i915_query *q)
+{
+	if (igt_ioctl(fd, DRM_IOCTL_I915_QUERY, q))
+		return -errno;
+
+	return 0;
+}
+
+static int
+__i915_query_items(int fd, struct drm_i915_query_item *items, uint32_t n_items)
+{
+	struct drm_i915_query q = {
+		.num_items = n_items,
+		.items_ptr = to_user_pointer(items),
+		};
+
+	return __i915_query(fd, &q);
+}
+
+#define i915_query_items(fd, items, n_items) do { \
+		igt_assert_eq(__i915_query_items(fd, items, n_items), 0); \
+		errno = 0; \
+	} while (0)
+
 static bool has_class_instance(int i915, uint16_t class, uint16_t instance)
 {
 	int fd;
@@ -2752,6 +2778,380 @@ static void nohangcheck(int i915)
 	close(params);
 }
 
+static void check_bo(int i915, uint32_t handle, unsigned int count, bool wait)
+{
+	uint32_t *map;
+
+	map = gem_mmap__cpu(i915, handle, 0, 4096, PROT_READ);
+	if (wait)
+		gem_set_domain(i915, handle, I915_GEM_DOMAIN_CPU,
+			       I915_GEM_DOMAIN_CPU);
+	igt_assert_eq(map[0], count);
+	munmap(map, 4096);
+}
+
+static struct drm_i915_query_engine_info *query_engine_info(int i915)
+{
+	struct drm_i915_query_engine_info *engines;
+	struct drm_i915_query_item item;
+
+#define QUERY_SIZE	0x4000
+	engines = malloc(QUERY_SIZE);
+	igt_assert(engines);
+
+	memset(engines, 0, QUERY_SIZE);
+	memset(&item, 0, sizeof(item));
+	item.query_id = DRM_I915_QUERY_ENGINE_INFO;
+	item.data_ptr = to_user_pointer(engines);
+	item.length = QUERY_SIZE;
+
+	i915_query_items(i915, &item, 1);
+	igt_assert(item.length >= 0);
+	igt_assert(item.length <= QUERY_SIZE);
+#undef QUERY_SIZE
+
+	return engines;
+}
+
+/* This function only works if siblings contains all instances of a class */
+static void logical_sort_siblings(int i915,
+				  struct i915_engine_class_instance *siblings,
+				  unsigned int count)
+{
+	struct i915_engine_class_instance *sorted;
+	struct drm_i915_query_engine_info *engines;
+	unsigned int i, j;
+
+	sorted = calloc(count, sizeof(*sorted));
+	igt_assert(sorted);
+
+	engines = query_engine_info(i915);
+
+	for (j = 0; j < count; ++j) {
+		for (i = 0; i < engines->num_engines; ++i) {
+			if (siblings[j].engine_class ==
+			    engines->engines[i].engine.engine_class &&
+			    siblings[j].engine_instance ==
+			    engines->engines[i].engine.engine_instance) {
+				uint16_t logical_instance =
+					engines->engines[i].logical_instance;
+
+				igt_assert(logical_instance < count);
+				igt_assert(!sorted[logical_instance].engine_class);
+				igt_assert(!sorted[logical_instance].engine_instance);
+
+				sorted[logical_instance] = siblings[j];
+				break;
+			}
+		}
+		igt_assert(i != engines->num_engines);
+	}
+
+	memcpy(siblings, sorted, sizeof(*sorted) * count);
+	free(sorted);
+	free(engines);
+}
+
+#define PARALLEL_BB_FIRST		(0x1 << 0)
+#define PARALLEL_OUT_FENCE		(0x1 << 1)
+#define PARALLEL_IN_FENCE		(0x1 << 2)
+#define PARALLEL_SUBMIT_FENCE		(0x1 << 3)
+#define PARALLEL_CONTEXTS		(0x1 << 4)
+#define PARALLEL_VIRTUAL		(0x1 << 5)
+
+static void parallel_thread(int i915, unsigned int flags,
+			    struct i915_engine_class_instance *siblings,
+			    unsigned int count, unsigned int bb_per_execbuf)
+{
+	const intel_ctx_t *ctx = NULL;
+	int n, i, j, fence = 0;
+	uint32_t batch[16];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_exec_object2 obj[32];
+#define PARALLEL_BB_LOOP_COUNT	512
+	const intel_ctx_t *ctxs[PARALLEL_BB_LOOP_COUNT];
+	uint32_t target_bo_idx = 0;
+	uint32_t first_bb_idx = 1;
+	intel_ctx_cfg_t cfg;
+
+	if (flags & PARALLEL_BB_FIRST) {
+		target_bo_idx = bb_per_execbuf;
+		first_bb_idx = 0;
+	}
+
+	memset(&cfg, 0, sizeof(cfg));
+	if (flags & PARALLEL_VIRTUAL) {
+		cfg.parallel = true;
+		cfg.num_engines = count / bb_per_execbuf;
+		cfg.width = bb_per_execbuf;
+
+		for (i = 0; i < cfg.width; ++i)
+			for (j = 0; j < cfg.num_engines; ++j)
+				memcpy(cfg.engines + i * cfg.num_engines + j,
+				       siblings + j * cfg.width + i,
+				       sizeof(*siblings));
+	} else {
+		cfg.parallel = true;
+		cfg.num_engines = 1;
+		cfg.width = count;
+		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
+	}
+	ctx = intel_ctx_create(i915, &cfg);
+
+	i = 0;
+	batch[i] = MI_ATOMIC | MI_ATOMIC_INLINE_DATA |
+		MI_ATOMIC_ADD;
+#define TARGET_BO_OFFSET	(0x1 << 16)
+	batch[++i] = TARGET_BO_OFFSET;
+	batch[++i] = 0;
+	batch[++i] = 1;
+	batch[++i] = MI_BATCH_BUFFER_END;
+
+	memset(obj, 0, sizeof(obj));
+	obj[target_bo_idx].offset = TARGET_BO_OFFSET;
+	obj[target_bo_idx].flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE;
+	obj[target_bo_idx].handle = gem_create(i915, 4096);
+
+	for (i = first_bb_idx; i < bb_per_execbuf + first_bb_idx; ++i) {
+		obj[i].handle = gem_create(i915, 4096);
+		gem_write(i915, obj[i].handle, 0, batch,
+			  sizeof(batch));
+	}
+
+	memset(&execbuf, 0, sizeof(execbuf));
+	execbuf.buffers_ptr = to_user_pointer(obj);
+	execbuf.buffer_count = bb_per_execbuf + 1;
+	execbuf.flags |= I915_EXEC_HANDLE_LUT;
+	if (flags & PARALLEL_BB_FIRST)
+		execbuf.flags |= I915_EXEC_BATCH_FIRST;
+	if (flags & PARALLEL_OUT_FENCE)
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+	execbuf.buffers_ptr = to_user_pointer(obj);
+	execbuf.rsvd1 = ctx->id;
+
+	for (n = 0; n < PARALLEL_BB_LOOP_COUNT; ++n) {
+		for (i = 0; i < count / bb_per_execbuf; ++i ) {
+			execbuf.flags &= ~0x3full;
+			execbuf.flags |= i;
+			gem_execbuf_wr(i915, &execbuf);
+
+			if (flags & PARALLEL_OUT_FENCE) {
+				igt_assert_eq(sync_fence_wait(execbuf.rsvd2 >> 32,
+							      1000), 0);
+				igt_assert_eq(sync_fence_status(execbuf.rsvd2 >> 32), 1);
+
+				if (fence)
+					close(fence);
+				fence = execbuf.rsvd2 >> 32;
+
+				if (flags & PARALLEL_SUBMIT_FENCE) {
+					execbuf.flags |=
+						I915_EXEC_FENCE_SUBMIT;
+					execbuf.rsvd2 >>= 32;
+				} else if (flags &  PARALLEL_IN_FENCE) {
+					execbuf.flags |=
+						I915_EXEC_FENCE_IN;
+					execbuf.rsvd2 >>= 32;
+				} else {
+					execbuf.rsvd2 = 0;
+				}
+			}
+
+			if (flags & PARALLEL_VIRTUAL)
+				break;
+		}
+
+		if (flags & PARALLEL_CONTEXTS) {
+			ctxs[n] = ctx;
+			ctx = intel_ctx_create(i915, &cfg);
+			execbuf.rsvd1 = ctx->id;
+		}
+	}
+	if (fence)
+		close(fence);
+
+	check_bo(i915, obj[target_bo_idx].handle, flags & PARALLEL_VIRTUAL ?
+		 bb_per_execbuf * PARALLEL_BB_LOOP_COUNT :
+		 count * PARALLEL_BB_LOOP_COUNT, true);
+
+	intel_ctx_destroy(i915, ctx);
+	for (i = 0; flags & PARALLEL_CONTEXTS &&
+	     i < PARALLEL_BB_LOOP_COUNT; ++i) {
+		intel_ctx_destroy(i915, ctxs[i]);
+	}
+	for (i = 0; i < bb_per_execbuf + 1; ++i)
+		gem_close(i915, obj[i].handle);
+}
+
+static void parallel(int i915, unsigned int flags)
+{
+	for (int class = 0; class < 32; class++) {
+		struct i915_engine_class_instance *siblings;
+		unsigned int count, bb_per_execbuf;
+
+		siblings = list_engines(i915, 1u << class, &count);
+		if (!siblings)
+			continue;
+
+		if (count < 2) {
+			free(siblings);
+			continue;
+		}
+
+		logical_sort_siblings(i915, siblings, count);
+		bb_per_execbuf = count;
+
+		parallel_thread(i915, flags, siblings,
+				count, bb_per_execbuf);
+
+		free(siblings);
+	}
+}
+
+static void parallel_balancer(int i915, unsigned int flags)
+{
+	for (int class = 0; class < 32; class++) {
+		struct i915_engine_class_instance *siblings;
+		unsigned int count;
+
+		siblings = list_engines(i915, 1u << class, &count);
+		if (!siblings)
+			continue;
+
+		if (count < 4) {
+			free(siblings);
+			continue;
+		}
+
+		logical_sort_siblings(i915, siblings, count);
+
+		for (unsigned int bb_per_execbuf = 2;;) {
+			igt_fork(child, count / bb_per_execbuf)
+				parallel_thread(i915,
+						flags | PARALLEL_VIRTUAL,
+						siblings,
+						count,
+						bb_per_execbuf);
+			igt_waitchildren();
+
+			if (count / ++bb_per_execbuf <= 1)
+				break;
+		}
+
+		free(siblings);
+	}
+}
+
+static bool fence_busy(int fence)
+{
+	return poll(&(struct pollfd){fence, POLLIN}, 1, 0) == 0;
+}
+
+static void parallel_ordering(int i915, unsigned int flags)
+{
+	for (int class = 0; class < 32; class++) {
+		const intel_ctx_t *ctx = NULL, *spin_ctx = NULL;
+		struct i915_engine_class_instance *siblings;
+		unsigned int count;
+		int i = 0, fence = 0;
+		uint32_t batch[16];
+		struct drm_i915_gem_execbuffer2 execbuf;
+		struct drm_i915_gem_exec_object2 obj[32];
+		igt_spin_t *spin;
+		intel_ctx_cfg_t cfg;
+
+		siblings = list_engines(i915, 1u << class, &count);
+		if (!siblings)
+			continue;
+
+		if (count < 2) {
+			free(siblings);
+			continue;
+		}
+
+		logical_sort_siblings(i915, siblings, count);
+
+		memset(&cfg, 0, sizeof(cfg));
+		cfg.parallel = true;
+		cfg.num_engines = 1;
+		cfg.width = count;
+		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
+
+		ctx = intel_ctx_create(i915, &cfg);
+
+		batch[i] = MI_ATOMIC | MI_ATOMIC_INLINE_DATA |
+			MI_ATOMIC_ADD;
+		batch[++i] = TARGET_BO_OFFSET;
+		batch[++i] = 0;
+		batch[++i] = 1;
+		batch[++i] = MI_BATCH_BUFFER_END;
+
+		memset(obj, 0, sizeof(obj));
+		obj[0].offset = TARGET_BO_OFFSET;
+		obj[0].flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE;
+		obj[0].handle = gem_create(i915, 4096);
+
+		for (i = 1; i < count + 1; ++i) {
+			obj[i].handle = gem_create(i915, 4096);
+			gem_write(i915, obj[i].handle, 0, batch,
+				  sizeof(batch));
+		}
+
+		memset(&execbuf, 0, sizeof(execbuf));
+		execbuf.buffers_ptr = to_user_pointer(obj);
+		execbuf.buffer_count = count + 1;
+		execbuf.flags |= I915_EXEC_HANDLE_LUT;
+		execbuf.flags |= I915_EXEC_NO_RELOC;
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		execbuf.buffers_ptr = to_user_pointer(obj);
+		execbuf.rsvd1 = ctx->id;
+
+		/* Block parallel submission */
+		spin_ctx = ctx_create_engines(i915, siblings, count);
+		spin = __igt_spin_new(i915,
+				      .ctx = spin_ctx,
+				      .engine = 0,
+				      .flags = IGT_SPIN_FENCE_OUT |
+				      IGT_SPIN_NO_PREEMPTION);
+
+		/* Wait for spinners to start */
+		usleep(5 * 10000);
+		igt_assert(fence_busy(spin->out_fence));
+
+		/* Submit parallel execbuf */
+		gem_execbuf_wr(i915, &execbuf);
+		fence = execbuf.rsvd2 >> 32;
+
+		/*
+		 * Wait long enough for timeslicing to kick in but not
+		 * preemption. Spinner + parallel execbuf should be
+		 * active.
+		 */
+		usleep(25 * 10000);
+		igt_assert(fence_busy(spin->out_fence));
+		igt_assert(fence_busy(fence));
+		check_bo(i915, obj[0].handle, 0, false);
+
+		/*
+		 * End spinner and wait for spinner + parallel execbuf
+		 * to complete.
+		 */
+		igt_spin_end(spin);
+		igt_assert_eq(sync_fence_wait(fence, 1000), 0);
+		igt_assert_eq(sync_fence_status(fence), 1);
+		check_bo(i915, obj[0].handle, count, true);
+		close(fence);
+
+		/* Clean up */
+		intel_ctx_destroy(i915, ctx);
+		intel_ctx_destroy(i915, spin_ctx);
+		for (i = 0; i < count + 1; ++i)
+			gem_close(i915, obj[i].handle);
+		free(siblings);
+		igt_spin_free(i915, spin);
+	}
+}
+
 static bool has_persistence(int i915)
 {
 	struct drm_i915_gem_context_param p = {
@@ -2786,6 +3186,61 @@ static bool has_load_balancer(int i915)
 	return err == 0;
 }
 
+static bool has_logical_mapping(int i915)
+{
+	struct drm_i915_query_engine_info *engines;
+	unsigned int i;
+
+	engines = query_engine_info(i915);
+
+	for (i = 0; i < engines->num_engines; ++i)
+		if (!(engines->engines[i].flags &
+		     I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE)) {
+			free(engines);
+			return false;
+		}
+
+	free(engines);
+	return true;
+}
+
+static bool has_parallel_execbuf(int i915)
+{
+	intel_ctx_cfg_t cfg = {
+		.parallel = true,
+		.num_engines = 1,
+	};
+	const intel_ctx_t *ctx = NULL;
+	int err;
+
+	for (int class = 0; class < 32; class++) {
+		struct i915_engine_class_instance *siblings;
+		unsigned int count;
+
+		siblings = list_engines(i915, 1u << class, &count);
+		if (!siblings)
+			continue;
+
+		if (count < 2) {
+			free(siblings);
+			continue;
+		}
+
+		logical_sort_siblings(i915, siblings, count);
+
+		cfg.width = count;
+		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
+		free(siblings);
+
+		err = __intel_ctx_create(i915, &cfg, &ctx);
+		intel_ctx_destroy(i915, ctx);
+
+		return err == 0;
+	}
+
+	return false;
+}
+
 igt_main
 {
 	int i915 = -1;
@@ -2886,6 +3341,38 @@ igt_main
 		igt_stop_hang_detector();
 	}
 
+	igt_subtest_group {
+		igt_fixture {
+			igt_require(has_logical_mapping(i915));
+			igt_require(has_parallel_execbuf(i915));
+		}
+
+		igt_subtest("parallel-ordering")
+			parallel_ordering(i915, 0);
+
+		igt_subtest("parallel")
+			parallel(i915, 0);
+
+		igt_subtest("parallel-bb-first")
+			parallel(i915, PARALLEL_BB_FIRST);
+
+		igt_subtest("parallel-out-fence")
+			parallel(i915, PARALLEL_OUT_FENCE);
+
+		igt_subtest("parallel-keep-in-fence")
+			parallel(i915, PARALLEL_OUT_FENCE | PARALLEL_IN_FENCE);
+
+		igt_subtest("parallel-keep-submit-fence")
+			parallel(i915, PARALLEL_OUT_FENCE |
+				 PARALLEL_SUBMIT_FENCE);
+
+		igt_subtest("parallel-contexts")
+			parallel(i915, PARALLEL_CONTEXTS);
+
+		igt_subtest("parallel-balancer")
+			parallel_balancer(i915, 0);
+	}
+
 	igt_subtest_group {
 		igt_hang_t  hang;
 
-- 
2.32.0

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for i915/gem_exec_balancer: Test parallel execbuf
  2021-10-22  0:18 [igt-dev] [PATCH] i915/gem_exec_balancer: Test parallel execbuf Matthew Brost
@ 2021-10-22  0:59 ` Patchwork
  2021-10-22  4:21 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
  2021-11-02 18:34 ` [igt-dev] [PATCH] " Daniele Ceraolo Spurio
  2 siblings, 0 replies; 5+ messages in thread
From: Patchwork @ 2021-10-22  0:59 UTC (permalink / raw)
  To: Matthew Brost; +Cc: igt-dev

[-- Attachment #1: Type: text/plain, Size: 8653 bytes --]

== Series Details ==

Series: i915/gem_exec_balancer: Test parallel execbuf
URL   : https://patchwork.freedesktop.org/series/96161/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_10773 -> IGTPW_6347
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in IGTPW_6347:

### IGT changes ###

#### Suppressed ####

  The following results come from untrusted machines, tests, or statuses.
  They do not affect the overall result.

  * igt@kms_frontbuffer_tracking@basic:
    - {fi-hsw-gt1}:       [PASS][1] -> [DMESG-WARN][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-hsw-gt1/igt@kms_frontbuffer_tracking@basic.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-hsw-gt1/igt@kms_frontbuffer_tracking@basic.html

  
Known issues
------------

  Here are the changes found in IGTPW_6347 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@core_hotunplug@unbind-rebind:
    - fi-elk-e7500:       [PASS][3] -> [FAIL][4] ([i915#3194])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-elk-e7500/igt@core_hotunplug@unbind-rebind.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-elk-e7500/igt@core_hotunplug@unbind-rebind.html

  * igt@gem_exec_fence@basic-busy@bcs0:
    - fi-apl-guc:         NOTRUN -> [SKIP][5] ([fdo#109271]) +1 similar issue
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-apl-guc/igt@gem_exec_fence@basic-busy@bcs0.html

  * igt@i915_hangman@error-state-basic:
    - fi-apl-guc:         NOTRUN -> [DMESG-WARN][6] ([i915#1610])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-apl-guc/igt@i915_hangman@error-state-basic.html

  * igt@i915_selftest@live@execlists:
    - fi-bsw-nick:        [PASS][7] -> [INCOMPLETE][8] ([i915#2940])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-bsw-nick/igt@i915_selftest@live@execlists.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-bsw-nick/igt@i915_selftest@live@execlists.html

  * igt@i915_selftest@live@hangcheck:
    - fi-snb-2600:        [PASS][9] -> [INCOMPLETE][10] ([i915#3921])
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-snb-2600/igt@i915_selftest@live@hangcheck.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-snb-2600/igt@i915_selftest@live@hangcheck.html

  * igt@runner@aborted:
    - fi-bsw-nick:        NOTRUN -> [FAIL][11] ([fdo#109271] / [i915#1436] / [i915#3428] / [i915#4312])
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-bsw-nick/igt@runner@aborted.html
    - fi-apl-guc:         NOTRUN -> [FAIL][12] ([i915#2426] / [i915#3363] / [i915#4312])
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-apl-guc/igt@runner@aborted.html

  
#### Possible fixes ####

  * igt@i915_pm_rpm@module-reload:
    - fi-kbl-guc:         [FAIL][13] ([i915#579]) -> [PASS][14]
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-kbl-guc/igt@i915_pm_rpm@module-reload.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-kbl-guc/igt@i915_pm_rpm@module-reload.html

  * igt@i915_selftest@live@gt_heartbeat:
    - fi-bdw-samus:       [DMESG-FAIL][15] ([i915#541]) -> [PASS][16]
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-bdw-samus/igt@i915_selftest@live@gt_heartbeat.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-bdw-samus/igt@i915_selftest@live@gt_heartbeat.html

  * igt@i915_selftest@live@hangcheck:
    - {fi-hsw-gt1}:       [DMESG-WARN][17] ([i915#3303]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-hsw-gt1/igt@i915_selftest@live@hangcheck.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-hsw-gt1/igt@i915_selftest@live@hangcheck.html

  * igt@kms_flip@basic-flip-vs-modeset@c-dp2:
    - fi-cfl-8109u:       [DMESG-WARN][19] ([i915#165]) -> [PASS][20] +2 similar issues
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-cfl-8109u/igt@kms_flip@basic-flip-vs-modeset@c-dp2.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-cfl-8109u/igt@kms_flip@basic-flip-vs-modeset@c-dp2.html

  * igt@kms_flip@basic-plain-flip@c-dp2:
    - fi-cfl-8109u:       [DMESG-WARN][21] ([i915#165] / [i915#295]) -> [PASS][22] +4 similar issues
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-cfl-8109u/igt@kms_flip@basic-plain-flip@c-dp2.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-cfl-8109u/igt@kms_flip@basic-plain-flip@c-dp2.html

  * igt@kms_frontbuffer_tracking@basic:
    - fi-cml-u2:          [DMESG-WARN][23] ([i915#4269]) -> [PASS][24]
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-cml-u2/igt@kms_frontbuffer_tracking@basic.html
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-cml-u2/igt@kms_frontbuffer_tracking@basic.html

  
#### Warnings ####

  * igt@i915_pm_rpm@basic-rte:
    - fi-kbl-guc:         [SKIP][25] ([fdo#109271]) -> [FAIL][26] ([i915#3049])
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-kbl-guc/igt@i915_pm_rpm@basic-rte.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-kbl-guc/igt@i915_pm_rpm@basic-rte.html

  * igt@kms_flip@basic-plain-flip@c-dp1:
    - fi-cfl-8109u:       [DMESG-WARN][27] ([i915#165] / [i915#295]) -> [FAIL][28] ([i915#4165])
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-cfl-8109u/igt@kms_flip@basic-plain-flip@c-dp1.html
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-cfl-8109u/igt@kms_flip@basic-plain-flip@c-dp1.html

  * igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b:
    - fi-cfl-8109u:       [DMESG-WARN][29] ([i915#165] / [i915#295]) -> [DMESG-WARN][30] ([i915#295]) +14 similar issues
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/fi-cfl-8109u/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b.html
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/fi-cfl-8109u/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [i915#1436]: https://gitlab.freedesktop.org/drm/intel/issues/1436
  [i915#1610]: https://gitlab.freedesktop.org/drm/intel/issues/1610
  [i915#165]: https://gitlab.freedesktop.org/drm/intel/issues/165
  [i915#2426]: https://gitlab.freedesktop.org/drm/intel/issues/2426
  [i915#2940]: https://gitlab.freedesktop.org/drm/intel/issues/2940
  [i915#295]: https://gitlab.freedesktop.org/drm/intel/issues/295
  [i915#3049]: https://gitlab.freedesktop.org/drm/intel/issues/3049
  [i915#3194]: https://gitlab.freedesktop.org/drm/intel/issues/3194
  [i915#3303]: https://gitlab.freedesktop.org/drm/intel/issues/3303
  [i915#3363]: https://gitlab.freedesktop.org/drm/intel/issues/3363
  [i915#3428]: https://gitlab.freedesktop.org/drm/intel/issues/3428
  [i915#3921]: https://gitlab.freedesktop.org/drm/intel/issues/3921
  [i915#4165]: https://gitlab.freedesktop.org/drm/intel/issues/4165
  [i915#4269]: https://gitlab.freedesktop.org/drm/intel/issues/4269
  [i915#4312]: https://gitlab.freedesktop.org/drm/intel/issues/4312
  [i915#541]: https://gitlab.freedesktop.org/drm/intel/issues/541
  [i915#579]: https://gitlab.freedesktop.org/drm/intel/issues/579


Participating hosts (39 -> 37)
------------------------------

  Additional (1): fi-apl-guc 
  Missing    (3): fi-ctg-p8600 fi-bsw-cyan fi-hsw-4200u 


Build changes
-------------

  * CI: CI-20190529 -> None
  * IGT: IGT_6258 -> IGTPW_6347

  CI-20190529: 20190529
  CI_DRM_10773: fa267509357bd9eb021c3d474fe0980cde18de62 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_6347: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/index.html
  IGT_6258: 4c80c71d7dec29b6376846ae96bd04dc0b6e34d9 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git



== Testlist changes ==

+igt@gem_exec_balancer@parallel
+igt@gem_exec_balancer@parallel-balancer
+igt@gem_exec_balancer@parallel-bb-first
+igt@gem_exec_balancer@parallel-contexts
+igt@gem_exec_balancer@parallel-keep-in-fence
+igt@gem_exec_balancer@parallel-keep-submit-fence
+igt@gem_exec_balancer@parallel-ordering
+igt@gem_exec_balancer@parallel-out-fence

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/index.html

[-- Attachment #2: Type: text/html, Size: 10314 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [igt-dev] ✓ Fi.CI.IGT: success for i915/gem_exec_balancer: Test parallel execbuf
  2021-10-22  0:18 [igt-dev] [PATCH] i915/gem_exec_balancer: Test parallel execbuf Matthew Brost
  2021-10-22  0:59 ` [igt-dev] ✓ Fi.CI.BAT: success for " Patchwork
@ 2021-10-22  4:21 ` Patchwork
  2021-11-02 18:34 ` [igt-dev] [PATCH] " Daniele Ceraolo Spurio
  2 siblings, 0 replies; 5+ messages in thread
From: Patchwork @ 2021-10-22  4:21 UTC (permalink / raw)
  To: Matthew Brost; +Cc: igt-dev

[-- Attachment #1: Type: text/plain, Size: 30263 bytes --]

== Series Details ==

Series: i915/gem_exec_balancer: Test parallel execbuf
URL   : https://patchwork.freedesktop.org/series/96161/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_10773_full -> IGTPW_6347_full
==============================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in IGTPW_6347_full:

### IGT changes ###

#### Possible regressions ####

  * {igt@gem_exec_balancer@parallel-contexts} (NEW):
    - shard-tglb:         NOTRUN -> [SKIP][1] +5 similar issues
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb8/igt@gem_exec_balancer@parallel-contexts.html

  * {igt@gem_exec_balancer@parallel-out-fence} (NEW):
    - shard-iclb:         NOTRUN -> [SKIP][2] +7 similar issues
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb7/igt@gem_exec_balancer@parallel-out-fence.html

  
New tests
---------

  New tests have been introduced between CI_DRM_10773_full and IGTPW_6347_full:

### New IGT tests (8) ###

  * igt@gem_exec_balancer@parallel:
    - Statuses : 6 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_balancer@parallel-balancer:
    - Statuses : 4 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_balancer@parallel-bb-first:
    - Statuses : 4 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_balancer@parallel-contexts:
    - Statuses : 6 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_balancer@parallel-keep-in-fence:
    - Statuses : 5 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_balancer@parallel-keep-submit-fence:
    - Statuses : 5 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_balancer@parallel-ordering:
    - Statuses : 5 skip(s)
    - Exec time: [0.0] s

  * igt@gem_exec_balancer@parallel-out-fence:
    - Statuses : 5 skip(s)
    - Exec time: [0.0] s

  

Known issues
------------

  Here are the changes found in IGTPW_6347_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@feature_discovery@display-2x:
    - shard-tglb:         NOTRUN -> [SKIP][3] ([i915#1839])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb6/igt@feature_discovery@display-2x.html

  * igt@gem_create@create-massive:
    - shard-snb:          NOTRUN -> [DMESG-WARN][4] ([i915#3002])
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-snb5/igt@gem_create@create-massive.html
    - shard-kbl:          NOTRUN -> [DMESG-WARN][5] ([i915#3002])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl3/igt@gem_create@create-massive.html
    - shard-apl:          NOTRUN -> [DMESG-WARN][6] ([i915#3002])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl1/igt@gem_create@create-massive.html

  * igt@gem_ctx_persistence@legacy-engines-hostile-preempt:
    - shard-snb:          NOTRUN -> [SKIP][7] ([fdo#109271] / [i915#1099]) +4 similar issues
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-snb2/igt@gem_ctx_persistence@legacy-engines-hostile-preempt.html

  * igt@gem_eio@unwedge-stress:
    - shard-glk:          [PASS][8] -> [TIMEOUT][9] ([i915#2369])
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-glk7/igt@gem_eio@unwedge-stress.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk8/igt@gem_eio@unwedge-stress.html
    - shard-tglb:         [PASS][10] -> [TIMEOUT][11] ([i915#2369] / [i915#3063] / [i915#3648])
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb3/igt@gem_eio@unwedge-stress.html
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb8/igt@gem_eio@unwedge-stress.html

  * {igt@gem_exec_balancer@parallel-out-fence} (NEW):
    - shard-glk:          NOTRUN -> [SKIP][12] ([fdo#109271]) +42 similar issues
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk6/igt@gem_exec_balancer@parallel-out-fence.html

  * igt@gem_exec_fair@basic-none@vcs1:
    - shard-kbl:          [PASS][13] -> [FAIL][14] ([i915#2842])
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-kbl4/igt@gem_exec_fair@basic-none@vcs1.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl2/igt@gem_exec_fair@basic-none@vcs1.html

  * igt@gem_exec_fair@basic-pace-solo@rcs0:
    - shard-glk:          [PASS][15] -> [FAIL][16] ([i915#2842]) +1 similar issue
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-glk9/igt@gem_exec_fair@basic-pace-solo@rcs0.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk1/igt@gem_exec_fair@basic-pace-solo@rcs0.html

  * igt@gem_exec_fair@basic-pace@bcs0:
    - shard-tglb:         [PASS][17] -> [FAIL][18] ([i915#2842]) +1 similar issue
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb8/igt@gem_exec_fair@basic-pace@bcs0.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@gem_exec_fair@basic-pace@bcs0.html

  * igt@gem_exec_fair@basic-throttle@rcs0:
    - shard-tglb:         NOTRUN -> [FAIL][19] ([i915#2842])
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb5/igt@gem_exec_fair@basic-throttle@rcs0.html

  * igt@gem_exec_flush@basic-batch-kernel-default-cmd:
    - shard-iclb:         NOTRUN -> [SKIP][20] ([fdo#109313])
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb8/igt@gem_exec_flush@basic-batch-kernel-default-cmd.html
    - shard-tglb:         NOTRUN -> [SKIP][21] ([fdo#109313])
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb6/igt@gem_exec_flush@basic-batch-kernel-default-cmd.html

  * igt@gem_exec_params@no-bsd:
    - shard-tglb:         NOTRUN -> [SKIP][22] ([fdo#109283])
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb7/igt@gem_exec_params@no-bsd.html
    - shard-iclb:         NOTRUN -> [SKIP][23] ([fdo#109283])
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb6/igt@gem_exec_params@no-bsd.html

  * igt@gem_exec_params@secure-non-root:
    - shard-tglb:         NOTRUN -> [SKIP][24] ([fdo#112283])
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb6/igt@gem_exec_params@secure-non-root.html

  * igt@gem_exec_whisper@basic-normal:
    - shard-glk:          [PASS][25] -> [DMESG-WARN][26] ([i915#118])
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-glk9/igt@gem_exec_whisper@basic-normal.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk6/igt@gem_exec_whisper@basic-normal.html

  * igt@gem_pxp@create-protected-buffer:
    - shard-iclb:         NOTRUN -> [SKIP][27] ([i915#4270])
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb7/igt@gem_pxp@create-protected-buffer.html
    - shard-tglb:         NOTRUN -> [SKIP][28] ([i915#4270])
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@gem_pxp@create-protected-buffer.html

  * igt@gem_userptr_blits@dmabuf-sync:
    - shard-apl:          NOTRUN -> [SKIP][29] ([fdo#109271] / [i915#3323])
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl6/igt@gem_userptr_blits@dmabuf-sync.html

  * igt@gem_userptr_blits@readonly-unsync:
    - shard-tglb:         NOTRUN -> [SKIP][30] ([i915#3297])
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb8/igt@gem_userptr_blits@readonly-unsync.html

  * igt@gem_userptr_blits@vma-merge:
    - shard-kbl:          NOTRUN -> [FAIL][31] ([i915#3318])
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl1/igt@gem_userptr_blits@vma-merge.html

  * igt@gen7_exec_parse@bitmasks:
    - shard-tglb:         NOTRUN -> [SKIP][32] ([fdo#109289])
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb7/igt@gen7_exec_parse@bitmasks.html

  * igt@gen9_exec_parse@allowed-all:
    - shard-kbl:          [PASS][33] -> [DMESG-WARN][34] ([i915#1436] / [i915#716])
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-kbl2/igt@gen9_exec_parse@allowed-all.html
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl7/igt@gen9_exec_parse@allowed-all.html

  * igt@gen9_exec_parse@shadow-peek:
    - shard-tglb:         NOTRUN -> [SKIP][35] ([i915#2856])
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@gen9_exec_parse@shadow-peek.html
    - shard-iclb:         NOTRUN -> [SKIP][36] ([i915#2856])
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb2/igt@gen9_exec_parse@shadow-peek.html

  * igt@i915_pm_dc@dc9-dpms:
    - shard-iclb:         [PASS][37] -> [FAIL][38] ([i915#4275])
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-iclb4/igt@i915_pm_dc@dc9-dpms.html
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb4/igt@i915_pm_dc@dc9-dpms.html

  * igt@i915_pm_rpm@dpms-non-lpsp:
    - shard-tglb:         NOTRUN -> [SKIP][39] ([fdo#111644] / [i915#1397] / [i915#2411])
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb6/igt@i915_pm_rpm@dpms-non-lpsp.html

  * igt@i915_pm_sseu@full-enable:
    - shard-tglb:         NOTRUN -> [SKIP][40] ([fdo#109288])
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@i915_pm_sseu@full-enable.html

  * igt@i915_query@query-topology-unsupported:
    - shard-tglb:         NOTRUN -> [SKIP][41] ([fdo#109302])
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb8/igt@i915_query@query-topology-unsupported.html

  * igt@i915_selftest@live@hangcheck:
    - shard-snb:          NOTRUN -> [INCOMPLETE][42] ([i915#3921])
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-snb6/igt@i915_selftest@live@hangcheck.html

  * igt@i915_suspend@fence-restore-tiled2untiled:
    - shard-apl:          [PASS][43] -> [DMESG-WARN][44] ([i915#180]) +1 similar issue
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-apl3/igt@i915_suspend@fence-restore-tiled2untiled.html
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl1/igt@i915_suspend@fence-restore-tiled2untiled.html

  * igt@i915_suspend@sysfs-reader:
    - shard-tglb:         [PASS][45] -> [INCOMPLETE][46] ([i915#456])
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb5/igt@i915_suspend@sysfs-reader.html
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb7/igt@i915_suspend@sysfs-reader.html

  * igt@kms_big_fb@linear-8bpp-rotate-270:
    - shard-tglb:         NOTRUN -> [SKIP][47] ([fdo#111614]) +3 similar issues
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb3/igt@kms_big_fb@linear-8bpp-rotate-270.html

  * igt@kms_big_fb@x-tiled-16bpp-rotate-270:
    - shard-iclb:         NOTRUN -> [SKIP][48] ([fdo#110725] / [fdo#111614])
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb5/igt@kms_big_fb@x-tiled-16bpp-rotate-270.html

  * igt@kms_big_fb@yf-tiled-max-hw-stride-32bpp-rotate-0-hflip:
    - shard-apl:          NOTRUN -> [SKIP][49] ([fdo#109271] / [i915#3777]) +1 similar issue
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl8/igt@kms_big_fb@yf-tiled-max-hw-stride-32bpp-rotate-0-hflip.html

  * igt@kms_big_fb@yf-tiled-max-hw-stride-64bpp-rotate-0-hflip:
    - shard-tglb:         NOTRUN -> [SKIP][50] ([fdo#111615])
   [50]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@kms_big_fb@yf-tiled-max-hw-stride-64bpp-rotate-0-hflip.html

  * igt@kms_big_joiner@invalid-modeset:
    - shard-tglb:         NOTRUN -> [SKIP][51] ([i915#2705])
   [51]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@kms_big_joiner@invalid-modeset.html

  * igt@kms_ccs@pipe-a-crc-sprite-planes-basic-y_tiled_gen12_mc_ccs:
    - shard-glk:          NOTRUN -> [SKIP][52] ([fdo#109271] / [i915#3886]) +4 similar issues
   [52]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk2/igt@kms_ccs@pipe-a-crc-sprite-planes-basic-y_tiled_gen12_mc_ccs.html
    - shard-iclb:         NOTRUN -> [SKIP][53] ([fdo#109278] / [i915#3886]) +1 similar issue
   [53]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb8/igt@kms_ccs@pipe-a-crc-sprite-planes-basic-y_tiled_gen12_mc_ccs.html
    - shard-tglb:         NOTRUN -> [SKIP][54] ([i915#3689] / [i915#3886])
   [54]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@kms_ccs@pipe-a-crc-sprite-planes-basic-y_tiled_gen12_mc_ccs.html

  * igt@kms_ccs@pipe-a-missing-ccs-buffer-y_tiled_gen12_rc_ccs_cc:
    - shard-apl:          NOTRUN -> [SKIP][55] ([fdo#109271] / [i915#3886]) +7 similar issues
   [55]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl3/igt@kms_ccs@pipe-a-missing-ccs-buffer-y_tiled_gen12_rc_ccs_cc.html

  * igt@kms_ccs@pipe-b-ccs-on-another-bo-y_tiled_gen12_rc_ccs:
    - shard-iclb:         NOTRUN -> [SKIP][56] ([fdo#109278]) +1 similar issue
   [56]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb8/igt@kms_ccs@pipe-b-ccs-on-another-bo-y_tiled_gen12_rc_ccs.html

  * igt@kms_ccs@pipe-b-missing-ccs-buffer-y_tiled_gen12_rc_ccs_cc:
    - shard-kbl:          NOTRUN -> [SKIP][57] ([fdo#109271] / [i915#3886]) +10 similar issues
   [57]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl4/igt@kms_ccs@pipe-b-missing-ccs-buffer-y_tiled_gen12_rc_ccs_cc.html

  * igt@kms_ccs@pipe-d-missing-ccs-buffer-yf_tiled_ccs:
    - shard-tglb:         NOTRUN -> [SKIP][58] ([i915#3689]) +8 similar issues
   [58]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb8/igt@kms_ccs@pipe-d-missing-ccs-buffer-yf_tiled_ccs.html

  * igt@kms_cdclk@mode-transition:
    - shard-apl:          NOTRUN -> [SKIP][59] ([fdo#109271]) +254 similar issues
   [59]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl7/igt@kms_cdclk@mode-transition.html

  * igt@kms_chamelium@vga-hpd:
    - shard-apl:          NOTRUN -> [SKIP][60] ([fdo#109271] / [fdo#111827]) +15 similar issues
   [60]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl1/igt@kms_chamelium@vga-hpd.html

  * igt@kms_chamelium@vga-hpd-after-suspend:
    - shard-glk:          NOTRUN -> [SKIP][61] ([fdo#109271] / [fdo#111827])
   [61]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk5/igt@kms_chamelium@vga-hpd-after-suspend.html

  * igt@kms_color_chamelium@pipe-a-ctm-0-25:
    - shard-snb:          NOTRUN -> [SKIP][62] ([fdo#109271] / [fdo#111827]) +15 similar issues
   [62]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-snb6/igt@kms_color_chamelium@pipe-a-ctm-0-25.html

  * igt@kms_color_chamelium@pipe-b-ctm-0-75:
    - shard-tglb:         NOTRUN -> [SKIP][63] ([fdo#109284] / [fdo#111827]) +5 similar issues
   [63]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@kms_color_chamelium@pipe-b-ctm-0-75.html

  * igt@kms_color_chamelium@pipe-c-ctm-0-25:
    - shard-kbl:          NOTRUN -> [SKIP][64] ([fdo#109271] / [fdo#111827]) +18 similar issues
   [64]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl1/igt@kms_color_chamelium@pipe-c-ctm-0-25.html

  * igt@kms_content_protection@srm:
    - shard-apl:          NOTRUN -> [TIMEOUT][65] ([i915#1319])
   [65]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl8/igt@kms_content_protection@srm.html

  * igt@kms_content_protection@uevent:
    - shard-apl:          NOTRUN -> [FAIL][66] ([i915#2105])
   [66]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl2/igt@kms_content_protection@uevent.html

  * igt@kms_cursor_crc@pipe-c-cursor-512x170-offscreen:
    - shard-tglb:         NOTRUN -> [SKIP][67] ([fdo#109279] / [i915#3359]) +2 similar issues
   [67]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb8/igt@kms_cursor_crc@pipe-c-cursor-512x170-offscreen.html

  * igt@kms_cursor_crc@pipe-c-cursor-max-size-onscreen:
    - shard-tglb:         NOTRUN -> [SKIP][68] ([i915#3359]) +3 similar issues
   [68]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@kms_cursor_crc@pipe-c-cursor-max-size-onscreen.html

  * igt@kms_cursor_crc@pipe-d-cursor-32x32-random:
    - shard-tglb:         NOTRUN -> [SKIP][69] ([i915#3319])
   [69]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@kms_cursor_crc@pipe-d-cursor-32x32-random.html

  * igt@kms_cursor_edge_walk@pipe-d-128x128-right-edge:
    - shard-snb:          NOTRUN -> [SKIP][70] ([fdo#109271]) +409 similar issues
   [70]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-snb2/igt@kms_cursor_edge_walk@pipe-d-128x128-right-edge.html

  * igt@kms_cursor_legacy@cursorb-vs-flipa-atomic:
    - shard-iclb:         NOTRUN -> [SKIP][71] ([fdo#109274] / [fdo#109278])
   [71]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb2/igt@kms_cursor_legacy@cursorb-vs-flipa-atomic.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions:
    - shard-glk:          [PASS][72] -> [FAIL][73] ([i915#2346])
   [72]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-glk8/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html
   [73]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk7/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html

  * igt@kms_dp_tiled_display@basic-test-pattern-with-chamelium:
    - shard-tglb:         NOTRUN -> [SKIP][74] ([i915#3528])
   [74]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb5/igt@kms_dp_tiled_display@basic-test-pattern-with-chamelium.html

  * igt@kms_flip@2x-plain-flip-ts-check-interruptible@ac-hdmi-a1-hdmi-a2:
    - shard-glk:          [PASS][75] -> [FAIL][76] ([i915#2122])
   [75]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-glk3/igt@kms_flip@2x-plain-flip-ts-check-interruptible@ac-hdmi-a1-hdmi-a2.html
   [76]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk7/igt@kms_flip@2x-plain-flip-ts-check-interruptible@ac-hdmi-a1-hdmi-a2.html

  * igt@kms_flip@flip-vs-suspend-interruptible@a-dp1:
    - shard-kbl:          [PASS][77] -> [DMESG-WARN][78] ([i915#180]) +4 similar issues
   [77]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-kbl2/igt@kms_flip@flip-vs-suspend-interruptible@a-dp1.html
   [78]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl7/igt@kms_flip@flip-vs-suspend-interruptible@a-dp1.html

  * igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-shrfb-msflip-blt:
    - shard-tglb:         NOTRUN -> [SKIP][79] ([fdo#111825]) +20 similar issues
   [79]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb6/igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-shrfb-msflip-blt.html
    - shard-iclb:         NOTRUN -> [SKIP][80] ([fdo#109280]) +7 similar issues
   [80]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb4/igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-shrfb-msflip-blt.html

  * igt@kms_frontbuffer_tracking@fbc-suspend:
    - shard-apl:          NOTRUN -> [DMESG-WARN][81] ([i915#180])
   [81]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl8/igt@kms_frontbuffer_tracking@fbc-suspend.html

  * igt@kms_frontbuffer_tracking@psr-suspend:
    - shard-tglb:         [PASS][82] -> [INCOMPLETE][83] ([i915#2411] / [i915#456])
   [82]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb2/igt@kms_frontbuffer_tracking@psr-suspend.html
   [83]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb7/igt@kms_frontbuffer_tracking@psr-suspend.html

  * igt@kms_hdr@bpc-switch-suspend:
    - shard-kbl:          NOTRUN -> [DMESG-WARN][84] ([i915#180]) +6 similar issues
   [84]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl4/igt@kms_hdr@bpc-switch-suspend.html

  * igt@kms_hdr@static-toggle-dpms:
    - shard-tglb:         NOTRUN -> [SKIP][85] ([i915#1187])
   [85]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb5/igt@kms_hdr@static-toggle-dpms.html

  * igt@kms_pipe_crc_basic@disable-crc-after-crtc-pipe-d:
    - shard-kbl:          NOTRUN -> [SKIP][86] ([fdo#109271] / [i915#533]) +1 similar issue
   [86]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl6/igt@kms_pipe_crc_basic@disable-crc-after-crtc-pipe-d.html

  * igt@kms_pipe_crc_basic@suspend-read-crc-pipe-c:
    - shard-tglb:         [PASS][87] -> [INCOMPLETE][88] ([i915#2828] / [i915#456])
   [87]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb1/igt@kms_pipe_crc_basic@suspend-read-crc-pipe-c.html
   [88]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb7/igt@kms_pipe_crc_basic@suspend-read-crc-pipe-c.html

  * igt@kms_plane@plane-panning-bottom-right-suspend@pipe-b-planes:
    - shard-tglb:         NOTRUN -> [INCOMPLETE][89] ([i915#456])
   [89]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb7/igt@kms_plane@plane-panning-bottom-right-suspend@pipe-b-planes.html

  * igt@kms_plane_alpha_blend@pipe-a-alpha-7efc:
    - shard-kbl:          NOTRUN -> [FAIL][90] ([fdo#108145] / [i915#265]) +1 similar issue
   [90]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl3/igt@kms_plane_alpha_blend@pipe-a-alpha-7efc.html

  * igt@kms_plane_alpha_blend@pipe-a-alpha-transparent-fb:
    - shard-kbl:          NOTRUN -> [FAIL][91] ([i915#265])
   [91]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl6/igt@kms_plane_alpha_blend@pipe-a-alpha-transparent-fb.html

  * igt@kms_plane_alpha_blend@pipe-b-constant-alpha-max:
    - shard-apl:          NOTRUN -> [FAIL][92] ([fdo#108145] / [i915#265]) +1 similar issue
   [92]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl7/igt@kms_plane_alpha_blend@pipe-b-constant-alpha-max.html

  * igt@kms_plane_alpha_blend@pipe-c-alpha-transparent-fb:
    - shard-apl:          NOTRUN -> [FAIL][93] ([i915#265])
   [93]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl1/igt@kms_plane_alpha_blend@pipe-c-alpha-transparent-fb.html

  * igt@kms_plane_lowres@pipe-a-tiling-y:
    - shard-iclb:         NOTRUN -> [SKIP][94] ([i915#3536])
   [94]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb3/igt@kms_plane_lowres@pipe-a-tiling-y.html
    - shard-tglb:         NOTRUN -> [SKIP][95] ([i915#3536])
   [95]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@kms_plane_lowres@pipe-a-tiling-y.html

  * igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area-2:
    - shard-apl:          NOTRUN -> [SKIP][96] ([fdo#109271] / [i915#658]) +4 similar issues
   [96]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl3/igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area-2.html

  * igt@kms_psr2_sf@primary-plane-update-sf-dmg-area-1:
    - shard-kbl:          NOTRUN -> [SKIP][97] ([fdo#109271] / [i915#658]) +5 similar issues
   [97]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl2/igt@kms_psr2_sf@primary-plane-update-sf-dmg-area-1.html

  * igt@kms_psr2_sf@primary-plane-update-sf-dmg-area-5:
    - shard-tglb:         NOTRUN -> [SKIP][98] ([i915#2920]) +1 similar issue
   [98]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@kms_psr2_sf@primary-plane-update-sf-dmg-area-5.html

  * igt@kms_psr2_su@page_flip:
    - shard-tglb:         NOTRUN -> [SKIP][99] ([i915#1911])
   [99]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@kms_psr2_su@page_flip.html

  * igt@kms_psr@psr2_cursor_plane_onoff:
    - shard-tglb:         NOTRUN -> [FAIL][100] ([i915#132] / [i915#3467])
   [100]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@kms_psr@psr2_cursor_plane_onoff.html

  * igt@kms_psr@psr2_no_drrs:
    - shard-iclb:         [PASS][101] -> [SKIP][102] ([fdo#109441]) +2 similar issues
   [101]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-iclb2/igt@kms_psr@psr2_no_drrs.html
   [102]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb5/igt@kms_psr@psr2_no_drrs.html

  * igt@kms_universal_plane@disable-primary-vs-flip-pipe-d:
    - shard-kbl:          NOTRUN -> [SKIP][103] ([fdo#109271]) +231 similar issues
   [103]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl1/igt@kms_universal_plane@disable-primary-vs-flip-pipe-d.html

  * igt@kms_vblank@pipe-d-wait-idle:
    - shard-apl:          NOTRUN -> [SKIP][104] ([fdo#109271] / [i915#533]) +2 similar issues
   [104]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl1/igt@kms_vblank@pipe-d-wait-idle.html

  * igt@kms_writeback@writeback-check-output:
    - shard-apl:          NOTRUN -> [SKIP][105] ([fdo#109271] / [i915#2437]) +1 similar issue
   [105]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl7/igt@kms_writeback@writeback-check-output.html

  * igt@nouveau_crc@pipe-d-ctx-flip-skip-current-frame:
    - shard-tglb:         NOTRUN -> [SKIP][106] ([i915#2530])
   [106]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@nouveau_crc@pipe-d-ctx-flip-skip-current-frame.html

  * igt@prime_nv_test@i915_import_cpu_mmap:
    - shard-iclb:         NOTRUN -> [SKIP][107] ([fdo#109291])
   [107]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb5/igt@prime_nv_test@i915_import_cpu_mmap.html
    - shard-tglb:         NOTRUN -> [SKIP][108] ([fdo#109291])
   [108]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb8/igt@prime_nv_test@i915_import_cpu_mmap.html

  * igt@prime_vgem@basic-userptr:
    - shard-tglb:         NOTRUN -> [SKIP][109] ([i915#3301])
   [109]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@prime_vgem@basic-userptr.html

  * igt@prime_vgem@fence-write-hang:
    - shard-tglb:         NOTRUN -> [SKIP][110] ([fdo#109295])
   [110]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb8/igt@prime_vgem@fence-write-hang.html

  * igt@sysfs_clients@fair-7:
    - shard-apl:          NOTRUN -> [SKIP][111] ([fdo#109271] / [i915#2994]) +4 similar issues
   [111]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-apl8/igt@sysfs_clients@fair-7.html

  * igt@sysfs_clients@sema-10:
    - shard-tglb:         NOTRUN -> [SKIP][112] ([i915#2994]) +1 similar issue
   [112]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb7/igt@sysfs_clients@sema-10.html
    - shard-kbl:          NOTRUN -> [SKIP][113] ([fdo#109271] / [i915#2994])
   [113]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl1/igt@sysfs_clients@sema-10.html

  
#### Possible fixes ####

  * igt@gem_exec_fair@basic-pace@vecs0:
    - shard-glk:          [FAIL][114] ([i915#2842]) -> [PASS][115] +2 similar issues
   [114]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-glk3/igt@gem_exec_fair@basic-pace@vecs0.html
   [115]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk2/igt@gem_exec_fair@basic-pace@vecs0.html
    - shard-iclb:         [FAIL][116] ([i915#2842]) -> [PASS][117]
   [116]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-iclb8/igt@gem_exec_fair@basic-pace@vecs0.html
   [117]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb8/igt@gem_exec_fair@basic-pace@vecs0.html

  * igt@gem_exec_fair@basic-throttle@rcs0:
    - shard-iclb:         [FAIL][118] ([i915#2849]) -> [PASS][119]
   [118]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-iclb4/igt@gem_exec_fair@basic-throttle@rcs0.html
   [119]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb7/igt@gem_exec_fair@basic-throttle@rcs0.html

  * igt@gem_huc_copy@huc-copy:
    - shard-tglb:         [SKIP][120] ([i915#2190]) -> [PASS][121]
   [120]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb6/igt@gem_huc_copy@huc-copy.html
   [121]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb2/igt@gem_huc_copy@huc-copy.html

  * igt@i915_pm_dc@dc6-psr:
    - shard-iclb:         [FAIL][122] ([i915#454]) -> [PASS][123]
   [122]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-iclb6/igt@i915_pm_dc@dc6-psr.html
   [123]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-iclb7/igt@i915_pm_dc@dc6-psr.html

  * igt@i915_suspend@forcewake:
    - shard-tglb:         [INCOMPLETE][124] ([i915#2411] / [i915#456]) -> [PASS][125] +1 similar issue
   [124]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb7/igt@i915_suspend@forcewake.html
   [125]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb5/igt@i915_suspend@forcewake.html

  * igt@kms_big_fb@linear-32bpp-rotate-180:
    - shard-glk:          [DMESG-WARN][126] ([i915#118]) -> [PASS][127]
   [126]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-glk6/igt@kms_big_fb@linear-32bpp-rotate-180.html
   [127]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-glk5/igt@kms_big_fb@linear-32bpp-rotate-180.html

  * igt@kms_cursor_crc@pipe-c-cursor-suspend:
    - shard-kbl:          [DMESG-WARN][128] ([i915#180]) -> [PASS][129] +5 similar issues
   [128]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-kbl6/igt@kms_cursor_crc@pipe-c-cursor-suspend.html
   [129]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl2/igt@kms_cursor_crc@pipe-c-cursor-suspend.html

  * igt@kms_fbcon_fbt@psr-suspend:
    - shard-tglb:         [INCOMPLETE][130] ([i915#456]) -> [PASS][131]
   [130]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb7/igt@kms_fbcon_fbt@psr-suspend.html
   [131]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@kms_fbcon_fbt@psr-suspend.html

  * igt@kms_flip@flip-vs-expired-vblank-interruptible@a-dp1:
    - shard-kbl:          [FAIL][132] ([i915#79]) -> [PASS][133]
   [132]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-kbl6/igt@kms_flip@flip-vs-expired-vblank-interruptible@a-dp1.html
   [133]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-kbl6/igt@kms_flip@flip-vs-expired-vblank-interruptible@a-dp1.html

  * igt@kms_frontbuffer_tracking@fbc-suspend:
    - shard-tglb:         [INCOMPLETE][134] ([i915#2828] / [i915#456]) -> [PASS][135]
   [134]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10773/shard-tglb7/igt@kms_frontbuffer_tracking@fbc-suspend.html
   [135]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/shard-tglb1/igt@kms_frontbuffer_tracking@fbc-suspend.html

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_6347/index.html

[-- Attachment #2: Type: text/html, Size: 34009 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [igt-dev] [PATCH] i915/gem_exec_balancer: Test parallel execbuf
  2021-10-22  0:18 [igt-dev] [PATCH] i915/gem_exec_balancer: Test parallel execbuf Matthew Brost
  2021-10-22  0:59 ` [igt-dev] ✓ Fi.CI.BAT: success for " Patchwork
  2021-10-22  4:21 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
@ 2021-11-02 18:34 ` Daniele Ceraolo Spurio
  2021-11-02 21:55   ` Matthew Brost
  2 siblings, 1 reply; 5+ messages in thread
From: Daniele Ceraolo Spurio @ 2021-11-02 18:34 UTC (permalink / raw)
  To: Matthew Brost, igt-dev



On 10/21/2021 5:18 PM, Matthew Brost wrote:
> Add basic parallel execbuf submission test which more or less just
> submits the same BB in loop a which does an atomic increment to a memory
> location. The memory location is checked at the end for the correct
> value. Different sections use various IOCTL options (e.g. fences,
> location of BBs, etc...).
>
> In addition to above sections, an additional section ensure the ordering
> of parallel submission by submitting a spinning batch to 1 individual
> engine, submit a parallel execbuf to all engines instances within the
> class, verify none on parallel execbuf make to hardware, release
> spinner, and finally verify everything has completed.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   include/drm-uapi/i915_drm.h    | 136 ++++++++-
>   lib/intel_ctx.c                |  28 +-
>   lib/intel_ctx.h                |   2 +
>   lib/intel_reg.h                |   5 +
>   tests/i915/gem_exec_balancer.c | 487 +++++++++++++++++++++++++++++++++
>   5 files changed, 656 insertions(+), 2 deletions(-)
>
> diff --git a/include/drm-uapi/i915_drm.h b/include/drm-uapi/i915_drm.h
> index c788a1ab4..b57f52623 100644
> --- a/include/drm-uapi/i915_drm.h
> +++ b/include/drm-uapi/i915_drm.h

The uapi file needs to be in sync with drm-next. If the changes have 
already reached drm-next then we should just have a separate patch doing 
the file sync, otherwise these defs must move to lib/i915/i915_drm_local.h

> @@ -1824,6 +1824,7 @@ struct drm_i915_gem_context_param {
>    * Extensions:
>    *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
>    *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
> + *   i915_context_engines_parallel_submit (I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT)
>    */
>   #define I915_CONTEXT_PARAM_ENGINES	0xa
>   
> @@ -2104,10 +2105,137 @@ struct i915_context_engines_bond {
>    * 	gem_execbuf(drm_fd, &execbuf);
>    */
>   
> +/**
> + * struct i915_context_engines_parallel_submit - Configure engine for
> + * parallel submission.
> + *
> + * Setup a slot in the context engine map to allow multiple BBs to be submitted
> + * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the GPU
> + * in parallel. Multiple hardware contexts are created internally in the i915
> + * run these BBs. Once a slot is configured for N BBs only N BBs can be
> + * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user
> + * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL knows how
> + * many BBs there are based on the slot's configuration. The N BBs are the last
> + * N buffer objects or first N if I915_EXEC_BATCH_FIRST is set.
> + *
> + * The default placement behavior is to create implicit bonds between each
> + * context if each context maps to more than 1 physical engine (e.g. context is
> + * a virtual engine). Also we only allow contexts of same engine class and these
> + * contexts must be in logically contiguous order. Examples of the placement
> + * behavior described below. Lastly, the default is to not allow BBs to
> + * preempted mid BB rather insert coordinated preemption on all hardware
> + * contexts between each set of BBs. Flags may be added in the future to change
> + * both of these default behaviors.
> + *
> + * Returns -EINVAL if hardware context placement configuration is invalid or if
> + * the placement configuration isn't supported on the platform / submission
> + * interface.
> + * Returns -ENODEV if extension isn't supported on the platform / submission
> + * interface.
> + *
> + * .. code-block:: none
> + *
> + *	Example 1 pseudo code:
> + *	CS[X] = generic engine of same class, logical instance X
> + *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
> + *	set_engines(INVALID)
> + *	set_parallel(engine_index=0, width=2, num_siblings=1,
> + *		     engines=CS[0],CS[1])
> + *
> + *	Results in the following valid placement:
> + *	CS[0], CS[1]
> + *
> + *	Example 2 pseudo code:
> + *	CS[X] = generic engine of same class, logical instance X
> + *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
> + *	set_engines(INVALID)
> + *	set_parallel(engine_index=0, width=2, num_siblings=2,
> + *		     engines=CS[0],CS[2],CS[1],CS[3])
> + *
> + *	Results in the following valid placements:
> + *	CS[0], CS[1]
> + *	CS[2], CS[3]
> + *
> + *	This can also be thought of as 2 virtual engines described by 2-D array
> + *	in the engines the field with bonds placed between each index of the
> + *	virtual engines. e.g. CS[0] is bonded to CS[1], CS[2] is bonded to
> + *	CS[3].
> + *	VE[0] = CS[0], CS[2]
> + *	VE[1] = CS[1], CS[3]
> + *
> + *	Example 3 pseudo code:
> + *	CS[X] = generic engine of same class, logical instance X
> + *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
> + *	set_engines(INVALID)
> + *	set_parallel(engine_index=0, width=2, num_siblings=2,
> + *		     engines=CS[0],CS[1],CS[1],CS[3])
> + *
> + *	Results in the following valid and invalid placements:
> + *	CS[0], CS[1]
> + *	CS[1], CS[3] - Not logical contiguous, return -EINVAL
> + */
> +struct i915_context_engines_parallel_submit {
> +	/**
> +	 * @base: base user extension.
> +	 */
> +	struct i915_user_extension base;
> +
> +	/**
> +	 * @engine_index: slot for parallel engine
> +	 */
> +	__u16 engine_index;
> +
> +	/**
> +	 * @width: number of contexts per parallel engine
> +	 */
> +	__u16 width;
> +
> +	/**
> +	 * @num_siblings: number of siblings per context
> +	 */
> +	__u16 num_siblings;
> +
> +	/**
> +	 * @mbz16: reserved for future use; must be zero
> +	 */
> +	__u16 mbz16;
> +
> +	/**
> +	 * @flags: all undefined flags must be zero, currently not defined flags
> +	 */
> +	__u64 flags;
> +
> +	/**
> +	 * @mbz64: reserved for future use; must be zero
> +	 */
> +	__u64 mbz64[3];
> +
> +	/**
> +	 * @engines: 2-d array of engine instances to configure parallel engine
> +	 *
> +	 * length = width (i) * num_siblings (j)
> +	 * index = j + i * num_siblings
> +	 */
> +	struct i915_engine_class_instance engines[0];
> +
> +} __packed;
> +
> +#define I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(name__, N__) struct { \
> +	struct i915_user_extension base; \
> +	__u16 engine_index; \
> +	__u16 width; \
> +	__u16 num_siblings; \
> +	__u16 mbz16; \
> +	__u64 flags; \
> +	__u64 mbz64[3]; \
> +	struct i915_engine_class_instance engines[N__]; \
> +} __attribute__((packed)) name__
> +
>   struct i915_context_param_engines {
>   	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
>   #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
>   #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */
> +#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */
>   	struct i915_engine_class_instance engines[0];
>   } __attribute__((packed));
>   
> @@ -2726,14 +2854,20 @@ struct drm_i915_engine_info {
>   
>   	/** @flags: Engine flags. */
>   	__u64 flags;
> +#define I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE		(1 << 0)
>   
>   	/** @capabilities: Capabilities of this engine. */
>   	__u64 capabilities;
>   #define I915_VIDEO_CLASS_CAPABILITY_HEVC		(1 << 0)
>   #define I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC	(1 << 1)
>   
> +	/** @logical_instance: Logical instance of engine */
> +	__u16 logical_instance;
> +
>   	/** @rsvd1: Reserved fields. */
> -	__u64 rsvd1[4];
> +	__u16 rsvd1[3];
> +	/** @rsvd2: Reserved fields. */
> +	__u64 rsvd2[3];
>   };
>   
>   /**
> diff --git a/lib/intel_ctx.c b/lib/intel_ctx.c
> index f28c15544..11ec6fca4 100644
> --- a/lib/intel_ctx.c
> +++ b/lib/intel_ctx.c
> @@ -83,6 +83,7 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
>   {
>   	uint64_t ext_root = 0;
>   	I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(balance, GEM_MAX_ENGINES);
> +	I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(parallel, GEM_MAX_ENGINES);
>   	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, GEM_MAX_ENGINES);
>   	struct drm_i915_gem_context_create_ext_setparam engines_param, vm_param;
>   	struct drm_i915_gem_context_create_ext_setparam persist_param;
> @@ -117,7 +118,29 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
>   		unsigned num_logical_engines;
>   		memset(&engines, 0, sizeof(engines));
>   

Do we need an assert to make sure cfg->load_balance and cfg->parallel 
are not set at the same time?

> -		if (cfg->load_balance) {
> +		if (cfg->parallel) {
> +			memset(&parallel, 0, sizeof(parallel));
> +
> +			num_logical_engines = 1;
> +
> +			parallel.base.name =
> +				I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT;
> +
> +			engines.engines[0].engine_class =
> +				I915_ENGINE_CLASS_INVALID;
> +			engines.engines[0].engine_instance =
> +				I915_ENGINE_CLASS_INVALID_NONE;
> +
> +			parallel.num_siblings = cfg->num_engines;
> +			parallel.width = cfg->width;
> +			for (i = 0; i < cfg->num_engines * cfg->width; i++) {
> +				igt_assert_eq(cfg->engines[0].engine_class,
> +					      cfg->engines[i].engine_class);
> +				parallel.engines[i] = cfg->engines[i];
> +			}
> +
> +			engines.extensions = to_user_pointer(&parallel);
> +		} else if (cfg->load_balance) {
>   			memset(&balance, 0, sizeof(balance));
>   
>   			/* In this case, the first engine is the virtual
> @@ -127,6 +150,9 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
>   			igt_assert(cfg->num_engines + 1 <= GEM_MAX_ENGINES);
>   			num_logical_engines = cfg->num_engines + 1;
>   
> +			balance.base.name =
> +				I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
> +
>   			engines.engines[0].engine_class =
>   				I915_ENGINE_CLASS_INVALID;
>   			engines.engines[0].engine_instance =
> diff --git a/lib/intel_ctx.h b/lib/intel_ctx.h
> index 9649f6d96..89c65fcd3 100644
> --- a/lib/intel_ctx.h
> +++ b/lib/intel_ctx.h
> @@ -46,7 +46,9 @@ typedef struct intel_ctx_cfg {
>   	uint32_t vm;
>   	bool nopersist;
>   	bool load_balance;
> +	bool parallel;
>   	unsigned int num_engines;
> +	unsigned int width;

Given that width is only set when parallel is true, we could potentially 
have a single var (parallel_width?) and check for it being > 0 instead 
of checking the bool. Just a thought, not a blocker.

>   	struct i915_engine_class_instance engines[GEM_MAX_ENGINES];
>   } intel_ctx_cfg_t;
>   
> diff --git a/lib/intel_reg.h b/lib/intel_reg.h
> index c447525a0..44b0d480f 100644
> --- a/lib/intel_reg.h
> +++ b/lib/intel_reg.h
> @@ -2642,6 +2642,11 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
>   
>   #define STATE3D_COLOR_FACTOR	((0x3<<29)|(0x1d<<24)|(0x01<<16))
>   
> +/* Atomics */
> +#define MI_ATOMIC			((0x2f << 23) | 2)
> +#define   MI_ATOMIC_INLINE_DATA         (1 << 18)
> +#define   MI_ATOMIC_ADD                 (0x7 << 8)
> +
>   /* Batch */
>   #define MI_BATCH_BUFFER		((0x30 << 23) | 1)
>   #define MI_BATCH_BUFFER_START	(0x31 << 23)
> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> index e4e5cda4a..171295777 100644
> --- a/tests/i915/gem_exec_balancer.c
> +++ b/tests/i915/gem_exec_balancer.c
> @@ -25,6 +25,7 @@
>   #include <sched.h>
>   #include <sys/ioctl.h>
>   #include <sys/signal.h>
> +#include <poll.h>
>   
>   #include "i915/gem.h"
>   #include "i915/gem_create.h"
> @@ -56,6 +57,31 @@ static size_t sizeof_load_balance(int count)
>   
>   #define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, sz__); })
>   
> +static int
> +__i915_query(int fd, struct drm_i915_query *q)
> +{
> +	if (igt_ioctl(fd, DRM_IOCTL_I915_QUERY, q))
> +		return -errno;
> +
> +	return 0;
> +}
> +
> +static int
> +__i915_query_items(int fd, struct drm_i915_query_item *items, uint32_t n_items)
> +{
> +	struct drm_i915_query q = {
> +		.num_items = n_items,
> +		.items_ptr = to_user_pointer(items),
> +		};
> +
> +	return __i915_query(fd, &q);
> +}

Identical query helpers are implemented in a couple other places 
(lib/i915/intel_memory_region.c, tests/i915/i915_query.c), so I believe 
we have critical usage mass to move them to their own lib file.

> +
> +#define i915_query_items(fd, items, n_items) do { \
> +		igt_assert_eq(__i915_query_items(fd, items, n_items), 0); \
> +		errno = 0; \
> +	} while (0)
> +
>   static bool has_class_instance(int i915, uint16_t class, uint16_t instance)
>   {
>   	int fd;
> @@ -2752,6 +2778,380 @@ static void nohangcheck(int i915)
>   	close(params);
>   }
>   
> +static void check_bo(int i915, uint32_t handle, unsigned int count, bool wait)

s/count/expected? you're not using that variable as a count, just as a 
value to compare against

> +{
> +	uint32_t *map;
> +
> +	map = gem_mmap__cpu(i915, handle, 0, 4096, PROT_READ);
> +	if (wait)
> +		gem_set_domain(i915, handle, I915_GEM_DOMAIN_CPU,
> +			       I915_GEM_DOMAIN_CPU);
> +	igt_assert_eq(map[0], count);
> +	munmap(map, 4096);
> +}
> +
> +static struct drm_i915_query_engine_info *query_engine_info(int i915)
> +{
> +	struct drm_i915_query_engine_info *engines;
> +	struct drm_i915_query_item item;
> +
> +#define QUERY_SIZE	0x4000
> +	engines = malloc(QUERY_SIZE);
> +	igt_assert(engines);
> +
> +	memset(engines, 0, QUERY_SIZE);
> +	memset(&item, 0, sizeof(item));
> +	item.query_id = DRM_I915_QUERY_ENGINE_INFO;
> +	item.data_ptr = to_user_pointer(engines);
> +	item.length = QUERY_SIZE;
> +
> +	i915_query_items(i915, &item, 1);

There is a helper you can use for this query (__gem_query_engines)

> +	igt_assert(item.length >= 0);
> +	igt_assert(item.length <= QUERY_SIZE);
> +#undef QUERY_SIZE
> +
> +	return engines;
> +}
> +
> +/* This function only works if siblings contains all instances of a class */
> +static void logical_sort_siblings(int i915,
> +				  struct i915_engine_class_instance *siblings,
> +				  unsigned int count)
> +{
> +	struct i915_engine_class_instance *sorted;
> +	struct drm_i915_query_engine_info *engines;
> +	unsigned int i, j;
> +
> +	sorted = calloc(count, sizeof(*sorted));
> +	igt_assert(sorted);
> +
> +	engines = query_engine_info(i915);
> +
> +	for (j = 0; j < count; ++j) {
> +		for (i = 0; i < engines->num_engines; ++i) {
> +			if (siblings[j].engine_class ==
> +			    engines->engines[i].engine.engine_class &&
> +			    siblings[j].engine_instance ==
> +			    engines->engines[i].engine.engine_instance) {
> +				uint16_t logical_instance =
> +					engines->engines[i].logical_instance;
> +
> +				igt_assert(logical_instance < count);
> +				igt_assert(!sorted[logical_instance].engine_class);
> +				igt_assert(!sorted[logical_instance].engine_instance);
> +
> +				sorted[logical_instance] = siblings[j];
> +				break;
> +			}
> +		}
> +		igt_assert(i != engines->num_engines);
> +	}
> +
> +	memcpy(siblings, sorted, sizeof(*sorted) * count);
> +	free(sorted);
> +	free(engines);
> +}
> +
> +#define PARALLEL_BB_FIRST		(0x1 << 0)
> +#define PARALLEL_OUT_FENCE		(0x1 << 1)
> +#define PARALLEL_IN_FENCE		(0x1 << 2)
> +#define PARALLEL_SUBMIT_FENCE		(0x1 << 3)
> +#define PARALLEL_CONTEXTS		(0x1 << 4)
> +#define PARALLEL_VIRTUAL		(0x1 << 5)
> +
> +static void parallel_thread(int i915, unsigned int flags,
> +			    struct i915_engine_class_instance *siblings,
> +			    unsigned int count, unsigned int bb_per_execbuf)
> +{
> +	const intel_ctx_t *ctx = NULL;
> +	int n, i, j, fence = 0;
> +	uint32_t batch[16];
> +	struct drm_i915_gem_execbuffer2 execbuf;
> +	struct drm_i915_gem_exec_object2 obj[32];

Max num of objects is 32, do we need an assert that bb_per_execbuf <=31 
to leave room for the target BO? Or is that overkill since we likely 
won't have that many engines?

> +#define PARALLEL_BB_LOOP_COUNT	512
> +	const intel_ctx_t *ctxs[PARALLEL_BB_LOOP_COUNT];
> +	uint32_t target_bo_idx = 0;
> +	uint32_t first_bb_idx = 1;
> +	intel_ctx_cfg_t cfg;
> +
> +	if (flags & PARALLEL_BB_FIRST) {
> +		target_bo_idx = bb_per_execbuf;
> +		first_bb_idx = 0;
> +	}
> +
> +	memset(&cfg, 0, sizeof(cfg));
> +	if (flags & PARALLEL_VIRTUAL) {
> +		cfg.parallel = true;
> +		cfg.num_engines = count / bb_per_execbuf;

igt_assert (count >= bb_per_execbuf && count % bb_per_execbuf == 0) to 
make sure the provided values are fine?

> +		cfg.width = bb_per_execbuf;
> +
> +		for (i = 0; i < cfg.width; ++i)
> +			for (j = 0; j < cfg.num_engines; ++j)
> +				memcpy(cfg.engines + i * cfg.num_engines + j,
> +				       siblings + j * cfg.width + i,
> +				       sizeof(*siblings));
> +	} else {
> +		cfg.parallel = true;
> +		cfg.num_engines = 1;
> +		cfg.width = count;

Here the usage of count vs bb_per_execbuf gets a bit confusing. AFAICS 
using width = count here only works if  count = bb_per_execbuf , because 
in the loop below we only create bb_per_execbuf batches. Why not use 
bb_per_execbuf directly for consistency? That would also allow you to 
pull the base cfg out of the if statement here and just always use:

cfg.parallel = true;
cfg.num_engines = count / bb_per_execbuf;
cfg.width = bb_per_execbuf;

Because if count = bb_per_execbuf it resolves to the same values anyway.

> +		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
> +	}
> +	ctx = intel_ctx_create(i915, &cfg);
> +
> +	i = 0;
> +	batch[i] = MI_ATOMIC | MI_ATOMIC_INLINE_DATA |
> +		MI_ATOMIC_ADD;
> +#define TARGET_BO_OFFSET	(0x1 << 16)
> +	batch[++i] = TARGET_BO_OFFSET;
> +	batch[++i] = 0;
> +	batch[++i] = 1;
> +	batch[++i] = MI_BATCH_BUFFER_END;
> +
> +	memset(obj, 0, sizeof(obj));
> +	obj[target_bo_idx].offset = TARGET_BO_OFFSET;
> +	obj[target_bo_idx].flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE;
> +	obj[target_bo_idx].handle = gem_create(i915, 4096);
> +
> +	for (i = first_bb_idx; i < bb_per_execbuf + first_bb_idx; ++i) {
> +		obj[i].handle = gem_create(i915, 4096);
> +		gem_write(i915, obj[i].handle, 0, batch,
> +			  sizeof(batch));
> +	}
> +
> +	memset(&execbuf, 0, sizeof(execbuf));
> +	execbuf.buffers_ptr = to_user_pointer(obj);
> +	execbuf.buffer_count = bb_per_execbuf + 1;
> +	execbuf.flags |= I915_EXEC_HANDLE_LUT;
> +	if (flags & PARALLEL_BB_FIRST)
> +		execbuf.flags |= I915_EXEC_BATCH_FIRST;
> +	if (flags & PARALLEL_OUT_FENCE)
> +		execbuf.flags |= I915_EXEC_FENCE_OUT;
> +	execbuf.buffers_ptr = to_user_pointer(obj);
> +	execbuf.rsvd1 = ctx->id;
> +
> +	for (n = 0; n < PARALLEL_BB_LOOP_COUNT; ++n) {
> +		for (i = 0; i < count / bb_per_execbuf; ++i ) {

As discussed offline, this internal loop doesn't do anything (we only 
ever cycle once) and should be removed.

> +			execbuf.flags &= ~0x3full;
> +			execbuf.flags |= i;
> +			gem_execbuf_wr(i915, &execbuf);
> +
> +			if (flags & PARALLEL_OUT_FENCE) {
> +				igt_assert_eq(sync_fence_wait(execbuf.rsvd2 >> 32,
> +							      1000), 0);
> +				igt_assert_eq(sync_fence_status(execbuf.rsvd2 >> 32), 1);
> +
> +				if (fence)
> +					close(fence);
> +				fence = execbuf.rsvd2 >> 32;
> +
> +				if (flags & PARALLEL_SUBMIT_FENCE) {
> +					execbuf.flags |=
> +						I915_EXEC_FENCE_SUBMIT;
> +					execbuf.rsvd2 >>= 32;
> +				} else if (flags &  PARALLEL_IN_FENCE) {
> +					execbuf.flags |=
> +						I915_EXEC_FENCE_IN;
> +					execbuf.rsvd2 >>= 32;
> +				} else {
> +					execbuf.rsvd2 = 0;
> +				}
> +			}
> +
> +			if (flags & PARALLEL_VIRTUAL)
> +				break;
> +		}
> +
> +		if (flags & PARALLEL_CONTEXTS) {
> +			ctxs[n] = ctx;
> +			ctx = intel_ctx_create(i915, &cfg);
> +			execbuf.rsvd1 = ctx->id;
> +		}
> +	}
> +	if (fence)
> +		close(fence);
> +
> +	check_bo(i915, obj[target_bo_idx].handle, flags & PARALLEL_VIRTUAL ?
> +		 bb_per_execbuf * PARALLEL_BB_LOOP_COUNT :
> +		 count * PARALLEL_BB_LOOP_COUNT, true);

same as above, can just use bb_per_execbuf unconditionally here

> +
> +	intel_ctx_destroy(i915, ctx);
> +	for (i = 0; flags & PARALLEL_CONTEXTS &&
> +	     i < PARALLEL_BB_LOOP_COUNT; ++i) {
> +		intel_ctx_destroy(i915, ctxs[i]);
> +	}
> +	for (i = 0; i < bb_per_execbuf + 1; ++i)
> +		gem_close(i915, obj[i].handle);
> +}
> +
> +static void parallel(int i915, unsigned int flags)
> +{
> +	for (int class = 0; class < 32; class++) {

I think we usually avoid declaring variables inside the for loop 
statements, even if the recent C standards allow it, but I'm not sure if 
we have an official style in this regard. There are multiple instances 
of this in this file.

> +		struct i915_engine_class_instance *siblings;
> +		unsigned int count, bb_per_execbuf;
> +
> +		siblings = list_engines(i915, 1u << class, &count);
> +		if (!siblings)
> +			continue;
> +
> +		if (count < 2) {
> +			free(siblings);
> +			continue;
> +		}
> +
> +		logical_sort_siblings(i915, siblings, count);
> +		bb_per_execbuf = count;
> +
> +		parallel_thread(i915, flags, siblings,
> +				count, bb_per_execbuf);
> +
> +		free(siblings);
> +	}
> +}
> +
> +static void parallel_balancer(int i915, unsigned int flags)
> +{
> +	for (int class = 0; class < 32; class++) {
> +		struct i915_engine_class_instance *siblings;
> +		unsigned int count;
> +
> +		siblings = list_engines(i915, 1u << class, &count);
> +		if (!siblings)
> +			continue;
> +
> +		if (count < 4) {
> +			free(siblings);
> +			continue;
> +		}
> +
> +		logical_sort_siblings(i915, siblings, count);
> +
> +		for (unsigned int bb_per_execbuf = 2;;) {
> +			igt_fork(child, count / bb_per_execbuf)
> +				parallel_thread(i915,
> +						flags | PARALLEL_VIRTUAL,
> +						siblings,
> +						count,
> +						bb_per_execbuf);

As a possible future improvement IMO it'd be nice to check that 2 
parallel VEs are deployed to the HW at the same time. The test will 
currently pass even if they are serialized. Not a blocker.

> +			igt_waitchildren();
> +
> +			if (count / ++bb_per_execbuf <= 1)
> +				break;

bikeshed: why not just put this in the if statement?

for (bb = 2; count / bb > 1 ; ++bb)

not a blocker.

> +		}
> +
> +		free(siblings);
> +	}
> +}
> +
> +static bool fence_busy(int fence)
> +{
> +	return poll(&(struct pollfd){fence, POLLIN}, 1, 0) == 0;
> +}
> +
> +static void parallel_ordering(int i915, unsigned int flags)

A one-line comment about the test to describe what it's doing would help 
IMO.

> +{
> +	for (int class = 0; class < 32; class++) {
> +		const intel_ctx_t *ctx = NULL, *spin_ctx = NULL;
> +		struct i915_engine_class_instance *siblings;
> +		unsigned int count;
> +		int i = 0, fence = 0;
> +		uint32_t batch[16];
> +		struct drm_i915_gem_execbuffer2 execbuf;
> +		struct drm_i915_gem_exec_object2 obj[32];
> +		igt_spin_t *spin;
> +		intel_ctx_cfg_t cfg;
> +
> +		siblings = list_engines(i915, 1u << class, &count);
> +		if (!siblings)
> +			continue;
> +
> +		if (count < 2) {
> +			free(siblings);
> +			continue;
> +		}
> +
> +		logical_sort_siblings(i915, siblings, count);
> +
> +		memset(&cfg, 0, sizeof(cfg));
> +		cfg.parallel = true;
> +		cfg.num_engines = 1;
> +		cfg.width = count;
> +		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
> +
> +		ctx = intel_ctx_create(i915, &cfg);
> +
> +		batch[i] = MI_ATOMIC | MI_ATOMIC_INLINE_DATA |
> +			MI_ATOMIC_ADD;
> +		batch[++i] = TARGET_BO_OFFSET;
> +		batch[++i] = 0;
> +		batch[++i] = 1;
> +		batch[++i] = MI_BATCH_BUFFER_END;
> +
> +		memset(obj, 0, sizeof(obj));
> +		obj[0].offset = TARGET_BO_OFFSET;
> +		obj[0].flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE;
> +		obj[0].handle = gem_create(i915, 4096);
> +
> +		for (i = 1; i < count + 1; ++i) {
> +			obj[i].handle = gem_create(i915, 4096);
> +			gem_write(i915, obj[i].handle, 0, batch,
> +				  sizeof(batch));
> +		}

The object setup code here is identical to the one in parallel_thread(), 
maybe move it to a common function?

> +
> +		memset(&execbuf, 0, sizeof(execbuf));
> +		execbuf.buffers_ptr = to_user_pointer(obj);
> +		execbuf.buffer_count = count + 1;
> +		execbuf.flags |= I915_EXEC_HANDLE_LUT;
> +		execbuf.flags |= I915_EXEC_NO_RELOC;
> +		execbuf.flags |= I915_EXEC_FENCE_OUT;
> +		execbuf.buffers_ptr = to_user_pointer(obj);
> +		execbuf.rsvd1 = ctx->id;
> +
> +		/* Block parallel submission */
> +		spin_ctx = ctx_create_engines(i915, siblings, count);
> +		spin = __igt_spin_new(i915,
> +				      .ctx = spin_ctx,
> +				      .engine = 0,
> +				      .flags = IGT_SPIN_FENCE_OUT |
> +				      IGT_SPIN_NO_PREEMPTION);
> +
> +		/* Wait for spinners to start */
> +		usleep(5 * 10000);
> +		igt_assert(fence_busy(spin->out_fence));
> +
> +		/* Submit parallel execbuf */
> +		gem_execbuf_wr(i915, &execbuf);
> +		fence = execbuf.rsvd2 >> 32;
> +
> +		/*
> +		 * Wait long enough for timeslcing to kick in but not
> +		 * preemption. Spinner + parallel execbuf should be
> +		 * active.
> +		 */
> +		usleep(25 * 10000);

This is a pretty arbitrary number, what if the system has been set up 
with a longer timeslicing period (or none at all) and/or a shorter 
preemption timeout? IMO you should read those out of sysfs and tune the 
waits accordingly

Daniele

> +		igt_assert(fence_busy(spin->out_fence));
> +		igt_assert(fence_busy(fence));
> +		check_bo(i915, obj[0].handle, 0, false);
> +
> +		/*
> +		 * End spinner and wait for spinner + parallel execbuf
> +		 * to compelte.
> +		 */
> +		igt_spin_end(spin);
> +		igt_assert_eq(sync_fence_wait(fence, 1000), 0);
> +		igt_assert_eq(sync_fence_status(fence), 1);
> +		check_bo(i915, obj[0].handle, count, true);
> +		close(fence);
> +
> +		/* Clean up */
> +		intel_ctx_destroy(i915, ctx);
> +		intel_ctx_destroy(i915, spin_ctx);
> +		for (i = 0; i < count + 1; ++i)
> +			gem_close(i915, obj[i].handle);
> +		free(siblings);
> +		igt_spin_free(i915, spin);
> +	}
> +}
> +
>   static bool has_persistence(int i915)
>   {
>   	struct drm_i915_gem_context_param p = {
> @@ -2786,6 +3186,61 @@ static bool has_load_balancer(int i915)
>   	return err == 0;
>   }
>   
> +static bool has_logical_mapping(int i915)
> +{
> +	struct drm_i915_query_engine_info *engines;
> +	unsigned int i;
> +
> +	engines = query_engine_info(i915);
> +
> +	for (i = 0; i < engines->num_engines; ++i)
> +		if (!(engines->engines[i].flags &
> +		     I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE)) {
> +			free(engines);
> +			return false;
> +		}
> +
> +	free(engines);
> +	return true;
> +}
> +
> +static bool has_parallel_execbuf(int i915)
> +{
> +	intel_ctx_cfg_t cfg = {
> +		.parallel = true,
> +		.num_engines = 1,
> +	};
> +	const intel_ctx_t *ctx = NULL;
> +	int err;
> +
> +	for (int class = 0; class < 32; class++) {
> +		struct i915_engine_class_instance *siblings;
> +		unsigned int count;
> +
> +		siblings = list_engines(i915, 1u << class, &count);
> +		if (!siblings)
> +			continue;
> +
> +		if (count < 2) {
> +			free(siblings);
> +			continue;
> +		}
> +
> +		logical_sort_siblings(i915, siblings, count);
> +
> +		cfg.width = count;
> +		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
> +		free(siblings);
> +
> +		err = __intel_ctx_create(i915, &cfg, &ctx);
> +		intel_ctx_destroy(i915, ctx);
> +
> +		return err == 0;
> +	}
> +
> +	return false;
> +}
> +
>   igt_main
>   {
>   	int i915 = -1;
> @@ -2886,6 +3341,38 @@ igt_main
>   		igt_stop_hang_detector();
>   	}
>   
> +	igt_subtest_group {
> +		igt_fixture {
> +			igt_require(has_logical_mapping(i915));
> +			igt_require(has_parallel_execbuf(i915));
> +		}
> +
> +		igt_subtest("parallel-ordering")
> +			parallel_ordering(i915, 0);
> +
> +		igt_subtest("parallel")
> +			parallel(i915, 0);
> +
> +		igt_subtest("parallel-bb-first")
> +			parallel(i915, PARALLEL_BB_FIRST);
> +
> +		igt_subtest("parallel-out-fence")
> +			parallel(i915, PARALLEL_OUT_FENCE);
> +
> +		igt_subtest("parallel-keep-in-fence")
> +			parallel(i915, PARALLEL_OUT_FENCE | PARALLEL_IN_FENCE);
> +
> +		igt_subtest("parallel-keep-submit-fence")
> +			parallel(i915, PARALLEL_OUT_FENCE |
> +				 PARALLEL_SUBMIT_FENCE);
> +
> +		igt_subtest("parallel-contexts")
> +			parallel(i915, PARALLEL_CONTEXTS);
> +
> +		igt_subtest("parallel-balancer")
> +			parallel_balancer(i915, 0);
> +	}
> +
>   	igt_subtest_group {
>   		igt_hang_t  hang;
>   




^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [igt-dev] [PATCH] i915/gem_exec_balancer: Test parallel execbuf
  2021-11-02 18:34 ` [igt-dev] [PATCH] " Daniele Ceraolo Spurio
@ 2021-11-02 21:55   ` Matthew Brost
  0 siblings, 0 replies; 5+ messages in thread
From: Matthew Brost @ 2021-11-02 21:55 UTC (permalink / raw)
  To: Daniele Ceraolo Spurio; +Cc: igt-dev

On Tue, Nov 02, 2021 at 11:34:08AM -0700, Daniele Ceraolo Spurio wrote:
> 
> 
> On 10/21/2021 5:18 PM, Matthew Brost wrote:
> > Add basic parallel execbuf submission test which more or less just
> > submits the same BB in loop a which does an atomic increment to a memory
> > location. The memory location is checked at the end for the correct
> > value. Different sections use various IOCTL options (e.g. fences,
> > location of BBs, etc...).
> > 
> > In addition to above sections, an additional section ensure the ordering
> > of parallel submission by submitting a spinning batch to 1 individual
> > engine, submit a parallel execbuf to all engines instances within the
> > class, verify none on parallel execbuf make to hardware, release
> > spinner, and finally verify everything has completed.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   include/drm-uapi/i915_drm.h    | 136 ++++++++-
> >   lib/intel_ctx.c                |  28 +-
> >   lib/intel_ctx.h                |   2 +
> >   lib/intel_reg.h                |   5 +
> >   tests/i915/gem_exec_balancer.c | 487 +++++++++++++++++++++++++++++++++
> >   5 files changed, 656 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/drm-uapi/i915_drm.h b/include/drm-uapi/i915_drm.h
> > index c788a1ab4..b57f52623 100644
> > --- a/include/drm-uapi/i915_drm.h
> > +++ b/include/drm-uapi/i915_drm.h
> 
> The uapi file needs to be in sync with drm-next. If the changes have already
> reached drm-next then we should just have a separate patch doing the file
> sync, otherwise these defs must move to lib/i915/i915_drm_local.h
> 

I think the uAPI changes have landed in drm-next. I'm going to guess the
DIM script can sync header files for me. Let me look into this. 


> > @@ -1824,6 +1824,7 @@ struct drm_i915_gem_context_param {
> >    * Extensions:
> >    *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
> >    *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
> > + *   i915_context_engines_parallel_submit (I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT)
> >    */
> >   #define I915_CONTEXT_PARAM_ENGINES	0xa
> > @@ -2104,10 +2105,137 @@ struct i915_context_engines_bond {
> >    * 	gem_execbuf(drm_fd, &execbuf);
> >    */
> > +/**
> > + * struct i915_context_engines_parallel_submit - Configure engine for
> > + * parallel submission.
> > + *
> > + * Setup a slot in the context engine map to allow multiple BBs to be submitted
> > + * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the GPU
> > + * in parallel. Multiple hardware contexts are created internally in the i915
> > + * run these BBs. Once a slot is configured for N BBs only N BBs can be
> > + * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user
> > + * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL knows how
> > + * many BBs there are based on the slot's configuration. The N BBs are the last
> > + * N buffer objects or first N if I915_EXEC_BATCH_FIRST is set.
> > + *
> > + * The default placement behavior is to create implicit bonds between each
> > + * context if each context maps to more than 1 physical engine (e.g. context is
> > + * a virtual engine). Also we only allow contexts of same engine class and these
> > + * contexts must be in logically contiguous order. Examples of the placement
> > + * behavior described below. Lastly, the default is to not allow BBs to
> > + * preempted mid BB rather insert coordinated preemption on all hardware
> > + * contexts between each set of BBs. Flags may be added in the future to change
> > + * both of these default behaviors.
> > + *
> > + * Returns -EINVAL if hardware context placement configuration is invalid or if
> > + * the placement configuration isn't supported on the platform / submission
> > + * interface.
> > + * Returns -ENODEV if extension isn't supported on the platform / submission
> > + * interface.
> > + *
> > + * .. code-block:: none
> > + *
> > + *	Example 1 pseudo code:
> > + *	CS[X] = generic engine of same class, logical instance X
> > + *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
> > + *	set_engines(INVALID)
> > + *	set_parallel(engine_index=0, width=2, num_siblings=1,
> > + *		     engines=CS[0],CS[1])
> > + *
> > + *	Results in the following valid placement:
> > + *	CS[0], CS[1]
> > + *
> > + *	Example 2 pseudo code:
> > + *	CS[X] = generic engine of same class, logical instance X
> > + *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
> > + *	set_engines(INVALID)
> > + *	set_parallel(engine_index=0, width=2, num_siblings=2,
> > + *		     engines=CS[0],CS[2],CS[1],CS[3])
> > + *
> > + *	Results in the following valid placements:
> > + *	CS[0], CS[1]
> > + *	CS[2], CS[3]
> > + *
> > + *	This can also be thought of as 2 virtual engines described by 2-D array
> > + *	in the engines the field with bonds placed between each index of the
> > + *	virtual engines. e.g. CS[0] is bonded to CS[1], CS[2] is bonded to
> > + *	CS[3].
> > + *	VE[0] = CS[0], CS[2]
> > + *	VE[1] = CS[1], CS[3]
> > + *
> > + *	Example 3 pseudo code:
> > + *	CS[X] = generic engine of same class, logical instance X
> > + *	INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
> > + *	set_engines(INVALID)
> > + *	set_parallel(engine_index=0, width=2, num_siblings=2,
> > + *		     engines=CS[0],CS[1],CS[1],CS[3])
> > + *
> > + *	Results in the following valid and invalid placements:
> > + *	CS[0], CS[1]
> > + *	CS[1], CS[3] - Not logical contiguous, return -EINVAL
> > + */
> > +struct i915_context_engines_parallel_submit {
> > +	/**
> > +	 * @base: base user extension.
> > +	 */
> > +	struct i915_user_extension base;
> > +
> > +	/**
> > +	 * @engine_index: slot for parallel engine
> > +	 */
> > +	__u16 engine_index;
> > +
> > +	/**
> > +	 * @width: number of contexts per parallel engine
> > +	 */
> > +	__u16 width;
> > +
> > +	/**
> > +	 * @num_siblings: number of siblings per context
> > +	 */
> > +	__u16 num_siblings;
> > +
> > +	/**
> > +	 * @mbz16: reserved for future use; must be zero
> > +	 */
> > +	__u16 mbz16;
> > +
> > +	/**
> > +	 * @flags: all undefined flags must be zero, currently not defined flags
> > +	 */
> > +	__u64 flags;
> > +
> > +	/**
> > +	 * @mbz64: reserved for future use; must be zero
> > +	 */
> > +	__u64 mbz64[3];
> > +
> > +	/**
> > +	 * @engines: 2-d array of engine instances to configure parallel engine
> > +	 *
> > +	 * length = width (i) * num_siblings (j)
> > +	 * index = j + i * num_siblings
> > +	 */
> > +	struct i915_engine_class_instance engines[0];
> > +
> > +} __packed;
> > +
> > +#define I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(name__, N__) struct { \
> > +	struct i915_user_extension base; \
> > +	__u16 engine_index; \
> > +	__u16 width; \
> > +	__u16 num_siblings; \
> > +	__u16 mbz16; \
> > +	__u64 flags; \
> > +	__u64 mbz64[3]; \
> > +	struct i915_engine_class_instance engines[N__]; \
> > +} __attribute__((packed)) name__
> > +
> >   struct i915_context_param_engines {
> >   	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
> >   #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
> >   #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */
> > +#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */
> >   	struct i915_engine_class_instance engines[0];
> >   } __attribute__((packed));
> > @@ -2726,14 +2854,20 @@ struct drm_i915_engine_info {
> >   	/** @flags: Engine flags. */
> >   	__u64 flags;
> > +#define I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE		(1 << 0)
> >   	/** @capabilities: Capabilities of this engine. */
> >   	__u64 capabilities;
> >   #define I915_VIDEO_CLASS_CAPABILITY_HEVC		(1 << 0)
> >   #define I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC	(1 << 1)
> > +	/** @logical_instance: Logical instance of engine */
> > +	__u16 logical_instance;
> > +
> >   	/** @rsvd1: Reserved fields. */
> > -	__u64 rsvd1[4];
> > +	__u16 rsvd1[3];
> > +	/** @rsvd2: Reserved fields. */
> > +	__u64 rsvd2[3];
> >   };
> >   /**
> > diff --git a/lib/intel_ctx.c b/lib/intel_ctx.c
> > index f28c15544..11ec6fca4 100644
> > --- a/lib/intel_ctx.c
> > +++ b/lib/intel_ctx.c
> > @@ -83,6 +83,7 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
> >   {
> >   	uint64_t ext_root = 0;
> >   	I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(balance, GEM_MAX_ENGINES);
> > +	I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(parallel, GEM_MAX_ENGINES);
> >   	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, GEM_MAX_ENGINES);
> >   	struct drm_i915_gem_context_create_ext_setparam engines_param, vm_param;
> >   	struct drm_i915_gem_context_create_ext_setparam persist_param;
> > @@ -117,7 +118,29 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
> >   		unsigned num_logical_engines;
> >   		memset(&engines, 0, sizeof(engines));
> 
> Do we need an assert to make sure cfg->load_balance and cfg->parallel are
> not set at the same time?
> 

Do we need to - no. Would it be a good idea? Yes. Will do.

> > -		if (cfg->load_balance) {
> > +		if (cfg->parallel) {
> > +			memset(&parallel, 0, sizeof(parallel));
> > +
> > +			num_logical_engines = 1;
> > +
> > +			parallel.base.name =
> > +				I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT;
> > +
> > +			engines.engines[0].engine_class =
> > +				I915_ENGINE_CLASS_INVALID;
> > +			engines.engines[0].engine_instance =
> > +				I915_ENGINE_CLASS_INVALID_NONE;
> > +
> > +			parallel.num_siblings = cfg->num_engines;
> > +			parallel.width = cfg->width;
> > +			for (i = 0; i < cfg->num_engines * cfg->width; i++) {
> > +				igt_assert_eq(cfg->engines[0].engine_class,
> > +					      cfg->engines[i].engine_class);
> > +				parallel.engines[i] = cfg->engines[i];
> > +			}
> > +
> > +			engines.extensions = to_user_pointer(&parallel);
> > +		} else if (cfg->load_balance) {
> >   			memset(&balance, 0, sizeof(balance));
> >   			/* In this case, the first engine is the virtual
> > @@ -127,6 +150,9 @@ __context_create_cfg(int fd, const intel_ctx_cfg_t *cfg, uint32_t *ctx_id)
> >   			igt_assert(cfg->num_engines + 1 <= GEM_MAX_ENGINES);
> >   			num_logical_engines = cfg->num_engines + 1;
> > +			balance.base.name =
> > +				I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
> > +
> >   			engines.engines[0].engine_class =
> >   				I915_ENGINE_CLASS_INVALID;
> >   			engines.engines[0].engine_instance =
> > diff --git a/lib/intel_ctx.h b/lib/intel_ctx.h
> > index 9649f6d96..89c65fcd3 100644
> > --- a/lib/intel_ctx.h
> > +++ b/lib/intel_ctx.h
> > @@ -46,7 +46,9 @@ typedef struct intel_ctx_cfg {
> >   	uint32_t vm;
> >   	bool nopersist;
> >   	bool load_balance;
> > +	bool parallel;
> >   	unsigned int num_engines;
> > +	unsigned int width;
> 
> Given that width is only set when parallel is true, we could potentially
> have a single var (parallel_width?) and check for it being > 0 instead of
> checking the bool. Just a thought, not a blocker.
> 

The precedent seems to be to just use a bool; e.g. nopersist and load_balance
are already in place.

> >   	struct i915_engine_class_instance engines[GEM_MAX_ENGINES];
> >   } intel_ctx_cfg_t;
> > diff --git a/lib/intel_reg.h b/lib/intel_reg.h
> > index c447525a0..44b0d480f 100644
> > --- a/lib/intel_reg.h
> > +++ b/lib/intel_reg.h
> > @@ -2642,6 +2642,11 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> >   #define STATE3D_COLOR_FACTOR	((0x3<<29)|(0x1d<<24)|(0x01<<16))
> > +/* Atomics */
> > +#define MI_ATOMIC			((0x2f << 23) | 2)
> > +#define   MI_ATOMIC_INLINE_DATA         (1 << 18)
> > +#define   MI_ATOMIC_ADD                 (0x7 << 8)
> > +
> >   /* Batch */
> >   #define MI_BATCH_BUFFER		((0x30 << 23) | 1)
> >   #define MI_BATCH_BUFFER_START	(0x31 << 23)
> > diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> > index e4e5cda4a..171295777 100644
> > --- a/tests/i915/gem_exec_balancer.c
> > +++ b/tests/i915/gem_exec_balancer.c
> > @@ -25,6 +25,7 @@
> >   #include <sched.h>
> >   #include <sys/ioctl.h>
> >   #include <sys/signal.h>
> > +#include <poll.h>
> >   #include "i915/gem.h"
> >   #include "i915/gem_create.h"
> > @@ -56,6 +57,31 @@ static size_t sizeof_load_balance(int count)
> >   #define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, sz__); })
> > +static int
> > +__i915_query(int fd, struct drm_i915_query *q)
> > +{
> > +	if (igt_ioctl(fd, DRM_IOCTL_I915_QUERY, q))
> > +		return -errno;
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +__i915_query_items(int fd, struct drm_i915_query_item *items, uint32_t n_items)
> > +{
> > +	struct drm_i915_query q = {
> > +		.num_items = n_items,
> > +		.items_ptr = to_user_pointer(items),
> > +		};
> > +
> > +	return __i915_query(fd, &q);
> > +}
> 
> Identical query helpers are implemented in a couple other places
> (lib/i915/intel_memory_region.c, tests/i915/i915_query.c), so I believe we
> have critical usage mass to move them to their own lib file.
> 

Sure. That might be a follow up or just another patch in the series.

> > +
> > +#define i915_query_items(fd, items, n_items) do { \
> > +		igt_assert_eq(__i915_query_items(fd, items, n_items), 0); \
> > +		errno = 0; \
> > +	} while (0)
> > +
> >   static bool has_class_instance(int i915, uint16_t class, uint16_t instance)
> >   {
> >   	int fd;
> > @@ -2752,6 +2778,380 @@ static void nohangcheck(int i915)
> >   	close(params);
> >   }
> > +static void check_bo(int i915, uint32_t handle, unsigned int count, bool wait)
> 
> s/count/expected? you're not using that variable as a count, just as a value
> to compare against
>

expected is a better name. Will change.
 
> > +{
> > +	uint32_t *map;
> > +
> > +	map = gem_mmap__cpu(i915, handle, 0, 4096, PROT_READ);
> > +	if (wait)
> > +		gem_set_domain(i915, handle, I915_GEM_DOMAIN_CPU,
> > +			       I915_GEM_DOMAIN_CPU);
> > +	igt_assert_eq(map[0], count);
> > +	munmap(map, 4096);
> > +}
> > +
> > +static struct drm_i915_query_engine_info *query_engine_info(int i915)
> > +{
> > +	struct drm_i915_query_engine_info *engines;
> > +	struct drm_i915_query_item item;
> > +
> > +#define QUERY_SIZE	0x4000
> > +	engines = malloc(QUERY_SIZE);
> > +	igt_assert(engines);
> > +
> > +	memset(engines, 0, QUERY_SIZE);
> > +	memset(&item, 0, sizeof(item));
> > +	item.query_id = DRM_I915_QUERY_ENGINE_INFO;
> > +	item.data_ptr = to_user_pointer(engines);
> > +	item.length = QUERY_SIZE;
> > +
> > +	i915_query_items(i915, &item, 1);
> 
> There is an helper you can use for this query (__gem_query_engines)
> 

Yes, I think that will work.

> > +	igt_assert(item.length >= 0);
> > +	igt_assert(item.length <= QUERY_SIZE);
> > +#undef QUERY_SIZE
> > +
> > +	return engines;
> > +}
> > +
> > +/* This function only works if siblings contains all instances of a class */
> > +static void logical_sort_siblings(int i915,
> > +				  struct i915_engine_class_instance *siblings,
> > +				  unsigned int count)
> > +{
> > +	struct i915_engine_class_instance *sorted;
> > +	struct drm_i915_query_engine_info *engines;
> > +	unsigned int i, j;
> > +
> > +	sorted = calloc(count, sizeof(*sorted));
> > +	igt_assert(sorted);
> > +
> > +	engines = query_engine_info(i915);
> > +
> > +	for (j = 0; j < count; ++j) {
> > +		for (i = 0; i < engines->num_engines; ++i) {
> > +			if (siblings[j].engine_class ==
> > +			    engines->engines[i].engine.engine_class &&
> > +			    siblings[j].engine_instance ==
> > +			    engines->engines[i].engine.engine_instance) {
> > +				uint16_t logical_instance =
> > +					engines->engines[i].logical_instance;
> > +
> > +				igt_assert(logical_instance < count);
> > +				igt_assert(!sorted[logical_instance].engine_class);
> > +				igt_assert(!sorted[logical_instance].engine_instance);
> > +
> > +				sorted[logical_instance] = siblings[j];
> > +				break;
> > +			}
> > +		}
> > +		igt_assert(i != engines->num_engines);
> > +	}
> > +
> > +	memcpy(siblings, sorted, sizeof(*sorted) * count);
> > +	free(sorted);
> > +	free(engines);
> > +}
> > +
> > +#define PARALLEL_BB_FIRST		(0x1 << 0)
> > +#define PARALLEL_OUT_FENCE		(0x1 << 1)
> > +#define PARALLEL_IN_FENCE		(0x1 << 2)
> > +#define PARALLEL_SUBMIT_FENCE		(0x1 << 3)
> > +#define PARALLEL_CONTEXTS		(0x1 << 4)
> > +#define PARALLEL_VIRTUAL		(0x1 << 5)
> > +
> > +static void parallel_thread(int i915, unsigned int flags,
> > +			    struct i915_engine_class_instance *siblings,
> > +			    unsigned int count, unsigned int bb_per_execbuf)
> > +{
> > +	const intel_ctx_t *ctx = NULL;
> > +	int n, i, j, fence = 0;
> > +	uint32_t batch[16];
> > +	struct drm_i915_gem_execbuffer2 execbuf;
> > +	struct drm_i915_gem_exec_object2 obj[32];
> 
> Max num of objects is 32, do we need an assert that bb_per_execbuf <=31 to
> leave room for the target BO? Or is that overkill since we likely won't have
> that many engines?
>

It can't hurt to future proof the code.
 
> > +#define PARALLEL_BB_LOOP_COUNT	512
> > +	const intel_ctx_t *ctxs[PARALLEL_BB_LOOP_COUNT];
> > +	uint32_t target_bo_idx = 0;
> > +	uint32_t first_bb_idx = 1;
> > +	intel_ctx_cfg_t cfg;
> > +
> > +	if (flags & PARALLEL_BB_FIRST) {
> > +		target_bo_idx = bb_per_execbuf;
> > +		first_bb_idx = 0;
> > +	}
> > +
> > +	memset(&cfg, 0, sizeof(cfg));
> > +	if (flags & PARALLEL_VIRTUAL) {
> > +		cfg.parallel = true;
> > +		cfg.num_engines = count / bb_per_execbuf;
> 
> igt_assert (count >= bb_per_execbuf && count % bb_per_execbuf == 0) to make
> sure the provided values are fine?
>

Sure.
 
> > +		cfg.width = bb_per_execbuf;
> > +
> > +		for (i = 0; i < cfg.width; ++i)
> > +			for (j = 0; j < cfg.num_engines; ++j)
> > +				memcpy(cfg.engines + i * cfg.num_engines + j,
> > +				       siblings + j * cfg.width + i,
> > +				       sizeof(*siblings));
> > +	} else {
> > +		cfg.parallel = true;
> > +		cfg.num_engines = 1;
> > +		cfg.width = count;
> 
> Here the usage of count vs bb_per_execbuf gets a bit counfusing. AFAICS
> using width = count here only works if  count = bb_per_execbuf , because in
> the loop below we only create bb_per_execbuf batches. Why not use
> bb_per_execbuf directly for consistency? That would also allow you to pull
> the base cfg out of the if statement here and just always use:
> 
> cfg.parallel = true;
> cfg.num_engines = count / bb_per_execbuf;
> cfg.width = bb_per_execbuf;
> 
> Because if count = bb_per_execbuf it resolves to the same values anyway.
>

I think that works.
 
> > +		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
> > +	}
> > +	ctx = intel_ctx_create(i915, &cfg);
> > +
> > +	i = 0;
> > +	batch[i] = MI_ATOMIC | MI_ATOMIC_INLINE_DATA |
> > +		MI_ATOMIC_ADD;
> > +#define TARGET_BO_OFFSET	(0x1 << 16)
> > +	batch[++i] = TARGET_BO_OFFSET;
> > +	batch[++i] = 0;
> > +	batch[++i] = 1;
> > +	batch[++i] = MI_BATCH_BUFFER_END;
> > +
> > +	memset(obj, 0, sizeof(obj));
> > +	obj[target_bo_idx].offset = TARGET_BO_OFFSET;
> > +	obj[target_bo_idx].flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE;
> > +	obj[target_bo_idx].handle = gem_create(i915, 4096);
> > +
> > +	for (i = first_bb_idx; i < bb_per_execbuf + first_bb_idx; ++i) {
> > +		obj[i].handle = gem_create(i915, 4096);
> > +		gem_write(i915, obj[i].handle, 0, batch,
> > +			  sizeof(batch));
> > +	}
> > +
> > +	memset(&execbuf, 0, sizeof(execbuf));
> > +	execbuf.buffers_ptr = to_user_pointer(obj);
> > +	execbuf.buffer_count = bb_per_execbuf + 1;
> > +	execbuf.flags |= I915_EXEC_HANDLE_LUT;
> > +	if (flags & PARALLEL_BB_FIRST)
> > +		execbuf.flags |= I915_EXEC_BATCH_FIRST;
> > +	if (flags & PARALLEL_OUT_FENCE)
> > +		execbuf.flags |= I915_EXEC_FENCE_OUT;
> > +	execbuf.buffers_ptr = to_user_pointer(obj);
> > +	execbuf.rsvd1 = ctx->id;
> > +
> > +	for (n = 0; n < PARALLEL_BB_LOOP_COUNT; ++n) {
> > +		for (i = 0; i < count / bb_per_execbuf; ++i ) {
> 
> As discussed offline, this internal loop doesn't do anything (we only ever
> cycle once) and should be removed.
> 

Agree, this is stale code from a previous version of the test.

> > +			execbuf.flags &= ~0x3full;
> > +			execbuf.flags |= i;
> > +			gem_execbuf_wr(i915, &execbuf);
> > +
> > +			if (flags & PARALLEL_OUT_FENCE) {
> > +				igt_assert_eq(sync_fence_wait(execbuf.rsvd2 >> 32,
> > +							      1000), 0);
> > +				igt_assert_eq(sync_fence_status(execbuf.rsvd2 >> 32), 1);
> > +
> > +				if (fence)
> > +					close(fence);
> > +				fence = execbuf.rsvd2 >> 32;
> > +
> > +				if (flags & PARALLEL_SUBMIT_FENCE) {
> > +					execbuf.flags |=
> > +						I915_EXEC_FENCE_SUBMIT;
> > +					execbuf.rsvd2 >>= 32;
> > +				} else if (flags &  PARALLEL_IN_FENCE) {
> > +					execbuf.flags |=
> > +						I915_EXEC_FENCE_IN;
> > +					execbuf.rsvd2 >>= 32;
> > +				} else {
> > +					execbuf.rsvd2 = 0;
> > +				}
> > +			}
> > +
> > +			if (flags & PARALLEL_VIRTUAL)
> > +				break;
> > +		}
> > +
> > +		if (flags & PARALLEL_CONTEXTS) {
> > +			ctxs[n] = ctx;
> > +			ctx = intel_ctx_create(i915, &cfg);
> > +			execbuf.rsvd1 = ctx->id;
> > +		}
> > +	}
> > +	if (fence)
> > +		close(fence);
> > +
> > +	check_bo(i915, obj[target_bo_idx].handle, flags & PARALLEL_VIRTUAL ?
> > +		 bb_per_execbuf * PARALLEL_BB_LOOP_COUNT :
> > +		 count * PARALLEL_BB_LOOP_COUNT, true);
> 
> same as above, can just use bb_per_execbuf unconditionally here
>

Yep.

> > +
> > +	intel_ctx_destroy(i915, ctx);
> > +	for (i = 0; flags & PARALLEL_CONTEXTS &&
> > +	     i < PARALLEL_BB_LOOP_COUNT; ++i) {
> > +		intel_ctx_destroy(i915, ctxs[i]);
> > +	}
> > +	for (i = 0; i < bb_per_execbuf + 1; ++i)
> > +		gem_close(i915, obj[i].handle);
> > +}
> > +
> > +static void parallel(int i915, unsigned int flags)
> > +{
> > +	for (int class = 0; class < 32; class++) {
> 
> I think we usually avoid declaring variables inside the for loops
> statements, even if the recent C standards allow it, but not sure if we have
> an official style in this regard. there is multiple instance of this in this
> file.
> 

Sure.

> > +		struct i915_engine_class_instance *siblings;
> > +		unsigned int count, bb_per_execbuf;
> > +
> > +		siblings = list_engines(i915, 1u << class, &count);
> > +		if (!siblings)
> > +			continue;
> > +
> > +		if (count < 2) {
> > +			free(siblings);
> > +			continue;
> > +		}
> > +
> > +		logical_sort_siblings(i915, siblings, count);
> > +		bb_per_execbuf = count;
> > +
> > +		parallel_thread(i915, flags, siblings,
> > +				count, bb_per_execbuf);
> > +
> > +		free(siblings);
> > +	}
> > +}
> > +
> > +static void parallel_balancer(int i915, unsigned int flags)
> > +{
> > +	for (int class = 0; class < 32; class++) {
> > +		struct i915_engine_class_instance *siblings;
> > +		unsigned int count;
> > +
> > +		siblings = list_engines(i915, 1u << class, &count);
> > +		if (!siblings)
> > +			continue;
> > +
> > +		if (count < 4) {
> > +			free(siblings);
> > +			continue;
> > +		}
> > +
> > +		logical_sort_siblings(i915, siblings, count);
> > +
> > +		for (unsigned int bb_per_execbuf = 2;;) {
> > +			igt_fork(child, count / bb_per_execbuf)
> > +				parallel_thread(i915,
> > +						flags | PARALLEL_VIRTUAL,
> > +						siblings,
> > +						count,
> > +						bb_per_execbuf);
> 
> As a possible future improvement IMO it'd be nice to check that 2 parallel
> VEs are deployed to the HW at the same time. The test will currently pass
> even if they are serialized. Not a blocker.
>

We probably will need spinners to do this as with short running batches the 2
parallel VEs probably will never actually be on the hardware at the same
time because they more or less switch in and switch out at such a fast rate
that only 1 will actually be scheduled at a time. Agree this would be a good
test to have though. Likely need a whole new function / test section
to do this. Again this might be in a follow up or another patch later in
this series.
 
> > +			igt_waitchildren();
> > +
> > +			if (count / ++bb_per_execbuf <= 1)
> > +				break;
> 
> bikeshed: why not just put this in the if statement?
> 
> for (bb = 2; count / bb > 1 ; ++bb)
> 
> not a blocker.
>

That works.
 
> > +		}
> > +
> > +		free(siblings);
> > +	}
> > +}
> > +
> > +static bool fence_busy(int fence)
> > +{
> > +	return poll(&(struct pollfd){fence, POLLIN}, 1, 0) == 0;
> > +}
> > +
> > +static void parallel_ordering(int i915, unsigned int flags)
> 
> A one-line comment about the test to describe what it's doing would help
> IMO.
>

Sure. BTW, this section fails with my current (on list, rather weak)
implementation for parallel submission with execlists.
 
> > +{
> > +	for (int class = 0; class < 32; class++) {
> > +		const intel_ctx_t *ctx = NULL, *spin_ctx = NULL;
> > +		struct i915_engine_class_instance *siblings;
> > +		unsigned int count;
> > +		int i = 0, fence = 0;
> > +		uint32_t batch[16];
> > +		struct drm_i915_gem_execbuffer2 execbuf;
> > +		struct drm_i915_gem_exec_object2 obj[32];
> > +		igt_spin_t *spin;
> > +		intel_ctx_cfg_t cfg;
> > +
> > +		siblings = list_engines(i915, 1u << class, &count);
> > +		if (!siblings)
> > +			continue;
> > +
> > +		if (count < 2) {
> > +			free(siblings);
> > +			continue;
> > +		}
> > +
> > +		logical_sort_siblings(i915, siblings, count);
> > +
> > +		memset(&cfg, 0, sizeof(cfg));
> > +		cfg.parallel = true;
> > +		cfg.num_engines = 1;
> > +		cfg.width = count;
> > +		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
> > +
> > +		ctx = intel_ctx_create(i915, &cfg);
> > +
> > +		batch[i] = MI_ATOMIC | MI_ATOMIC_INLINE_DATA |
> > +			MI_ATOMIC_ADD;
> > +		batch[++i] = TARGET_BO_OFFSET;
> > +		batch[++i] = 0;
> > +		batch[++i] = 1;
> > +		batch[++i] = MI_BATCH_BUFFER_END;
> > +
> > +		memset(obj, 0, sizeof(obj));
> > +		obj[0].offset = TARGET_BO_OFFSET;
> > +		obj[0].flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE;
> > +		obj[0].handle = gem_create(i915, 4096);
> > +
> > +		for (i = 1; i < count + 1; ++i) {
> > +			obj[i].handle = gem_create(i915, 4096);
> > +			gem_write(i915, obj[i].handle, 0, batch,
> > +				  sizeof(batch));
> > +		}
> 
> The object setup code here is identical to the one in parallel_thread(),
> maybe move it to a common function?
>

Sure.
 
> > +
> > +		memset(&execbuf, 0, sizeof(execbuf));
> > +		execbuf.buffers_ptr = to_user_pointer(obj);
> > +		execbuf.buffer_count = count + 1;
> > +		execbuf.flags |= I915_EXEC_HANDLE_LUT;
> > +		execbuf.flags |= I915_EXEC_NO_RELOC;
> > +		execbuf.flags |= I915_EXEC_FENCE_OUT;
> > +		execbuf.buffers_ptr = to_user_pointer(obj);
> > +		execbuf.rsvd1 = ctx->id;
> > +
> > +		/* Block parallel submission */
> > +		spin_ctx = ctx_create_engines(i915, siblings, count);
> > +		spin = __igt_spin_new(i915,
> > +				      .ctx = spin_ctx,
> > +				      .engine = 0,
> > +				      .flags = IGT_SPIN_FENCE_OUT |
> > +				      IGT_SPIN_NO_PREEMPTION);
> > +
> > +		/* Wait for spinners to start */
> > +		usleep(5 * 10000);
> > +		igt_assert(fence_busy(spin->out_fence));
> > +
> > +		/* Submit parallel execbuf */
> > +		gem_execbuf_wr(i915, &execbuf);
> > +		fence = execbuf.rsvd2 >> 32;
> > +
> > +		/*
> > +		 * Wait long enough for timeslcing to kick in but not
> > +		 * preemption. Spinner + parallel execbuf should be
> > +		 * active.
> > +		 */
> > +		usleep(25 * 10000);
> 
> This is a pretty arbitrary number, what if the system has been set up with a
> longer timeslicing period (or none at all) and/or a shorter preemption
> timeout? IMO you should read those out of sysfs and tune the waits
> accordingly
>

I think we have helpers for reading sysfs values; this should be easy enough
to add. How about we wait for 5x the timeslice or something? If values
are configured in a way where that math doesn't work, then we just skip
this test.

Matt
 
> Daniele
> 
> > +		igt_assert(fence_busy(spin->out_fence));
> > +		igt_assert(fence_busy(fence));
> > +		check_bo(i915, obj[0].handle, 0, false);
> > +
> > +		/*
> > +		 * End spinner and wait for spinner + parallel execbuf
> > +		 * to compelte.
> > +		 */
> > +		igt_spin_end(spin);
> > +		igt_assert_eq(sync_fence_wait(fence, 1000), 0);
> > +		igt_assert_eq(sync_fence_status(fence), 1);
> > +		check_bo(i915, obj[0].handle, count, true);
> > +		close(fence);
> > +
> > +		/* Clean up */
> > +		intel_ctx_destroy(i915, ctx);
> > +		intel_ctx_destroy(i915, spin_ctx);
> > +		for (i = 0; i < count + 1; ++i)
> > +			gem_close(i915, obj[i].handle);
> > +		free(siblings);
> > +		igt_spin_free(i915, spin);
> > +	}
> > +}
> > +
> >   static bool has_persistence(int i915)
> >   {
> >   	struct drm_i915_gem_context_param p = {
> > @@ -2786,6 +3186,61 @@ static bool has_load_balancer(int i915)
> >   	return err == 0;
> >   }
> > +static bool has_logical_mapping(int i915)
> > +{
> > +	struct drm_i915_query_engine_info *engines;
> > +	unsigned int i;
> > +
> > +	engines = query_engine_info(i915);
> > +
> > +	for (i = 0; i < engines->num_engines; ++i)
> > +		if (!(engines->engines[i].flags &
> > +		     I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE)) {
> > +			free(engines);
> > +			return false;
> > +		}
> > +
> > +	free(engines);
> > +	return true;
> > +}
> > +
> > +static bool has_parallel_execbuf(int i915)
> > +{
> > +	intel_ctx_cfg_t cfg = {
> > +		.parallel = true,
> > +		.num_engines = 1,
> > +	};
> > +	const intel_ctx_t *ctx = NULL;
> > +	int err;
> > +
> > +	for (int class = 0; class < 32; class++) {
> > +		struct i915_engine_class_instance *siblings;
> > +		unsigned int count;
> > +
> > +		siblings = list_engines(i915, 1u << class, &count);
> > +		if (!siblings)
> > +			continue;
> > +
> > +		if (count < 2) {
> > +			free(siblings);
> > +			continue;
> > +		}
> > +
> > +		logical_sort_siblings(i915, siblings, count);
> > +
> > +		cfg.width = count;
> > +		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
> > +		free(siblings);
> > +
> > +		err = __intel_ctx_create(i915, &cfg, &ctx);
> > +		intel_ctx_destroy(i915, ctx);
> > +
> > +		return err == 0;
> > +	}
> > +
> > +	return false;
> > +}
> > +
> >   igt_main
> >   {
> >   	int i915 = -1;
> > @@ -2886,6 +3341,38 @@ igt_main
> >   		igt_stop_hang_detector();
> >   	}
> > +	igt_subtest_group {
> > +		igt_fixture {
> > +			igt_require(has_logical_mapping(i915));
> > +			igt_require(has_parallel_execbuf(i915));
> > +		}
> > +
> > +		igt_subtest("parallel-ordering")
> > +			parallel_ordering(i915, 0);
> > +
> > +		igt_subtest("parallel")
> > +			parallel(i915, 0);
> > +
> > +		igt_subtest("parallel-bb-first")
> > +			parallel(i915, PARALLEL_BB_FIRST);
> > +
> > +		igt_subtest("parallel-out-fence")
> > +			parallel(i915, PARALLEL_OUT_FENCE);
> > +
> > +		igt_subtest("parallel-keep-in-fence")
> > +			parallel(i915, PARALLEL_OUT_FENCE | PARALLEL_IN_FENCE);
> > +
> > +		igt_subtest("parallel-keep-submit-fence")
> > +			parallel(i915, PARALLEL_OUT_FENCE |
> > +				 PARALLEL_SUBMIT_FENCE);
> > +
> > +		igt_subtest("parallel-contexts")
> > +			parallel(i915, PARALLEL_CONTEXTS);
> > +
> > +		igt_subtest("parallel-balancer")
> > +			parallel_balancer(i915, 0);
> > +	}
> > +
> >   	igt_subtest_group {
> >   		igt_hang_t  hang;
> 
> 
> 
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-11-02 21:59 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-22  0:18 [igt-dev] [PATCH] i915/gem_exec_balancer: Test parallel execbuf Matthew Brost
2021-10-22  0:59 ` [igt-dev] ✓ Fi.CI.BAT: success for " Patchwork
2021-10-22  4:21 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
2021-11-02 18:34 ` [igt-dev] [PATCH] " Daniele Ceraolo Spurio
2021-11-02 21:55   ` Matthew Brost

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.