* [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning
@ 2019-03-25 10:58 Chris Wilson
  2019-03-25 10:58 ` [PATCH 2/3] iris: Create a composite context for both compute and render pipelines Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Chris Wilson @ 2019-03-25 10:58 UTC (permalink / raw)
  To: mesa-dev; +Cc: intel-gfx

Pull the updated i915_drm.h for the new context-creation interface
(user extensions, context cloning, VM and engine controls), for use in
GPU recovery and pipeline construction.
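
For reference, chaining extensions through the new create ioctl looks
roughly like the following sketch (illustrative only, not part of the
pulled header; fd, vm_id and prio are assumed to come from the caller):

	struct drm_i915_gem_context_create_ext_setparam p_vm = {
		.base = { .name = I915_CONTEXT_CREATE_EXT_SETPARAM },
		.param = {
			.param = I915_CONTEXT_PARAM_VM,
			.value = vm_id, /* from a prior GEM_VM_CREATE */
		},
	};
	struct drm_i915_gem_context_create_ext_setparam p_prio = {
		.base = {
			.name = I915_CONTEXT_CREATE_EXT_SETPARAM,
			.next_extension = (uintptr_t)&p_vm, /* chained via u64 */
		},
		.param = {
			.param = I915_CONTEXT_PARAM_PRIORITY,
			.value = prio,
		},
	};
	struct drm_i915_gem_context_create_ext arg = {
		.flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS,
		.extensions = (uintptr_t)&p_prio, /* head of the chain */
	};
	drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &arg);
	/* on success, arg.ctx_id holds the id of the new context */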
---
 include/drm-uapi/i915_drm.h | 389 +++++++++++++++++++++++++++++-------
 1 file changed, 317 insertions(+), 72 deletions(-)

diff --git a/include/drm-uapi/i915_drm.h b/include/drm-uapi/i915_drm.h
index d2792ab3640..59baacd265d 100644
--- a/include/drm-uapi/i915_drm.h
+++ b/include/drm-uapi/i915_drm.h
@@ -62,6 +62,28 @@ extern "C" {
 #define I915_ERROR_UEVENT		"ERROR"
 #define I915_RESET_UEVENT		"RESET"
 
+/*
+ * i915_user_extension: Base class for defining a chain of extensions
+ *
+ * Many interfaces need to grow over time. In most cases we can simply
+ * extend the struct and have userspace pass in more data. Another option,
+ * as demonstrated by Vulkan's approach to providing extensions for forward
+ * and backward compatibility, is to use a list of optional structs to
+ * provide those extra details.
+ *
+ * The key advantage to using an extension chain is that it allows us to
+ * redefine the interface more easily than an ever growing struct of
+ * increasing complexity, and for large parts of that interface to be
+ * entirely optional. The downside is more pointer chasing; chasing across
+ * the boundary with pointers encapsulated inside u64.
+ */
+struct i915_user_extension {
+	__u64 next_extension;
+	__u32 name;
+	__u32 flags; /* All undefined bits must be zero. */
+	__u32 rsvd[4]; /* Reserved for future use; must be zero. */
+};
+
 /*
  * MOCS indexes used for GPU surfaces, defining the cacheability of the
  * surface data and the coherency for this data wrt. CPU vs. GPU accesses.
@@ -99,9 +121,14 @@ enum drm_i915_gem_engine_class {
 	I915_ENGINE_CLASS_VIDEO		= 2,
 	I915_ENGINE_CLASS_VIDEO_ENHANCE	= 3,
 
+	/* should be kept compact */
+
 	I915_ENGINE_CLASS_INVALID	= -1
 };
 
+#define I915_ENGINE_CLASS_INVALID_NONE -1
+#define I915_ENGINE_CLASS_INVALID_VIRTUAL 0
+
 /**
  * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915
  *
@@ -319,6 +346,9 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_PERF_ADD_CONFIG	0x37
 #define DRM_I915_PERF_REMOVE_CONFIG	0x38
 #define DRM_I915_QUERY			0x39
+#define DRM_I915_GEM_VM_CREATE		0x3a
+#define DRM_I915_GEM_VM_DESTROY		0x3b
+/* Must be kept compact -- no holes */
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -367,6 +397,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey)
 #define DRM_IOCTL_I915_GEM_WAIT		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_WAIT, struct drm_i915_gem_wait)
 #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create)
+#define DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create_ext)
 #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy)
 #define DRM_IOCTL_I915_REG_READ			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
 #define DRM_IOCTL_I915_GET_RESET_STATS		DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
@@ -377,6 +408,8 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_PERF_ADD_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config)
 #define DRM_IOCTL_I915_PERF_REMOVE_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64)
 #define DRM_IOCTL_I915_QUERY			DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, struct drm_i915_query)
+#define DRM_IOCTL_I915_GEM_VM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_VM_CREATE, struct drm_i915_gem_vm_control)
+#define DRM_IOCTL_I915_GEM_VM_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_VM_DESTROY, struct drm_i915_gem_vm_control)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -476,6 +509,7 @@ typedef struct drm_i915_irq_wait {
 #define   I915_SCHEDULER_CAP_ENABLED	(1ul << 0)
 #define   I915_SCHEDULER_CAP_PRIORITY	(1ul << 1)
 #define   I915_SCHEDULER_CAP_PREEMPTION	(1ul << 2)
+#define   I915_SCHEDULER_CAP_SEMAPHORES	(1ul << 3)
 
 #define I915_PARAM_HUC_STATUS		 42
 
@@ -559,6 +593,14 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_MMAP_GTT_COHERENT	52
 
+/*
+ * Query whether DRM_I915_GEM_EXECBUFFER2 supports coordination of parallel
+ * execution through use of explicit fence support.
+ * See I915_EXEC_FENCE_OUT and I915_EXEC_FENCE_SUBMIT.
+ */
+#define I915_PARAM_HAS_EXEC_SUBMIT_FENCE 53
+/* Must be kept compact -- no holes and well documented */
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -574,6 +616,7 @@ typedef struct drm_i915_getparam {
 #define I915_SETPARAM_TEX_LRU_LOG_GRANULARITY             2
 #define I915_SETPARAM_ALLOW_BATCHBUFFER                   3
 #define I915_SETPARAM_NUM_USED_FENCES                     4
+/* Must be kept compact -- no holes */
 
 typedef struct drm_i915_setparam {
 	int param;
@@ -972,7 +1015,7 @@ struct drm_i915_gem_execbuffer2 {
 	 * struct drm_i915_gem_exec_fence *fences.
 	 */
 	__u64 cliprects_ptr;
-#define I915_EXEC_RING_MASK              (7<<0)
+#define I915_EXEC_RING_MASK              (0x3f)
 #define I915_EXEC_DEFAULT                (0<<0)
 #define I915_EXEC_RENDER                 (1<<0)
 #define I915_EXEC_BSD                    (2<<0)
@@ -1078,7 +1121,16 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_FENCE_ARRAY   (1<<19)
 
-#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ARRAY<<1))
+/*
+ * Setting I915_EXEC_FENCE_SUBMIT implies that lower_32_bits(rsvd2) represents
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_SUBMIT		(1 << 20)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_SUBMIT << 1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
@@ -1120,32 +1172,34 @@ struct drm_i915_gem_busy {
 	 * as busy may become idle before the ioctl is completed.
 	 *
 	 * Furthermore, if the object is busy, which engine is busy is only
-	 * provided as a guide. There are race conditions which prevent the
-	 * report of which engines are busy from being always accurate.
-	 * However, the converse is not true. If the object is idle, the
-	 * result of the ioctl, that all engines are idle, is accurate.
+	 * provided as a guide and only indirectly by reporting its class
+	 * (there may be more than one engine in each class). There are race
+	 * conditions which prevent the report of which engines are busy from
+	 * being always accurate.  However, the converse is not true. If the
+	 * object is idle, the result of the ioctl, that all engines are idle,
+	 * is accurate.
 	 *
 	 * The returned dword is split into two fields to indicate both
-	 * the engines on which the object is being read, and the
-	 * engine on which it is currently being written (if any).
+	 * the engine classes on which the object is being read, and the
+	 * engine class on which it is currently being written (if any).
 	 *
 	 * The low word (bits 0:15) indicate if the object is being written
 	 * to by any engine (there can only be one, as the GEM implicit
 	 * synchronisation rules force writes to be serialised). Only the
-	 * engine for the last write is reported.
+	 * engine class (offset by 1, I915_ENGINE_CLASS_RENDER is reported as
+	 * 1 not 0 etc) for the last write is reported.
 	 *
-	 * The high word (bits 16:31) are a bitmask of which engines are
-	 * currently reading from the object. Multiple engines may be
+	 * The high word (bits 16:31) is a bitmask of which engine classes
+	 * are currently reading from the object. Multiple engines may be
 	 * reading from the object simultaneously.
 	 *
-	 * The value of each engine is the same as specified in the
-	 * EXECBUFFER2 ioctl, i.e. I915_EXEC_RENDER, I915_EXEC_BSD etc.
-	 * Note I915_EXEC_DEFAULT is a symbolic value and is mapped to
-	 * the I915_EXEC_RENDER engine for execution, and so it is never
+	 * The value of each engine class is the same as specified in the
+	 * I915_CONTEXT_PARAM_ENGINES context parameter and via perf, i.e.
+	 * I915_ENGINE_CLASS_RENDER, I915_ENGINE_CLASS_COPY, etc.
-	 * reported as active itself. Some hardware may have parallel
+	 * Some hardware may have parallel
 	 * execution engines, e.g. multiple media engines, which are
-	 * mapped to the same identifier in the EXECBUFFER2 ioctl and
-	 * so are not separately reported for busyness.
+	 * mapped to the same class identifier and so are not separately
+	 * reported for busyness.
 	 *
 	 * Caveat emptor:
 	 * Only the boolean result of this query is reliable; that is whether
@@ -1412,65 +1466,18 @@ struct drm_i915_gem_wait {
 };
 
 struct drm_i915_gem_context_create {
-	/*  output: id of new context*/
-	__u32 ctx_id;
-	__u32 pad;
-};
-
-struct drm_i915_gem_context_destroy {
-	__u32 ctx_id;
+	__u32 ctx_id; /* output: id of new context */
 	__u32 pad;
 };
 
-struct drm_i915_reg_read {
-	/*
-	 * Register offset.
-	 * For 64bit wide registers where the upper 32bits don't immediately
-	 * follow the lower 32bits, the offset of the lower 32bits must
-	 * be specified
-	 */
-	__u64 offset;
-#define I915_REG_READ_8B_WA (1ul << 0)
-
-	__u64 val; /* Return value */
-};
-/* Known registers:
- *
- * Render engine timestamp - 0x2358 + 64bit - gen7+
- * - Note this register returns an invalid value if using the default
- *   single instruction 8byte read, in order to workaround that pass
- *   flag I915_REG_READ_8B_WA in offset field.
- *
- */
-
-struct drm_i915_reset_stats {
-	__u32 ctx_id;
+struct drm_i915_gem_context_create_ext {
+	__u32 ctx_id; /* output: id of new context */
 	__u32 flags;
-
-	/* All resets since boot/module reload, for all contexts */
-	__u32 reset_count;
-
-	/* Number of batches lost when active in GPU, for this context */
-	__u32 batch_active;
-
-	/* Number of batches lost pending for execution, for this context */
-	__u32 batch_pending;
-
-	__u32 pad;
-};
-
-struct drm_i915_gem_userptr {
-	__u64 user_ptr;
-	__u64 user_size;
-	__u32 flags;
-#define I915_USERPTR_READ_ONLY 0x1
-#define I915_USERPTR_UNSYNCHRONIZED 0x80000000
-	/**
-	 * Returned handle for the object.
-	 *
-	 * Object handles are nonzero.
-	 */
-	__u32 handle;
+#define I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS	(1u << 0)
+#define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE	(1u << 1)
+#define I915_CONTEXT_CREATE_FLAGS_UNKNOWN \
+	(-(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE << 1))
+	__u64 extensions;
 };
 
 struct drm_i915_gem_context_param {
@@ -1491,6 +1498,63 @@ struct drm_i915_gem_context_param {
 	 * drm_i915_gem_context_param_sseu.
 	 */
 #define I915_CONTEXT_PARAM_SSEU		0x7
+
+/*
+ * Not all clients may want to attempt automatic recovery of a context after
+ * a hang (for example, some clients may only submit very small incremental
+ * batches relying on known logical state of previous batches which will never
+ * recover correctly and each attempt will hang), and so would prefer that
+ * the context is forever banned instead.
+ *
+ * If set to false (0), after a reset, subsequent (and in flight) rendering
+ * from this context is discarded, and the client will need to create a new
+ * context to use instead.
+ *
+ * If set to true (1), the kernel will automatically attempt to recover the
+ * context by skipping the hanging batch and executing the next batch starting
+ * from the default context state (discarding the incomplete logical context
+ * state lost due to the reset).
+ *
+ * On creation, all new contexts are marked as recoverable.
+ */
+#define I915_CONTEXT_PARAM_RECOVERABLE	0x8
+
+	/*
+	 * The id of the associated virtual memory address space (ppGTT) of
+	 * this context. Can be retrieved and passed to another context
+	 * (on the same fd) for both to use the same ppGTT and so share
+	 * address layouts, and avoid reloading the page tables on context
+	 * switches between themselves.
+	 *
+	 * See DRM_I915_GEM_VM_CREATE and DRM_I915_GEM_VM_DESTROY.
+	 */
+#define I915_CONTEXT_PARAM_VM		0x9
+
+/*
+ * I915_CONTEXT_PARAM_ENGINES:
+ *
+ * Bind this context to operate on this subset of available engines. Henceforth,
+ * the I915_EXEC_RING selector for DRM_IOCTL_I915_GEM_EXECBUFFER2 operates as
+ * an index into this array of engines; I915_EXEC_DEFAULT selecting engine[0]
+ * and upwards. Slots 0...N are filled in using the specified (class, instance).
+ * Use
+ *	engine_class: I915_ENGINE_CLASS_INVALID,
+ *	engine_instance: I915_ENGINE_CLASS_INVALID_NONE
+ * to specify a gap in the array that can be filled in later, e.g. by a
+ * virtual engine used for load balancing.
+ *
+ * Setting the number of engines bound to the context to 0, by passing a zero
+ * sized argument, will revert to the default settings.
+ *
+ * See struct i915_context_param_engines.
+ *
+ * Extensions:
+ *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
+ *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
+ */
+#define I915_CONTEXT_PARAM_ENGINES	0xa
+/* Must be kept compact -- no holes and well documented */
+
 	__u64 value;
 };
 
@@ -1553,6 +1617,186 @@ struct drm_i915_gem_context_param_sseu {
 	__u32 rsvd;
 };
 
+/*
+ * i915_context_engines_load_balance:
+ *
+ * Enable load balancing across this set of engines.
+ *
+ * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that, when
+ * used, will proxy the execbuffer request onto one of the set of engines
+ * in such a way as to distribute the load evenly across the set.
+ *
+ * The set of engines must be compatible (e.g. the same HW class) as they
+ * will share the same logical GPU context and ring.
+ *
+ * To intermix rendering with the virtual engine and direct rendering onto
+ * the backing engines (bypassing the load balancing proxy), the context must
+ * be defined to use a single timeline for all engines.
+ */
+struct i915_context_engines_load_balance {
+	struct i915_user_extension base;
+
+	__u16 engine_index;
+	__u16 mbz16; /* reserved for future use; must be zero */
+	__u32 flags; /* all undefined flags must be zero */
+
+	__u64 engines_mask; /* selection mask of engines[] */
+
+	__u64 mbz64[4]; /* reserved for future use; must be zero */
+};
+
+/*
+ * i915_context_engines_bond:
+ *
+ * Construct bonded pairs for execution within a virtual engine.
+ *
+ * All engines are equal, but some are more equal than others. Given
+ * the distribution of resources in the HW, it may be preferable to run
+ * a request on a given subset of engines in parallel to a request on a
+ * specific engine. We enable this selection of engines within a virtual
+ * engine by specifying bonding pairs, for any given master engine we will
+ * only execute on one of the corresponding siblings within the virtual engine.
+ *
+ * Executing a request in parallel on the master engine and a sibling requires
+ * coordination via I915_EXEC_FENCE_SUBMIT.
+ */
+struct i915_context_engines_bond {
+	struct i915_user_extension base;
+
+	__u16 virtual_index; /* index of virtual engine in ctx->engines[] */
+	__u16 mbz;
+
+	__u16 master_class;
+	__u16 master_instance;
+
+	__u64 sibling_mask; /* bitmask of BIT(sibling_index) wrt the v.engine */
+	__u64 flags; /* all undefined flags must be zero */
+};
+
+struct i915_context_param_engines {
+	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
+#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
+#define I915_CONTEXT_ENGINES_EXT_BOND 1
+
+	struct {
+		__u16 engine_class; /* see enum drm_i915_gem_engine_class */
+		__u16 engine_instance;
+	} class_instance[0];
+} __attribute__((packed));
+
+#define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \
+	__u64 extensions; \
+	struct { \
+		__u16 engine_class; \
+		__u16 engine_instance; \
+	} class_instance[N__]; \
+} __attribute__((packed)) name__
+
+struct drm_i915_gem_context_create_ext_setparam {
+#define I915_CONTEXT_CREATE_EXT_SETPARAM 0
+	struct i915_user_extension base;
+	struct drm_i915_gem_context_param param;
+};
+
+struct drm_i915_gem_context_create_ext_clone {
+#define I915_CONTEXT_CREATE_EXT_CLONE 1
+	struct i915_user_extension base;
+	__u32 clone_id;
+	__u32 flags;
+#define I915_CONTEXT_CLONE_ENGINES	(1u << 0)
+#define I915_CONTEXT_CLONE_FLAGS	(1u << 1)
+#define I915_CONTEXT_CLONE_SCHEDATTR	(1u << 2)
+#define I915_CONTEXT_CLONE_SSEU		(1u << 3)
+#define I915_CONTEXT_CLONE_TIMELINE	(1u << 4)
+#define I915_CONTEXT_CLONE_VM		(1u << 5)
+#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1)
+	__u64 rsvd;
+};
+
+struct drm_i915_gem_context_destroy {
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+/*
+ * DRM_I915_GEM_VM_CREATE -
+ *
+ * Create a new virtual memory address space (ppGTT) for use within a context
+ * on the same file. Extensions can be provided to configure exactly how the
+ * address space is setup upon creation.
+ *
+ * The id of the new VM (bound to the fd) for use with I915_CONTEXT_PARAM_VM is
+ * returned in the outparam @id.
+ *
+ * No flags are currently defined; all bits are reserved and must be zero.
+ *
+ * An extension chain may be provided, starting with @extensions, and terminated
+ * by the @next_extension being 0. Currently, no extensions are defined.
+ *
+ * DRM_I915_GEM_VM_DESTROY -
+ *
+ * Destroys a previously created VM id, specified in @id.
+ *
+ * No extensions or flags are allowed currently, and so must be zero.
+ */
+struct drm_i915_gem_vm_control {
+	__u64 extensions;
+	__u32 flags;
+	__u32 vm_id;
+};
+
+struct drm_i915_reg_read {
+	/*
+	 * Register offset.
+	 * For 64bit wide registers where the upper 32bits don't immediately
+	 * follow the lower 32bits, the offset of the lower 32bits must
+	 * be specified
+	 */
+	__u64 offset;
+#define I915_REG_READ_8B_WA (1ul << 0)
+
+	__u64 val; /* Return value */
+};
+
+/* Known registers:
+ *
+ * Render engine timestamp - 0x2358 + 64bit - gen7+
+ * - Note this register returns an invalid value if read using the default
+ *   single-instruction 8-byte read; to work around that, pass the
+ *   I915_REG_READ_8B_WA flag in the offset field.
+ *
+ */
+
+struct drm_i915_reset_stats {
+	__u32 ctx_id;
+	__u32 flags;
+
+	/* All resets since boot/module reload, for all contexts */
+	__u32 reset_count;
+
+	/* Number of batches lost when active in GPU, for this context */
+	__u32 batch_active;
+
+	/* Number of batches lost pending for execution, for this context */
+	__u32 batch_pending;
+
+	__u32 pad;
+};
+
+struct drm_i915_gem_userptr {
+	__u64 user_ptr;
+	__u64 user_size;
+	__u32 flags;
+#define I915_USERPTR_READ_ONLY 0x1
+#define I915_USERPTR_UNSYNCHRONIZED 0x80000000
+	/**
+	 * Returned handle for the object.
+	 *
+	 * Object handles are nonzero.
+	 */
+	__u32 handle;
+};
+
 enum drm_i915_oa_format {
 	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
 	I915_OA_FORMAT_A29,	    /* HSW only */
@@ -1714,6 +1958,7 @@ struct drm_i915_perf_oa_config {
 struct drm_i915_query_item {
 	__u64 query_id;
 #define DRM_I915_QUERY_TOPOLOGY_INFO    1
+/* Must be kept compact -- no holes and well documented */
 
 	/*
 	 * When set to zero by userspace, this is filled with the size of the
-- 
2.20.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


* [PATCH 2/3] iris: Create a composite context for both compute and render pipelines
  2019-03-25 10:58 [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning Chris Wilson
@ 2019-03-25 10:58 ` Chris Wilson
  2019-03-26  5:52   ` Kenneth Graunke
  2019-03-31  9:57   ` [Intel-gfx] " Jordan Justen
  2019-03-25 10:59 ` [PATCH 3/3] iris: Handle GPU recovery Chris Wilson
  2019-03-31  9:53 ` [Mesa-dev] [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning Jordan Justen
  2 siblings, 2 replies; 12+ messages in thread
From: Chris Wilson @ 2019-03-25 10:58 UTC (permalink / raw)
  To: mesa-dev; +Cc: intel-gfx, Kenneth Graunke

iris currently uses two distinct GEM contexts to have distinct logical
HW contexts for the compute and render pipelines. However, using two
distinct GEM contexts implies that they are distinct timelines, yet as
they together serve a single GL context they should belong to a single
timeline from the client's perspective. Currently, fences are
occasionally inserted to order the two timelines. Using 2 GEM contexts
also implies that we keep 2 ppGTTs for identical buffer state. If we
can create a single GEM context, with the right capabilities, we can
have a single VM, a single timeline, but 2 logical HW contexts for the
2 pipelines.

This is made possible by the new context interface that allows the VM
to be shared, timelines to be specified, and the logical contexts to be
constructed as the user desires.

Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Kenneth Graunke <kenneth@whitecape.org>
---
 src/gallium/drivers/iris/iris_batch.c   | 16 ++-----
 src/gallium/drivers/iris/iris_batch.h   |  5 +--
 src/gallium/drivers/iris/iris_context.c | 56 ++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
index 556422c38bc..40bcd939795 100644
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -164,24 +164,16 @@ iris_init_batch(struct iris_batch *batch,
                 struct iris_vtable *vtbl,
                 struct pipe_debug_callback *dbg,
                 struct iris_batch *all_batches,
-                enum iris_batch_name name,
-                uint8_t engine,
-                int priority)
+                uint32_t hw_id,
+                enum iris_batch_name name)
 {
    batch->screen = screen;
    batch->vtbl = vtbl;
    batch->dbg = dbg;
    batch->name = name;
 
-   /* engine should be one of I915_EXEC_RENDER, I915_EXEC_BLT, etc. */
-   assert((engine & ~I915_EXEC_RING_MASK) == 0);
-   assert(util_bitcount(engine) == 1);
-   batch->engine = engine;
-
-   batch->hw_ctx_id = iris_create_hw_context(screen->bufmgr);
-   assert(batch->hw_ctx_id);
-
-   iris_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
+   batch->hw_ctx_id = hw_id;
+   batch->engine = name;
 
    util_dynarray_init(&batch->exec_fences, ralloc_context(NULL));
    util_dynarray_init(&batch->syncpts, ralloc_context(NULL));
diff --git a/src/gallium/drivers/iris/iris_batch.h b/src/gallium/drivers/iris/iris_batch.h
index 2a68103c379..db1a4cbbe11 100644
--- a/src/gallium/drivers/iris/iris_batch.h
+++ b/src/gallium/drivers/iris/iris_batch.h
@@ -131,9 +131,8 @@ void iris_init_batch(struct iris_batch *batch,
                      struct iris_vtable *vtbl,
                      struct pipe_debug_callback *dbg,
                      struct iris_batch *all_batches,
-                     enum iris_batch_name name,
-                     uint8_t ring,
-                     int priority);
+                     uint32_t hw_id,
+                     enum iris_batch_name name);
 void iris_chain_to_new_batch(struct iris_batch *batch);
 void iris_batch_free(struct iris_batch *batch);
 void iris_batch_maybe_flush(struct iris_batch *batch, unsigned estimate);
diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c
index a1d11755a24..aeb608c70bd 100644
--- a/src/gallium/drivers/iris/iris_context.c
+++ b/src/gallium/drivers/iris/iris_context.c
@@ -143,6 +143,57 @@ iris_destroy_context(struct pipe_context *ctx)
       unreachable("Unknown hardware generation"); \
    }
 
+static void create_hw_contexts(struct iris_screen *screen,
+                               uint32_t *hw_id,
+                               int priority)
+{
+#define RCS0 {0, 0}
+   I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 2) = {
+      .class_instance = { RCS0, RCS0 }
+   };
+   struct drm_i915_gem_context_create_ext_setparam p_engines = {
+      .base = {
+         .name = I915_CONTEXT_CREATE_EXT_SETPARAM,
+         .next_extension = 0, /* end of chain */
+      },
+      .param = {
+         .param = I915_CONTEXT_PARAM_ENGINES,
+         .value = (uintptr_t)&engines,
+         .size = sizeof(engines),
+      },
+   };
+   struct drm_i915_gem_context_create_ext_setparam p_prio = {
+      .base = {
+         .name = I915_CONTEXT_CREATE_EXT_SETPARAM,
+         .next_extension = (uintptr_t)&p_engines,
+      },
+      .param = {
+         .param = I915_CONTEXT_PARAM_PRIORITY,
+         .value = priority,
+      },
+   };
+   struct drm_i915_gem_context_create_ext arg = {
+      .flags = I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE |
+               I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS,
+      .extensions = (uintptr_t)&p_prio,
+   };
+
+   if (drm_ioctl(screen->fd,
+                 DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT,
+                 &arg) == 0) {
+      for (int i = 0; i < IRIS_BATCH_COUNT; i++)
+         hw_id[i] = arg.ctx_id;
+      return;
+   }
+
+   /* No fancy new features; create two contexts instead */
+   for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
+      hw_id[i] = iris_create_hw_context(screen->bufmgr);
+      assert(hw_id[i]);
+
+      iris_hw_context_set_priority(screen->bufmgr, hw_id[i], priority);
+   }
+}
+
 /**
  * Create a context.
  *
@@ -154,6 +205,7 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
    struct iris_screen *screen = (struct iris_screen*)pscreen;
    const struct gen_device_info *devinfo = &screen->devinfo;
    struct iris_context *ice = rzalloc(NULL, struct iris_context);
+   uint32_t hw_id[IRIS_BATCH_COUNT];
 
    if (!ice)
       return NULL;
@@ -210,10 +262,10 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
    if (flags & PIPE_CONTEXT_LOW_PRIORITY)
       priority = GEN_CONTEXT_LOW_PRIORITY;
 
+   create_hw_contexts(screen, hw_id, priority);
    for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
       iris_init_batch(&ice->batches[i], screen, &ice->vtbl, &ice->dbg,
-                      ice->batches, (enum iris_batch_name) i,
-                      I915_EXEC_RENDER, priority);
+                      ice->batches, hw_id[i], (enum iris_batch_name) i);
    }
 
    ice->vtbl.init_render_context(screen, &ice->batches[IRIS_BATCH_RENDER],
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


* [PATCH 3/3] iris: Handle GPU recovery
  2019-03-25 10:58 [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning Chris Wilson
  2019-03-25 10:58 ` [PATCH 2/3] iris: Create a composite context for both compute and render pipelines Chris Wilson
@ 2019-03-25 10:59 ` Chris Wilson
  2019-03-31  9:53 ` [Mesa-dev] [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning Jordan Justen
  2 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2019-03-25 10:59 UTC (permalink / raw)
  To: mesa-dev; +Cc: intel-gfx

We want to opt out of the automatic GPU recovery and replay performed by
the kernel for a guilty context after a GPU reset, as our incremental
batch construction very often implies that batches subsequent to a GPU
reset are incomplete and will trigger fresh GPU hangs. As we are aware
of how we need to reset the context state, but the kernel isn't, tell
the kernel to cancel the in-flight rendering and immediately report the
GPU hang, whereupon we reconstruct a fresh context for the next batch.
---
 src/gallium/drivers/iris/iris_batch.c   | 92 ++++++++++++++++++-------
 src/gallium/drivers/iris/iris_batch.h   | 13 ++++
 src/gallium/drivers/iris/iris_bufmgr.c  | 25 +++++++
 src/gallium/drivers/iris/iris_context.c | 24 +++++--
 src/gallium/drivers/iris/iris_context.h | 12 ++--
 5 files changed, 125 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
index 40bcd939795..2edca3d558f 100644
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -163,6 +163,7 @@ iris_init_batch(struct iris_batch *batch,
                 struct iris_screen *screen,
                 struct iris_vtable *vtbl,
                 struct pipe_debug_callback *dbg,
+                iris_init_context_fn init_context,
                 struct iris_batch *all_batches,
                 uint32_t hw_id,
                 enum iris_batch_name name)
@@ -171,6 +172,7 @@ iris_init_batch(struct iris_batch *batch,
    batch->vtbl = vtbl;
    batch->dbg = dbg;
    batch->name = name;
+   batch->init_context = init_context;
 
    batch->hw_ctx_id = hw_id;
    batch->engine = name;
@@ -212,6 +214,8 @@ iris_init_batch(struct iris_batch *batch,
    }
 
    iris_batch_reset(batch);
+
+   batch->init_context(batch->screen, batch, batch->vtbl, batch->dbg);
 }
 
 static struct drm_i915_gem_exec_object2 *
@@ -443,6 +447,44 @@ iris_finish_batch(struct iris_batch *batch)
       batch->primary_batch_size = iris_batch_bytes_used(batch);
 }
 
+static int
+iris_recreate_context(struct iris_batch *batch)
+{
+   struct drm_i915_gem_context_create_ext_clone clone = {
+      .base = { .name = I915_CONTEXT_CREATE_EXT_CLONE },
+      .clone_id = batch->hw_ctx_id,
+      .flags = ~I915_CONTEXT_CLONE_UNKNOWN,
+   };
+   struct drm_i915_gem_context_create_ext arg = {
+      .flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS,
+      .extensions = (uintptr_t)&clone,
+   };
+   if (drm_ioctl(batch->screen->fd,
+                 DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT,
+                 &arg))
+      return -EIO;
+
+   uint32_t old_ctx_id = batch->hw_ctx_id;
+
+   batch->hw_ctx_id = arg.ctx_id;
+   batch->init_context(batch->screen, batch, batch->vtbl, batch->dbg);
+
+   for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) {
+      struct iris_batch *other = batch->other_batches[b];
+      if (other->hw_ctx_id != old_ctx_id)
+         continue;
+
+      other->hw_ctx_id = arg.ctx_id;
+      other->init_context(other->screen, other, other->vtbl, other->dbg);
+   }
+
+   drm_ioctl(batch->screen->fd,
+             DRM_IOCTL_I915_GEM_CONTEXT_DESTROY,
+             &old_ctx_id);
+
+   return 0;
+}
+
 /**
  * Submit the batch to the GPU via execbuffer2.
  */
@@ -483,17 +525,11 @@ submit_batch(struct iris_batch *batch)
          (uintptr_t)util_dynarray_begin(&batch->exec_fences);
    }
 
-   int ret = drm_ioctl(batch->screen->fd,
-                       DRM_IOCTL_I915_GEM_EXECBUFFER2,
-                       &execbuf);
-   if (ret != 0) {
+   int ret = 0;
+   if (drm_ioctl(batch->screen->fd,
+                 DRM_IOCTL_I915_GEM_EXECBUFFER2,
+                 &execbuf))
       ret = -errno;
-      DBG("execbuf FAILED: errno = %d\n", -ret);
-      fprintf(stderr, "execbuf FAILED: errno = %d\n", -ret);
-      abort();
-   } else {
-      DBG("execbuf succeeded\n");
-   }
 
    for (int i = 0; i < batch->exec_count; i++) {
       struct iris_bo *bo = batch->exec_bos[i];
@@ -561,6 +597,25 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
 
    int ret = submit_batch(batch);
 
+   batch->exec_count = 0;
+   batch->aperture_space = 0;
+
+   struct iris_syncpt *syncpt =
+      ((struct iris_syncpt **) util_dynarray_begin(&batch->syncpts))[0];
+   iris_syncpt_reference(screen, &batch->last_syncpt, syncpt);
+
+   util_dynarray_foreach(&batch->syncpts, struct iris_syncpt *, s)
+      iris_syncpt_reference(screen, s, NULL);
+   util_dynarray_clear(&batch->syncpts);
+
+   util_dynarray_clear(&batch->exec_fences);
+
+   /* Start a new batch buffer. */
+   iris_batch_reset(batch);
+
+   if (ret == -EIO)
+      ret = iris_recreate_context(batch);
+
    if (ret >= 0) {
       //if (iris->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
          //iris_check_for_reset(ice);
@@ -574,25 +629,10 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
       const bool color = INTEL_DEBUG & DEBUG_COLOR;
       fprintf(stderr, "%siris: Failed to submit batchbuffer: %-80s%s\n",
               color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : "");
-      abort();
 #endif
+      abort();
    }
 
-   batch->exec_count = 0;
-   batch->aperture_space = 0;
-
-   struct iris_syncpt *syncpt =
-      ((struct iris_syncpt **) util_dynarray_begin(&batch->syncpts))[0];
-   iris_syncpt_reference(screen, &batch->last_syncpt, syncpt);
-
-   util_dynarray_foreach(&batch->syncpts, struct iris_syncpt *, s)
-      iris_syncpt_reference(screen, s, NULL);
-   util_dynarray_clear(&batch->syncpts);
-
-   util_dynarray_clear(&batch->exec_fences);
-
-   /* Start a new batch buffer. */
-   iris_batch_reset(batch);
 }
 
 /**
diff --git a/src/gallium/drivers/iris/iris_batch.h b/src/gallium/drivers/iris/iris_batch.h
index db1a4cbbe11..da94749fe20 100644
--- a/src/gallium/drivers/iris/iris_batch.h
+++ b/src/gallium/drivers/iris/iris_batch.h
@@ -41,6 +41,16 @@
 /* Our target batch size - flush approximately at this point. */
 #define BATCH_SZ (20 * 1024)
 
+struct iris_screen;
+struct iris_batch;
+struct iris_vtable;
+struct pipe_debug_callback;
+
+typedef void (*iris_init_context_fn)(struct iris_screen *screen,
+                                     struct iris_batch *batch,
+                                     struct iris_vtable *vtbl,
+                                     struct pipe_debug_callback *dbg);
+
 enum iris_batch_name {
    IRIS_BATCH_RENDER,
    IRIS_BATCH_COMPUTE,
@@ -124,12 +134,15 @@ struct iris_batch {
 
    /** Have we emitted any draw calls to this batch? */
    bool contains_draw;
+
+   iris_init_context_fn init_context;
 };
 
 void iris_init_batch(struct iris_batch *batch,
                      struct iris_screen *screen,
                      struct iris_vtable *vtbl,
                      struct pipe_debug_callback *dbg,
+                     iris_init_context_fn init_context,
                      struct iris_batch *all_batches,
                      uint32_t hw_id,
                      enum iris_batch_name name);
diff --git a/src/gallium/drivers/iris/iris_bufmgr.c b/src/gallium/drivers/iris/iris_bufmgr.c
index e0d167913d2..4c198063869 100644
--- a/src/gallium/drivers/iris/iris_bufmgr.c
+++ b/src/gallium/drivers/iris/iris_bufmgr.c
@@ -1548,6 +1548,29 @@ init_cache_buckets(struct iris_bufmgr *bufmgr)
    }
 }
 
+static void
+init_context(struct iris_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   /*
+    * Upon declaring a GPU hang, the kernel will zap the guilty context
+    * back to the default logical HW state and attempt to continue on to
+    * our next submitted batchbuffer. However, we only send incremental
+    * logical state (e.g. we only ever set up invariant register state
+    * once in brw_initial_gpu_upload()) and so attempting to replay the
+    * next batchbuffer without the correct logical state can be fatal.
+    * Here we tell the kernel not to attempt to recover our context but
+    * immediately (on the next batchbuffer submission) report that the
+    * context is lost, and we will do the recovery ourselves -- 2 lost
+    * batches instead of a continual stream until we are banned, or the
+    * machine is dead.
+    */
+   struct drm_i915_gem_context_param p = {
+      .ctx_id = ctx_id,
+      .param = I915_CONTEXT_PARAM_RECOVERABLE,
+   };
+   drm_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
+}
+
 uint32_t
 iris_create_hw_context(struct iris_bufmgr *bufmgr)
 {
@@ -1558,6 +1581,8 @@ iris_create_hw_context(struct iris_bufmgr *bufmgr)
       return 0;
    }
 
+   init_context(bufmgr, create.ctx_id);
+
    return create.ctx_id;
 }
 
diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c
index aeb608c70bd..9e24733def7 100644
--- a/src/gallium/drivers/iris/iris_context.c
+++ b/src/gallium/drivers/iris/iris_context.c
@@ -162,11 +162,21 @@ static void create_hw_contexts(struct iris_screen *screen,
          .size = sizeof(engines),
       },
    };
-   struct drm_i915_gem_context_create_ext_setparam p_prio = {
+   struct drm_i915_gem_context_create_ext_setparam p_recover = {
       .base = {
          .name = I915_CONTEXT_CREATE_EXT_SETPARAM,
          .next_extension = (uintptr_t)&p_engines,
       },
+      .param = {
+         .param = I915_CONTEXT_PARAM_RECOVERABLE,
+         .value = 0,
+      },
+   };
+   struct drm_i915_gem_context_create_ext_setparam p_prio = {
+      .base = {
+         .name = I915_CONTEXT_CREATE_EXT_SETPARAM,
+         .next_extension = (uintptr_t)&p_recover,
+      },
       .param = {
          .param = I915_CONTEXT_PARAM_PRIORITY,
          .value = priority,
@@ -262,16 +272,16 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
    if (flags & PIPE_CONTEXT_LOW_PRIORITY)
       priority = GEN_CONTEXT_LOW_PRIORITY;
 
+   iris_init_context_fn initfn[] = {
+      [IRIS_BATCH_RENDER]  = ice->vtbl.init_render_context,
+      [IRIS_BATCH_COMPUTE] = ice->vtbl.init_compute_context,
+   };
    create_hw_contexts(screen, hw_id, priority);
    for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
-      iris_init_batch(&ice->batches[i], screen, &ice->vtbl, &ice->dbg,
+      iris_init_batch(&ice->batches[i], screen,
+                      &ice->vtbl, &ice->dbg, initfn[i],
                       ice->batches, hw_id[i], (enum iris_batch_name) i);
    }
 
-   ice->vtbl.init_render_context(screen, &ice->batches[IRIS_BATCH_RENDER],
-                                 &ice->vtbl, &ice->dbg);
-   ice->vtbl.init_compute_context(screen, &ice->batches[IRIS_BATCH_COMPUTE],
-                                  &ice->vtbl, &ice->dbg);
-
    return ctx;
 }
diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h
index 494c931d0f0..d64d391e98a 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -346,14 +346,10 @@ struct iris_stream_output_target {
  */
 struct iris_vtable {
    void (*destroy_state)(struct iris_context *ice);
-   void (*init_render_context)(struct iris_screen *screen,
-                               struct iris_batch *batch,
-                               struct iris_vtable *vtbl,
-                               struct pipe_debug_callback *dbg);
-   void (*init_compute_context)(struct iris_screen *screen,
-                                struct iris_batch *batch,
-                                struct iris_vtable *vtbl,
-                                struct pipe_debug_callback *dbg);
+
+   iris_init_context_fn init_render_context;
+   iris_init_context_fn init_compute_context;
+
    void (*upload_render_state)(struct iris_context *ice,
                                struct iris_batch *batch,
                                const struct pipe_draw_info *draw);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


* Re: [PATCH 2/3] iris: Create a composite context for both compute and render pipelines
  2019-03-25 10:58 ` [PATCH 2/3] iris: Create a composite context for both compute and render pipelines Chris Wilson
@ 2019-03-26  5:52   ` Kenneth Graunke
  2019-03-26  7:16     ` Chris Wilson
  2019-03-31  9:57   ` [Intel-gfx] " Jordan Justen
  1 sibling, 1 reply; 12+ messages in thread
From: Kenneth Graunke @ 2019-03-26  5:52 UTC (permalink / raw)
  To: Chris Wilson; +Cc: mesa-dev, intel-gfx, Joonas Lahtinen



On Monday, March 25, 2019 3:58:59 AM PDT Chris Wilson wrote:
> iris currently uses two distinct GEM contexts to have distinct logical
> HW contexts for the compute and render pipelines. However, using two
> distinct GEM contexts implies that they are distinct timelines, yet as
> they together serve a single GL context they should belong to a single
> timeline from the client's perspective. Currently, fences are
> occasionally inserted to order the two timelines. Using 2 GEM contexts
> also implies that we keep 2 ppGTTs for identical buffer state. If we
> can create a single GEM context, with the right capabilities, we can
> have a single VM, a single timeline, but 2 logical HW contexts for the
> 2 pipelines.
> 
> This is made possible by the new context interface that allows the VM
> to be shared, timelines to be specified, and the logical contexts to be
> constructed as the user desires.
> 
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Kenneth Graunke <kenneth@whitecape.org>
> ---
>  src/gallium/drivers/iris/iris_batch.c   | 16 ++-----
>  src/gallium/drivers/iris/iris_batch.h   |  5 +--
>  src/gallium/drivers/iris/iris_context.c | 56 ++++++++++++++++++++++++-
>  3 files changed, 60 insertions(+), 17 deletions(-)

Hi Chris,

I don't think that I want the single timeline option.  It seems like
we've been moving away from implicit sync for a long time, and the
explicit sync code we have is pretty straightforward and seems to do
the trick.  Jason and I also chatted briefly, and we don't necessarily
want a strict submission-order between render/compute.

Separating the VMA from the context state image seems like absolutely
the right thing to do - as you said, they're separate in hardware,
and no real reason to tie it together.  I would be in favor of new
uABI for that.

I don't think there will be much overhead reduction from sharing the
VMA here though.  It's very plausible that the compositor might want
to run between render and compute batches, at which point we end up
doing page directory loads anyway.  I have also heard rumors about bit
47 becoming magical at some point which may prohibit us from sharing...

Context cloning seems OK, but I'm always pretty hesitant to add new
uABI unless it's strictly necessary.  In this case, we can do the same
thing with a little bit of userspace code, so I'm not sure it's worth
adding that...
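
Roughly what I have in mind, as a sketch (assuming, for illustration,
that only the priority parameter needs carrying over; fd and old_ctx_id
come from elsewhere):

	/* "Clone" by hand with the existing uapi: create a fresh
	 * context, then read each parameter from the old context and
	 * apply it to the new one.
	 */
	struct drm_i915_gem_context_create create = { 0 };
	drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);

	struct drm_i915_gem_context_param p = {
		.ctx_id = old_ctx_id,
		.param = I915_CONTEXT_PARAM_PRIORITY,
	};
	drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);

	p.ctx_id = create.ctx_id;
	drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);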

I would love to see an iris patch to use the new
I915_CONTEXT_PARAM_RECOVERABLE option without the other dependencies.

--Ken


_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


* Re: [PATCH 2/3] iris: Create a composite context for both compute and render pipelines
  2019-03-26  5:52   ` Kenneth Graunke
@ 2019-03-26  7:16     ` Chris Wilson
  2019-03-26 17:01       ` Kenneth Graunke
  0 siblings, 1 reply; 12+ messages in thread
From: Chris Wilson @ 2019-03-26  7:16 UTC (permalink / raw)
  To: Kenneth Graunke; +Cc: mesa-dev, intel-gfx

Quoting Kenneth Graunke (2019-03-26 05:52:10)
> On Monday, March 25, 2019 3:58:59 AM PDT Chris Wilson wrote:
> > iris currently uses two distinct GEM contexts to have distinct logical
> > HW contexts for the compute and render pipelines. However, using two
> > distinct GEM contexts implies that they are distinct timelines, yet as
> > they together serve a single GL context they should belong to a single
> > timeline from the client's perspective. Currently, fences are
> > occasionally inserted to order the two timelines. Using 2 GEM contexts
> > also implies that we keep 2 ppGTTs for identical buffer state. If we
> > can create a single GEM context, with the right capabilities, we can
> > have a single VM, a single timeline, but 2 logical HW contexts for the
> > 2 pipelines.
> > 
> > This is made possible by the new context interface that allows the VM
> > to be shared, timelines to be specified, and the logical contexts to be
> > constructed as the user desires.
> > 
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > Cc: Kenneth Graunke <kenneth@whitecape.org>
> > ---
> >  src/gallium/drivers/iris/iris_batch.c   | 16 ++-----
> >  src/gallium/drivers/iris/iris_batch.h   |  5 +--
> >  src/gallium/drivers/iris/iris_context.c | 56 ++++++++++++++++++++++++-
> >  3 files changed, 60 insertions(+), 17 deletions(-)
> 
> Hi Chris,
> 
> I don't think that I want the single timeline option.  It seems like
> we've been moving away from implicit sync for a long time, and the
> explicit sync code we have is pretty straightforward and seems to do
> the trick.  Jason and I also chatted briefly, and we don't necessarily
> want a strict submission-order between render/compute.

I disagree if you think this means more implicit sync. It is setting up
the GEM context to an exact match of the GL context, by _explicit_
control of the timeline. Then the fences you do export from inside the
GL context do not need to be faked to be a composite of the pair of
contexts. You still have explicit fences, and you have explicit control
over the definition of their timeline.
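
For illustration, a sketch of that explicit control (eliding the
execbuf setup; eb_render and eb_compute are assumed to be fully
populated struct drm_i915_gem_execbuffer2 instances):

	/* Export an explicit fence from the render batch... */
	eb_render.flags |= I915_EXEC_FENCE_OUT;
	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &eb_render);
	int fence = eb_render.rsvd2 >> 32; /* out-fence fd, upper 32 bits */

	/* ...and gate the compute batch's submission upon it, using the
	 * new nonblocking submit fence (lower 32 bits of rsvd2).
	 */
	eb_compute.flags |= I915_EXEC_FENCE_SUBMIT;
	eb_compute.rsvd2 = (__u32)fence;
	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &eb_compute);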

> Separating the VMA from the context state image seems like absolutely
> the right thing to do - as you said, they're separate in hardware,
> and no real reason to tie it together.  I would be in favor of new
> uABI for that.
> 
> I don't think there will be much overhead reduction from sharing the
> VMA here though.  It's very plausible that the compositor might want
> to run between render and compute batches, at which point we end up
> doing page directory loads anyway.  I have also heard rumors about bit
> 47 becoming magical at some point which may prohibit us from sharing...

Yeah, but that doesn't actually affect the context setup, just how you
decide to use it in the end. And by that point, you'll be forced into using
this new uABI anyway or something entirely different :-p
 
> Context cloning seems OK, but I'm always pretty hesitant to add new
> uABI unless it's strictly necessary.  In this case, we can do the same
> thing with a little bit of userspace code, so I'm not sure it's worth
> adding that...

Actually you cannot do the same without some of the new uABI either,
since previously you did not have all the parameters exposed.
 
> I would love to see an iris patch to use the new
> I915_CONTEXT_PARAM_RECOVERABLE option without the other dependencies.

https://gitlab.freedesktop.org/ickle/mesa/commit/84d9cb1d8d98a50dcceea19ccbc3836b15cf73ae
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


* Re: [PATCH 2/3] iris: Create a composite context for both compute and render pipelines
  2019-03-26  7:16     ` Chris Wilson
@ 2019-03-26 17:01       ` Kenneth Graunke
  2019-03-26 17:15         ` Chris Wilson
  0 siblings, 1 reply; 12+ messages in thread
From: Kenneth Graunke @ 2019-03-26 17:01 UTC (permalink / raw)
  To: Chris Wilson; +Cc: mesa-dev, intel-gfx, Joonas Lahtinen



On Tuesday, March 26, 2019 12:16:20 AM PDT Chris Wilson wrote:
> Quoting Kenneth Graunke (2019-03-26 05:52:10)
> > On Monday, March 25, 2019 3:58:59 AM PDT Chris Wilson wrote:
> > > iris currently uses two distinct GEM contexts to have distinct logical
> > > HW contexts for the compute and render pipelines. However, using two
> > > distinct GEM contexts implies that they are distinct timelines, yet as
> > > they together serve a single GL context they should belong to a single
> > > timeline from the client's perspective. Currently, fences are
> > > occasionally inserted to order the two timelines. Using 2 GEM contexts
> > > also implies that we keep 2 ppGTTs for identical buffer state. If we
> > > can create a single GEM context, with the right capabilities, we can
> > > have a single VM, a single timeline, but 2 logical HW contexts for the
> > > 2 pipelines.
> > > 
> > > This is made possible by the new context interface that allows the VM
> > > to be shared, timelines to be specified, and the logical contexts to be
> > > constructed as the user desires.
> > > 
> > > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > > Cc: Kenneth Graunke <kenneth@whitecape.org>
> > > ---
> > >  src/gallium/drivers/iris/iris_batch.c   | 16 ++-----
> > >  src/gallium/drivers/iris/iris_batch.h   |  5 +--
> > >  src/gallium/drivers/iris/iris_context.c | 56 ++++++++++++++++++++++++-
> > >  3 files changed, 60 insertions(+), 17 deletions(-)
> > 
> > Hi Chris,
> > 
> > I don't think that I want the single timeline option.  It seems like
> > we've been moving away from implicit sync for a long time, and the
> > explicit sync code we have is pretty straightforward and seems to do
> > the trick.  Jason and I also chatted briefly, and we don't necessarily
> > want a strict submission-order between render/compute.
> 
> I disagree if you think this means more implicit sync. It is setting up
> the GEM context to an exact match of the GL context, by _explicit_
> control of the timeline. Then the fences you do export from inside the
> GL context do not need to be faked to be a composite of the pair of
> contexts. You still have explicit fences, and you have explicit control
> over the definition of their timeline.

With regard to multiple GL contexts, yes, everything remains explicit.
But having 2-3 separate timelines within a GL context allows us to
reorder work behind GL's back, which is all the rage these days for
performance.  Tilers do it all the time.  Position-only bucketing may
require it.  I'd really like to start treating render and compute as
distinct asynchronous queues.  At the very least, experimenting with
that and not tying my hands to a particular behavior.

There may be some use for single timeline, though.  Attaching images as
compute shader inputs may require CCS/HiZ resolves, which have to happen
on the RCS.  Right now, I do those on IRIS_BATCH_RENDER, which means that
it backs up behind any queued render work.  Ideally, I'd do those on a
third context, which could be tied to the compute timeline, so the
resolves and the compute job can both execute ahead of queued rendering,
but still back to back.

> > Separating the VMA from the context state image seems like absolutely
> > the right thing to do - as you said, they're separate in hardware,
> > and no real reason to tie it together.  I would be in favor of new
> > uABI for that.
> > 
> > I don't think there will be much overhead reduction from sharing the
> > VMA here though.  It's very plausible that the compositor might want
> > to run between render and compute batches, at which point we end up
> > doing page directory loads anyway.  I have also heard rumors about bit
> > 47 becoming magical at some point which may prohibit us from sharing...
> 
> Yeah, but that doesn't actually affect the context setup, just how you
> decide to use it in the end. And by that point, you'll be forced into using
> this new uABI anyway or something entirely different :-p

Looking into this a bit more, I think we're actually OK.  I thought I
might need to have distinct addresses for render and compute - at which
point nearly every address would differ in terms of bit 47 - but it
looks like the correct answer is "just never use that bit".  *shrug*

> > Context cloning seems OK, but I'm always pretty hesitant to add new
> > uABI unless it's strictly necessary.  In this case, we can do the same
> > thing with a little bit of userspace code, so I'm not sure it's worth
> > adding that...
> 
> Actually you cannot do the same without some of the new uABI either,
> since previously you did not have all the parameters exposed.

What isn't exposed?  We set up everything the first time, why can't we
do it again?

> > I would love to see an iris patch to use the new
> > I915_CONTEXT_PARAM_RECOVERABLE option without the other dependencies.
> 
> https://gitlab.freedesktop.org/ickle/mesa/commit/84d9cb1d8d98a50dcceea19ccbc3836b15cf73ae
> -Chris


_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


* Re: [PATCH 2/3] iris: Create a composite context for both compute and render pipelines
  2019-03-26 17:01       ` Kenneth Graunke
@ 2019-03-26 17:15         ` Chris Wilson
  0 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2019-03-26 17:15 UTC (permalink / raw)
  To: Kenneth Graunke; +Cc: mesa-dev, intel-gfx, Joonas Lahtinen

Quoting Kenneth Graunke (2019-03-26 17:01:57)
> On Tuesday, March 26, 2019 12:16:20 AM PDT Chris Wilson wrote:
> > Quoting Kenneth Graunke (2019-03-26 05:52:10)
> > > On Monday, March 25, 2019 3:58:59 AM PDT Chris Wilson wrote:
> > > > iris currently uses two distinct GEM contexts to have distinct logical
> > > > HW contexts for the compute and render pipelines. However, using two
> > > > distinct GEM contexts implies that they are distinct timelines, yet as
> > > > they together serve a single GL context they should belong to a single
> > > > timeline from the client's perspective. Currently, fences are
> > > > occasionally inserted to order the two timelines. Using 2 GEM contexts
> > > > also implies that we keep 2 ppGTTs for identical buffer state. If we
> > > > can create a single GEM context, with the right capabilities, we can
> > > > have a single VM, a single timeline, but 2 logical HW contexts for the
> > > > 2 pipelines.
> > > > 
> > > > This is made possible by the new context interface that allows the VM
> > > > to be shared, timelines to be specified, and the logical contexts to be
> > > > constructed as the user desires.
> > > > 
> > > > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > > > Cc: Kenneth Graunke <kenneth@whitecape.org>
> > > > ---
> > > >  src/gallium/drivers/iris/iris_batch.c   | 16 ++-----
> > > >  src/gallium/drivers/iris/iris_batch.h   |  5 +--
> > > >  src/gallium/drivers/iris/iris_context.c | 56 ++++++++++++++++++++++++-
> > > >  3 files changed, 60 insertions(+), 17 deletions(-)
> > > 
> > > Hi Chris,
> > > 
> > > I don't think that I want the single timeline option.  It seems like
> > > we've been moving away from implicit sync for a long time, and the
> > > explicit sync code we have is pretty straightforward and seems to do
> > > the trick.  Jason and I also chatted briefly, and we don't necessarily
> > > want a strict submission-order between render/compute.
> > 
> > I disagree if you think this means more implicit sync. It is setting up
> > the GEM context to an exact match of the GL context, by _explicit_
> > control of the timeline. Then the fences you do export from inside the
> > GL context do not need to be faked to be a composite of the pair of
> > contexts. You still have explicit fences, and you have explicit control
> > over the definition of their timeline.
> 
> With regard to multiple GL contexts, yes, everything remains explicit.
> But having 2-3 separate timelines within a GL context allows us to
> reorder work behind GL's back, which is all the rage these days for
> performance.  Tilers do it all the time.  Position-only bucketing may
> require it.  I'd really like to start treating render and compute as
> distinct asynchronous queues.  At the very least, experimenting with
> that and not tying my hands to a particular behavior.

That's a reasonable argument. If you want to try and keep the GL
semantics intact while playing with ordering underneath, have fun!

The only problem I foresee is if there is any observable through which
the pipelines can determine their ordering / concurrency (sampling a
common buffer or clock) that might constitute a violation.
 
> There may be some use for single timeline, though.  Attaching images as
> compute shader inputs may require CCS/HiZ resolves, which have to happen
> on the RCS.  Right now, I do those on IRIS_BATCH_RENDER, which means that
> it backs up behind any queued render work.  Ideally, I'd do those on a
> third context, which could be tied to the compute timeline, so the
> resolves and the compute job can both execute ahead of queued rendering,
> but still back to back.

I have an inkling that timelines should be first class for userspace to
control exactly. But I have not seen anything close to a use case to
justify that (yet). And by the time a use case arises, we will
probably be onto the next shiny. That's the problem with cloudy crystal
balls.

> > > Separating the VMA from the context state image seems like absolutely
> > > the right thing to do - as you said, they're separate in hardware,
> > > and there's no real reason to tie them together.  I would be in favor of new
> > > uABI for that.
> > > 
> > > I don't think there will be much overhead reduction from sharing the
> > > VMA here though.  It's very plausible that the compositor might want
> > > to run between render and compute batches, at which point we end up
> > > doing page directory loads anyway.  I have also heard rumors about bit
> > > 47 becoming magical at some point which may prohibit us from sharing...
> > 
> > Yeah, but that doesn't actually affect the context setup, just how you
> > decide to use it in the end. And by that point, you'll be forced into using
> > this new uABI anyway or something entirely different :-p
> 
> Looking into this a bit more, I think we're actually OK.  I thought I
> might need to have distinct addresses for render and compute - at which
> point nearly every address would differ in terms of bit 47 - but it
> looks like the correct answer is "just never use that bit".  *shrug*

Yup.

> > > Context cloning seems OK, but I'm always pretty hesitant to add new
> > > uABI unless it's strictly necessary.  In this case, we can do the same
> > > thing with a little bit of userspace code, so I'm not sure it's worth
> > > adding that...
> > 
> > Actually you cannot do the same without some of the new uABI either,
> > since previously you did not have all the parameters exposed.
> 
> What isn't exposed?  We set up everything the first time, why can't we
> do it again?

When going through the list of things I couldn't re-establish, the ppgtt
was at the top of that list. Hence I suddenly got motivated to provide a
means for context recovery to keep as much state as possible and decide
how much to reset.

The alternative is to be able to reset the old context. That would need
to be a synchronous operation -- I would need to flush all the old
requests and cancel them before returning; in effect, I might as well
just create a new context pointer, copy across all the state and insert
it into the idr under the old id. Given that the answer is cloning, and
cloning is useful in general, that seems a good API to carry forward.
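
For reference, here is a rough sketch of that recovery path in
userspace against the uapi from patch 1/3 -- my illustration only, not
code from the series; the helper name is invented, drmIoctl() is
libdrm's wrapper, and error handling is elided:

#include <stdint.h>
#include <xf86drm.h>
#include "drm-uapi/i915_drm.h"

/* Replace a banned context by cloning everything we could not
 * otherwise re-establish -- notably the ppGTT, via CLONE_VM. */
static uint32_t
recreate_hw_context(int fd, uint32_t banned_ctx_id)
{
   struct drm_i915_gem_context_create_ext_clone clone = {
      .base = { .name = I915_CONTEXT_CREATE_EXT_CLONE },
      .clone_id = banned_ctx_id,
      .flags = I915_CONTEXT_CLONE_ENGINES |
               I915_CONTEXT_CLONE_FLAGS |
               I915_CONTEXT_CLONE_SCHEDATTR |
               I915_CONTEXT_CLONE_SSEU |
               I915_CONTEXT_CLONE_TIMELINE |
               I915_CONTEXT_CLONE_VM,
   };
   struct drm_i915_gem_context_create_ext create = {
      .flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS,
      .extensions = (uintptr_t)&clone,
   };

   if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &create))
      return 0; /* e.g. kernel too old; fall back to a fresh context */
   return create.ctx_id;
}

CLONE_VM is the piece that cannot be re-established by replaying
parameters, which is exactly the point above.
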
-Chris
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Mesa-dev] [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning
  2019-03-25 10:58 [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning Chris Wilson
  2019-03-25 10:58 ` [PATCH 2/3] iris: Create a composite context for both compute and render pipelines Chris Wilson
  2019-03-25 10:59 ` [PATCH 3/3] iris: Handle GPU recovery Chris Wilson
@ 2019-03-31  9:53 ` Jordan Justen
  2019-03-31  9:55   ` Chris Wilson
  2 siblings, 1 reply; 12+ messages in thread
From: Jordan Justen @ 2019-03-31  9:53 UTC (permalink / raw)
  To: Chris Wilson, mesa-dev; +Cc: intel-gfx

Where are these changes from (repo/commit)? It could be good to
reference that in the commit message.

I suspect that the answer might mean that these patches should be
labeled RFC.

-Jordan

On 2019-03-25 03:58:58, Chris Wilson wrote:
> For use in GPU recovery and pipeline construction.
> ---
>  include/drm-uapi/i915_drm.h | 389 +++++++++++++++++++++++++++++-------
>  1 file changed, 317 insertions(+), 72 deletions(-)
> 
> diff --git a/include/drm-uapi/i915_drm.h b/include/drm-uapi/i915_drm.h
> index d2792ab3640..59baacd265d 100644
> --- a/include/drm-uapi/i915_drm.h
> +++ b/include/drm-uapi/i915_drm.h
> @@ -62,6 +62,28 @@ extern "C" {
>  #define I915_ERROR_UEVENT              "ERROR"
>  #define I915_RESET_UEVENT              "RESET"
>  
> +/*
> + * i915_user_extension: Base class for defining a chain of extensions
> + *
> + * Many interfaces need to grow over time. In most cases we can simply
> + * extend the struct and have userspace pass in more data. Another option,
> + * as demonstrated by Vulkan's approach to providing extensions for forward
> + * and backward compatibility, is to use a list of optional structs to
> + * provide those extra details.
> + *
> + * The key advantage to using an extension chain is that it allows us to
> + * redefine the interface more easily than an ever growing struct of
> + * increasing complexity, and for large parts of that interface to be
> + * entirely optional. The downside is more pointer chasing; chasing across
> + * the boundary with pointers encapsulated inside u64.
> + */
> +struct i915_user_extension {
> +       __u64 next_extension;
> +       __u32 name;
> +       __u32 flags; /* All undefined bits must be zero. */
> +       __u32 rsvd[4]; /* Reserved for future use; must be zero. */
> +};
> +
>  /*
>   * MOCS indexes used for GPU surfaces, defining the cacheability of the
>   * surface data and the coherency for this data wrt. CPU vs. GPU accesses.
> @@ -99,9 +121,14 @@ enum drm_i915_gem_engine_class {
>         I915_ENGINE_CLASS_VIDEO         = 2,
>         I915_ENGINE_CLASS_VIDEO_ENHANCE = 3,
>  
> +       /* should be kept compact */
> +
>         I915_ENGINE_CLASS_INVALID       = -1
>  };
>  
> +#define I915_ENGINE_CLASS_INVALID_NONE -1
> +#define I915_ENGINE_CLASS_INVALID_VIRTUAL 0
> +
>  /**
>   * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915
>   *
> @@ -319,6 +346,9 @@ typedef struct _drm_i915_sarea {
>  #define DRM_I915_PERF_ADD_CONFIG       0x37
>  #define DRM_I915_PERF_REMOVE_CONFIG    0x38
>  #define DRM_I915_QUERY                 0x39
> +#define DRM_I915_GEM_VM_CREATE         0x3a
> +#define DRM_I915_GEM_VM_DESTROY                0x3b
> +/* Must be kept compact -- no holes */
>  
>  #define DRM_IOCTL_I915_INIT            DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
>  #define DRM_IOCTL_I915_FLUSH           DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
> @@ -367,6 +397,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_IOCTL_I915_GET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey)
>  #define DRM_IOCTL_I915_GEM_WAIT                DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_WAIT, struct drm_i915_gem_wait)
>  #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE      DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create)
> +#define DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT  DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create_ext)
>  #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY     DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy)
>  #define DRM_IOCTL_I915_REG_READ                        DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
>  #define DRM_IOCTL_I915_GET_RESET_STATS         DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
> @@ -377,6 +408,8 @@ typedef struct _drm_i915_sarea {
>  #define DRM_IOCTL_I915_PERF_ADD_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config)
>  #define DRM_IOCTL_I915_PERF_REMOVE_CONFIG      DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64)
>  #define DRM_IOCTL_I915_QUERY                   DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, struct drm_i915_query)
> +#define DRM_IOCTL_I915_GEM_VM_CREATE   DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_VM_CREATE, struct drm_i915_gem_vm_control)
> +#define DRM_IOCTL_I915_GEM_VM_DESTROY  DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_VM_DESTROY, struct drm_i915_gem_vm_control)
>  
>  /* Allow drivers to submit batchbuffers directly to hardware, relying
>   * on the security mechanisms provided by hardware.
> @@ -476,6 +509,7 @@ typedef struct drm_i915_irq_wait {
>  #define   I915_SCHEDULER_CAP_ENABLED   (1ul << 0)
>  #define   I915_SCHEDULER_CAP_PRIORITY  (1ul << 1)
>  #define   I915_SCHEDULER_CAP_PREEMPTION        (1ul << 2)
> +#define   I915_SCHEDULER_CAP_SEMAPHORES        (1ul << 3)
>  
>  #define I915_PARAM_HUC_STATUS           42
>  
> @@ -559,6 +593,14 @@ typedef struct drm_i915_irq_wait {
>   */
>  #define I915_PARAM_MMAP_GTT_COHERENT   52
>  
> +/*
> + * Query whether DRM_I915_GEM_EXECBUFFER2 supports coordination of parallel
> + * execution through use of explicit fence support.
> + * See I915_EXEC_FENCE_OUT and I915_EXEC_FENCE_SUBMIT.
> + */
> +#define I915_PARAM_HAS_EXEC_SUBMIT_FENCE 53
> +/* Must be kept compact -- no holes and well documented */
> +
>  typedef struct drm_i915_getparam {
>         __s32 param;
>         /*
> @@ -574,6 +616,7 @@ typedef struct drm_i915_getparam {
>  #define I915_SETPARAM_TEX_LRU_LOG_GRANULARITY             2
>  #define I915_SETPARAM_ALLOW_BATCHBUFFER                   3
>  #define I915_SETPARAM_NUM_USED_FENCES                     4
> +/* Must be kept compact -- no holes */
>  
>  typedef struct drm_i915_setparam {
>         int param;
> @@ -972,7 +1015,7 @@ struct drm_i915_gem_execbuffer2 {
>          * struct drm_i915_gem_exec_fence *fences.
>          */
>         __u64 cliprects_ptr;
> -#define I915_EXEC_RING_MASK              (7<<0)
> +#define I915_EXEC_RING_MASK              (0x3f)
>  #define I915_EXEC_DEFAULT                (0<<0)
>  #define I915_EXEC_RENDER                 (1<<0)
>  #define I915_EXEC_BSD                    (2<<0)
> @@ -1078,7 +1121,16 @@ struct drm_i915_gem_execbuffer2 {
>   */
>  #define I915_EXEC_FENCE_ARRAY   (1<<19)
>  
> -#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ARRAY<<1))
> +/*
> + * Setting I915_EXEC_FENCE_SUBMIT implies that lower_32_bits(rsvd2) represent
> + * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
> + * the batch.
> + *
> + * Returns -EINVAL if the sync_file fd cannot be found.
> + */
> +#define I915_EXEC_FENCE_SUBMIT         (1 << 20)
> +
> +#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_SUBMIT << 1))
>  
>  #define I915_EXEC_CONTEXT_ID_MASK      (0xffffffff)
>  #define i915_execbuffer2_set_context_id(eb2, context) \
> @@ -1120,32 +1172,34 @@ struct drm_i915_gem_busy {
>          * as busy may become idle before the ioctl is completed.
>          *
>          * Furthermore, if the object is busy, which engine is busy is only
> -        * provided as a guide. There are race conditions which prevent the
> -        * report of which engines are busy from being always accurate.
> -        * However, the converse is not true. If the object is idle, the
> -        * result of the ioctl, that all engines are idle, is accurate.
> +        * provided as a guide and only indirectly by reporting its class
> +        * (there may be more than one engine in each class). There are race
> +        * conditions which prevent the report of which engines are busy from
> +        * being always accurate.  However, the converse is not true. If the
> +        * object is idle, the result of the ioctl, that all engines are idle,
> +        * is accurate.
>          *
>          * The returned dword is split into two fields to indicate both
> -        * the engines on which the object is being read, and the
> -        * engine on which it is currently being written (if any).
> +        * the engine classes on which the object is being read, and the
> +        * engine class on which it is currently being written (if any).
>          *
>          * The low word (bits 0:15) indicate if the object is being written
>          * to by any engine (there can only be one, as the GEM implicit
>          * synchronisation rules force writes to be serialised). Only the
> -        * engine for the last write is reported.
> +        * engine class (offset by 1, I915_ENGINE_CLASS_RENDER is reported as
> +        * 1 not 0 etc) for the last write is reported.
>          *
> -        * The high word (bits 16:31) are a bitmask of which engines are
> -        * currently reading from the object. Multiple engines may be
> +        * The high word (bits 16:31) is a bitmask of which engine classes
> +        * are currently reading from the object. Multiple engines may be
>          * reading from the object simultaneously.
>          *
> -        * The value of each engine is the same as specified in the
> -        * EXECBUFFER2 ioctl, i.e. I915_EXEC_RENDER, I915_EXEC_BSD etc.
> -        * Note I915_EXEC_DEFAULT is a symbolic value and is mapped to
> -        * the I915_EXEC_RENDER engine for execution, and so it is never
> +        * The value of each engine class is the same as specified in the
> +        * I915_CONTEXT_SET_ENGINES parameter and via perf, i.e.
> +        * I915_ENGINE_CLASS_RENDER, I915_ENGINE_CLASS_COPY, etc.
> -        * reported as active itself. Some hardware may have parallel
> +        * Some hardware may have parallel
>          * execution engines, e.g. multiple media engines, which are
> -        * mapped to the same identifier in the EXECBUFFER2 ioctl and
> -        * so are not separately reported for busyness.
> +        * mapped to the same class identifier and so are not separately
> +        * reported for busyness.
>          *
>          * Caveat emptor:
>          * Only the boolean result of this query is reliable; that is whether
> @@ -1412,65 +1466,18 @@ struct drm_i915_gem_wait {
>  };
>  
>  struct drm_i915_gem_context_create {
> -       /*  output: id of new context*/
> -       __u32 ctx_id;
> -       __u32 pad;
> -};
> -
> -struct drm_i915_gem_context_destroy {
> -       __u32 ctx_id;
> +       __u32 ctx_id; /* output: id of new context */
>         __u32 pad;
>  };
>  
> -struct drm_i915_reg_read {
> -       /*
> -        * Register offset.
> -        * For 64bit wide registers where the upper 32bits don't immediately
> -        * follow the lower 32bits, the offset of the lower 32bits must
> -        * be specified
> -        */
> -       __u64 offset;
> -#define I915_REG_READ_8B_WA (1ul << 0)
> -
> -       __u64 val; /* Return value */
> -};
> -/* Known registers:
> - *
> - * Render engine timestamp - 0x2358 + 64bit - gen7+
> - * - Note this register returns an invalid value if using the default
> - *   single instruction 8byte read, in order to workaround that pass
> - *   flag I915_REG_READ_8B_WA in offset field.
> - *
> - */
> -
> -struct drm_i915_reset_stats {
> -       __u32 ctx_id;
> +struct drm_i915_gem_context_create_ext {
> +       __u32 ctx_id; /* output: id of new context */
>         __u32 flags;
> -
> -       /* All resets since boot/module reload, for all contexts */
> -       __u32 reset_count;
> -
> -       /* Number of batches lost when active in GPU, for this context */
> -       __u32 batch_active;
> -
> -       /* Number of batches lost pending for execution, for this context */
> -       __u32 batch_pending;
> -
> -       __u32 pad;
> -};
> -
> -struct drm_i915_gem_userptr {
> -       __u64 user_ptr;
> -       __u64 user_size;
> -       __u32 flags;
> -#define I915_USERPTR_READ_ONLY 0x1
> -#define I915_USERPTR_UNSYNCHRONIZED 0x80000000
> -       /**
> -        * Returned handle for the object.
> -        *
> -        * Object handles are nonzero.
> -        */
> -       __u32 handle;
> +#define I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS       (1u << 0)
> +#define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE      (1u << 1)
> +#define I915_CONTEXT_CREATE_FLAGS_UNKNOWN \
> +       (-(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE << 1))
> +       __u64 extensions;
>  };
>  
>  struct drm_i915_gem_context_param {
> @@ -1491,6 +1498,63 @@ struct drm_i915_gem_context_param {
>          * drm_i915_gem_context_param_sseu.
>          */
>  #define I915_CONTEXT_PARAM_SSEU                0x7
> +
> +/*
> + * Not all clients may want to attempt automatic recovery of a context after
> + * a hang (for example, some clients may only submit very small incremental
> + * batches relying on known logical state of previous batches which will never
> + * recover correctly and each attempt will hang), and so would prefer that
> + * the context is forever banned instead.
> + *
> + * If set to false (0), after a reset, subsequent (and in flight) rendering
> + * from this context is discarded, and the client will need to create a new
> + * context to use instead.
> + *
> + * If set to true (1), the kernel will automatically attempt to recover the
> + * context by skipping the hanging batch and executing the next batch starting
> + * from the default context state (discarding the incomplete logical context
> + * state lost due to the reset).
> + *
> + * On creation, all new contexts are marked as recoverable.
> + */
> +#define I915_CONTEXT_PARAM_RECOVERABLE 0x8
> +
> +       /*
> +        * The id of the associated virtual memory address space (ppGTT) of
> +        * this context. Can be retrieved and passed to another context
> +        * (on the same fd) for both to use the same ppGTT and so share
> +        * address layouts, and avoid reloading the page tables on context
> +        * switches between themselves.
> +        *
> +        * See DRM_I915_GEM_VM_CREATE and DRM_I915_GEM_VM_DESTROY.
> +        */
> +#define I915_CONTEXT_PARAM_VM          0x9
> +
> +/*
> + * I915_CONTEXT_PARAM_ENGINES:
> + *
> + * Bind this context to operate on this subset of available engines. Henceforth,
> + * the I915_EXEC_RING selector for DRM_IOCTL_I915_GEM_EXECBUFFER2 operates as
> + * an index into this array of engines; I915_EXEC_DEFAULT selecting engine[0]
> + * and upwards. Slots 0...N are filled in using the specified (class, instance).
> + * Use
> + *     engine_class: I915_ENGINE_CLASS_INVALID,
> + *     engine_instance: I915_ENGINE_CLASS_INVALID_NONE
> + * to specify a gap in the array that can be filled in later, e.g. by a
> + * virtual engine used for load balancing.
> + *
> + * Setting the number of engines bound to the context to 0, by passing a
> + * zero-sized argument, will revert to the default settings.
> + *
> + * See struct i915_context_param_engines.
> + *
> + * Extensions:
> + *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
> + *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
> + */
> +#define I915_CONTEXT_PARAM_ENGINES     0xa
> +/* Must be kept compact -- no holes and well documented */
> +
>         __u64 value;
>  };
>  
> @@ -1553,6 +1617,186 @@ struct drm_i915_gem_context_param_sseu {
>         __u32 rsvd;
>  };
>  
> +/*
> + * i915_context_engines_load_balance:
> + *
> + * Enable load balancing across this set of engines.
> + *
> + * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that when
> + * used will proxy the execbuffer request onto one of the set of engines
> + * in such a way as to distribute the load evenly across the set.
> + *
> + * The set of engines must be compatible (e.g. the same HW class) as they
> + * will share the same logical GPU context and ring.
> + *
> + * To intermix rendering with the virtual engine and direct rendering onto
> + * the backing engines (bypassing the load balancing proxy), the context must
> + * be defined to use a single timeline for all engines.
> + */
> +struct i915_context_engines_load_balance {
> +       struct i915_user_extension base;
> +
> +       __u16 engine_index;
> +       __u16 mbz16; /* reserved for future use; must be zero */
> +       __u32 flags; /* all undefined flags must be zero */
> +
> +       __u64 engines_mask; /* selection mask of engines[] */
> +
> +       __u64 mbz64[4]; /* reserved for future use; must be zero */
> +};
> +
> +/*
> + * i915_context_engines_bond:
> + *
> + * Construct bonded pairs for execution within a virtual engine.
> + *
> + * All engines are equal, but some are more equal than others. Given
> + * the distribution of resources in the HW, it may be preferable to run
> + * a request on a given subset of engines in parallel to a request on a
> + * specific engine. We enable this selection of engines within a virtual
> + * engine by specifying bonding pairs, for any given master engine we will
> + * only execute on one of the corresponding siblings within the virtual engine.
> + *
> + * To execute a request in parallel on the master engine and a sibling requires
> + * coordination with an I915_EXEC_FENCE_SUBMIT.
> + */
> +struct i915_context_engines_bond {
> +       struct i915_user_extension base;
> +
> +       __u16 virtual_index; /* index of virtual engine in ctx->engines[] */
> +       __u16 mbz;
> +
> +       __u16 master_class;
> +       __u16 master_instance;
> +
> +       __u64 sibling_mask; /* bitmask of BIT(sibling_index) wrt the v.engine */
> +       __u64 flags; /* all undefined flags must be zero */
> +};
> +
> +struct i915_context_param_engines {
> +       __u64 extensions; /* linked chain of extension blocks, 0 terminates */
> +#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
> +#define I915_CONTEXT_ENGINES_EXT_BOND 1
> +
> +       struct {
> +               __u16 engine_class; /* see enum drm_i915_gem_engine_class */
> +               __u16 engine_instance;
> +       } class_instance[0];
> +} __attribute__((packed));
> +
> +#define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \
> +       __u64 extensions; \
> +       struct { \
> +               __u16 engine_class; \
> +               __u16 engine_instance; \
> +       } class_instance[N__]; \
> +} __attribute__((packed)) name__
> +
> +struct drm_i915_gem_context_create_ext_setparam {
> +#define I915_CONTEXT_CREATE_EXT_SETPARAM 0
> +       struct i915_user_extension base;
> +       struct drm_i915_gem_context_param param;
> +};
> +
> +struct drm_i915_gem_context_create_ext_clone {
> +#define I915_CONTEXT_CREATE_EXT_CLONE 1
> +       struct i915_user_extension base;
> +       __u32 clone_id;
> +       __u32 flags;
> +#define I915_CONTEXT_CLONE_ENGINES     (1u << 0)
> +#define I915_CONTEXT_CLONE_FLAGS       (1u << 1)
> +#define I915_CONTEXT_CLONE_SCHEDATTR   (1u << 2)
> +#define I915_CONTEXT_CLONE_SSEU                (1u << 3)
> +#define I915_CONTEXT_CLONE_TIMELINE    (1u << 4)
> +#define I915_CONTEXT_CLONE_VM          (1u << 5)
> +#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1)
> +       __u64 rsvd;
> +};
> +
> +struct drm_i915_gem_context_destroy {
> +       __u32 ctx_id;
> +       __u32 pad;
> +};
> +
> +/*
> + * DRM_I915_GEM_VM_CREATE -
> + *
> + * Create a new virtual memory address space (ppGTT) for use within a context
> + * on the same file. Extensions can be provided to configure exactly how the
> + * address space is set up upon creation.
> + *
> + * The id of the new VM (bound to the fd) for use with I915_CONTEXT_PARAM_VM
> + * is returned in the outparam @vm_id.
> + *
> + * No flags are currently defined; all bits are reserved and must be zero.
> + *
> + * An extension chain may be provided, starting with @extensions, and terminated
> + * by the @next_extension being 0. Currently, no extensions are defined.
> + *
> + * DRM_I915_GEM_VM_DESTROY -
> + *
> + * Destroys a previously created VM id, specified in @vm_id.
> + *
> + * No extensions or flags are allowed currently, and so must be zero.
> + */
> +struct drm_i915_gem_vm_control {
> +       __u64 extensions;
> +       __u32 flags;
> +       __u32 vm_id;
> +};
> +
> +struct drm_i915_reg_read {
> +       /*
> +        * Register offset.
> +        * For 64bit wide registers where the upper 32bits don't immediately
> +        * follow the lower 32bits, the offset of the lower 32bits must
> +        * be specified
> +        */
> +       __u64 offset;
> +#define I915_REG_READ_8B_WA (1ul << 0)
> +
> +       __u64 val; /* Return value */
> +};
> +
> +/* Known registers:
> + *
> + * Render engine timestamp - 0x2358 + 64bit - gen7+
> + * - Note this register returns an invalid value if using the default
> + *   single-instruction 8-byte read; to work around that, pass the
> + *   I915_REG_READ_8B_WA flag in the offset field.
> + *
> + */
> +
> +struct drm_i915_reset_stats {
> +       __u32 ctx_id;
> +       __u32 flags;
> +
> +       /* All resets since boot/module reload, for all contexts */
> +       __u32 reset_count;
> +
> +       /* Number of batches lost when active in GPU, for this context */
> +       __u32 batch_active;
> +
> +       /* Number of batches lost pending for execution, for this context */
> +       __u32 batch_pending;
> +
> +       __u32 pad;
> +};
> +
> +struct drm_i915_gem_userptr {
> +       __u64 user_ptr;
> +       __u64 user_size;
> +       __u32 flags;
> +#define I915_USERPTR_READ_ONLY 0x1
> +#define I915_USERPTR_UNSYNCHRONIZED 0x80000000
> +       /**
> +        * Returned handle for the object.
> +        *
> +        * Object handles are nonzero.
> +        */
> +       __u32 handle;
> +};
> +
>  enum drm_i915_oa_format {
>         I915_OA_FORMAT_A13 = 1,     /* HSW only */
>         I915_OA_FORMAT_A29,         /* HSW only */
> @@ -1714,6 +1958,7 @@ struct drm_i915_perf_oa_config {
>  struct drm_i915_query_item {
>         __u64 query_id;
>  #define DRM_I915_QUERY_TOPOLOGY_INFO    1
> +/* Must be kept compact -- no holes and well documented */
>  
>         /*
>          * When set to zero by userspace, this is filled with the size of the
> -- 
> 2.20.1
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Mesa-dev] [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning
  2019-03-31  9:53 ` [Mesa-dev] [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning Jordan Justen
@ 2019-03-31  9:55   ` Chris Wilson
  0 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2019-03-31  9:55 UTC (permalink / raw)
  To: Jordan Justen, mesa-dev; +Cc: intel-gfx

Quoting Jordan Justen (2019-03-31 10:53:06)
> Where are these changes from (repo/commit)? It could be good to
> reference in the commit message.

They don't exist in drm-next yet, so they don't have a reference.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Intel-gfx] [PATCH 2/3] iris: Create a composite context for both compute and render pipelines
  2019-03-25 10:58 ` [PATCH 2/3] iris: Create a composite context for both compute and render pipelines Chris Wilson
  2019-03-26  5:52   ` Kenneth Graunke
@ 2019-03-31  9:57   ` Jordan Justen
  2019-03-31 10:02     ` Chris Wilson
  1 sibling, 1 reply; 12+ messages in thread
From: Jordan Justen @ 2019-03-31  9:57 UTC (permalink / raw)
  To: Chris Wilson, mesa-dev; +Cc: intel-gfx, Kenneth Graunke

On 2019-03-25 03:58:59, Chris Wilson wrote:
> iris currently uses two distinct GEM contexts to have distinct logical
> HW contexts for the compute and render pipelines. However, using two
> distinct GEM contexts implies that they are distinct timelines, yet as
> they are a single GL context that implies they belong to a single
> timeline from the client's perspective. Currently, fences are occasionally
> inserted to order the two timelines. Using 2 GEM contexts also implies
> that we keep 2 ppGTT for identical buffer state. If we can create a
> single GEM context, with the right capabilities, we can have a single
> VM, a single timeline, but 2 logical HW contexts for the 2 pipelines.
> 
> This is allowed through the new context interface that allows VM to be
> shared, timelines to be specified, and for the logical contexts to be
> constructed as the user desires.
> 
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Kenneth Graunke <kenneth@whitecape.org>
> ---
>  src/gallium/drivers/iris/iris_batch.c   | 16 ++-----
>  src/gallium/drivers/iris/iris_batch.h   |  5 +--
>  src/gallium/drivers/iris/iris_context.c | 56 ++++++++++++++++++++++++-
>  3 files changed, 60 insertions(+), 17 deletions(-)
> 
> diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
> index 556422c38bc..40bcd939795 100644
> --- a/src/gallium/drivers/iris/iris_batch.c
> +++ b/src/gallium/drivers/iris/iris_batch.c
> @@ -164,24 +164,16 @@ iris_init_batch(struct iris_batch *batch,
>                  struct iris_vtable *vtbl,
>                  struct pipe_debug_callback *dbg,
>                  struct iris_batch *all_batches,
> -                enum iris_batch_name name,
> -                uint8_t engine,
> -                int priority)
> +                uint32_t hw_id,
> +                enum iris_batch_name name)
>  {
>     batch->screen = screen;
>     batch->vtbl = vtbl;
>     batch->dbg = dbg;
>     batch->name = name;
>  
> -   /* engine should be one of I915_EXEC_RENDER, I915_EXEC_BLT, etc. */
> -   assert((engine & ~I915_EXEC_RING_MASK) == 0);
> -   assert(util_bitcount(engine) == 1);
> -   batch->engine = engine;
> -
> -   batch->hw_ctx_id = iris_create_hw_context(screen->bufmgr);
> -   assert(batch->hw_ctx_id);
> -
> -   iris_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
> +   batch->hw_ctx_id = hw_id;
> +   batch->engine = name;

I think this should fall back to I915_EXEC_RENDER if the old-style
contexts are created.

-Jordan
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] iris: Create a composite context for both compute and render pipelines
  2019-03-31  9:57   ` [Intel-gfx] " Jordan Justen
@ 2019-03-31 10:02     ` Chris Wilson
  2019-03-31 10:41       ` Jordan Justen
  0 siblings, 1 reply; 12+ messages in thread
From: Chris Wilson @ 2019-03-31 10:02 UTC (permalink / raw)
  To: Jordan Justen, mesa-dev; +Cc: intel-gfx, Kenneth Graunke

Quoting Jordan Justen (2019-03-31 10:57:09)
> On 2019-03-25 03:58:59, Chris Wilson wrote:
> > iris currently uses two distinct GEM contexts to have distinct logical
> > HW contexts for the compute and render pipelines. However, using two
> > distinct GEM contexts implies that they are distinct timelines, yet as
> > they are a single GL context that implies they belong to a single
> > timeline from the client's perspective. Currently, fences are occasionally
> > inserted to order the two timelines. Using 2 GEM contexts also implies
> > that we keep 2 ppGTT for identical buffer state. If we can create a
> > single GEM context, with the right capabilities, we can have a single
> > VM, a single timeline, but 2 logical HW contexts for the 2 pipelines.
> > 
> > This is allowed through the new context interface that allows VM to be
> > shared, timelines to be specified, and for the logical contexts to be
> > constructed as the user desires.
> > 
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > Cc: Kenneth Graunke <kenneth@whitecape.org>
> > ---
> >  src/gallium/drivers/iris/iris_batch.c   | 16 ++-----
> >  src/gallium/drivers/iris/iris_batch.h   |  5 +--
> >  src/gallium/drivers/iris/iris_context.c | 56 ++++++++++++++++++++++++-
> >  3 files changed, 60 insertions(+), 17 deletions(-)
> > 
> > diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
> > index 556422c38bc..40bcd939795 100644
> > --- a/src/gallium/drivers/iris/iris_batch.c
> > +++ b/src/gallium/drivers/iris/iris_batch.c
> > @@ -164,24 +164,16 @@ iris_init_batch(struct iris_batch *batch,
> >                  struct iris_vtable *vtbl,
> >                  struct pipe_debug_callback *dbg,
> >                  struct iris_batch *all_batches,
> > -                enum iris_batch_name name,
> > -                uint8_t engine,
> > -                int priority)
> > +                uint32_t hw_id,
> > +                enum iris_batch_name name)
> >  {
> >     batch->screen = screen;
> >     batch->vtbl = vtbl;
> >     batch->dbg = dbg;
> >     batch->name = name;
> >  
> > -   /* engine should be one of I915_EXEC_RENDER, I915_EXEC_BLT, etc. */
> > -   assert((engine & ~I915_EXEC_RING_MASK) == 0);
> > -   assert(util_bitcount(engine) == 1);
> > -   batch->engine = engine;
> > -
> > -   batch->hw_ctx_id = iris_create_hw_context(screen->bufmgr);
> > -   assert(batch->hw_ctx_id);
> > -
> > -   iris_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
> > +   batch->hw_ctx_id = hw_id;
> > +   batch->engine = name;
> 
> I think this should fall back to I915_EXEC_RENDER if the old-style
> contexts are created.

It uses the legacy uABI to map both name=COMPUTE and name=RENDER to the
render ring. Should we have more of either, or start using the many
video engines, ctx->engines[] will be the only way to define the
execbuf mapping. So name == engine, as used today, can remain invariant
across the legacy I915_EXEC_RING API and the future ctx->engines[] for
the foreseeable future of GEM_EXECBUFFER2.
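
As a sketch (mine, not from the series; the helper name is invented,
and DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM is the usual set-param ioctl),
filling both slots with the render engine keeps name == index:

#include <stdint.h>
#include <xf86drm.h>
#include "drm-uapi/i915_drm.h"

/* engines[0] serves IRIS_BATCH_RENDER and engines[1] serves
 * IRIS_BATCH_COMPUTE; both are instance 0 of the render class,
 * so batch->engine = name picks the right execbuffer2 slot. */
static int
bind_render_and_compute(int fd, uint32_t ctx_id)
{
   I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 2) = {
      .class_instance = {
         { I915_ENGINE_CLASS_RENDER, 0 }, /* slot 0: render */
         { I915_ENGINE_CLASS_RENDER, 0 }, /* slot 1: compute */
      },
   };
   struct drm_i915_gem_context_param p = {
      .ctx_id = ctx_id,
      .param = I915_CONTEXT_PARAM_ENGINES,
      .size = sizeof(engines),
      .value = (uintptr_t)&engines,
   };
   return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
}
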
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] iris: Create a composite context for both compute and render pipelines
  2019-03-31 10:02     ` Chris Wilson
@ 2019-03-31 10:41       ` Jordan Justen
  0 siblings, 0 replies; 12+ messages in thread
From: Jordan Justen @ 2019-03-31 10:41 UTC (permalink / raw)
  To: Chris Wilson, mesa-dev; +Cc: intel-gfx, Kenneth Graunke

On 2019-03-31 03:02:21, Chris Wilson wrote:
> Quoting Jordan Justen (2019-03-31 10:57:09)
> > On 2019-03-25 03:58:59, Chris Wilson wrote:
> > > iris currently uses two distinct GEM contexts to have distinct logical
> > > HW contexts for the compute and render pipelines. However, using two
> > > distinct GEM contexts implies that they are distinct timelines, yet as
> > > they are a single GL context that implies they belong to a single
> > > timeline from the client's perspective. Currently, fences are occasionally
> > > inserted to order the two timelines. Using 2 GEM contexts also implies
> > > that we keep 2 ppGTT for identical buffer state. If we can create a
> > > single GEM context, with the right capabilities, we can have a single
> > > VM, a single timeline, but 2 logical HW contexts for the 2 pipelines.
> > > 
> > > This is allowed through the new context interface that allows VM to be
> > > shared, timelines to be specified, and for the logical contexts to be
> > > constructed as the user desires.
> > > 
> > > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > > Cc: Kenneth Graunke <kenneth@whitecape.org>
> > > ---
> > >  src/gallium/drivers/iris/iris_batch.c   | 16 ++-----
> > >  src/gallium/drivers/iris/iris_batch.h   |  5 +--
> > >  src/gallium/drivers/iris/iris_context.c | 56 ++++++++++++++++++++++++-
> > >  3 files changed, 60 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
> > > index 556422c38bc..40bcd939795 100644
> > > --- a/src/gallium/drivers/iris/iris_batch.c
> > > +++ b/src/gallium/drivers/iris/iris_batch.c
> > > @@ -164,24 +164,16 @@ iris_init_batch(struct iris_batch *batch,
> > >                  struct iris_vtable *vtbl,
> > >                  struct pipe_debug_callback *dbg,
> > >                  struct iris_batch *all_batches,
> > > -                enum iris_batch_name name,
> > > -                uint8_t engine,
> > > -                int priority)
> > > +                uint32_t hw_id,
> > > +                enum iris_batch_name name)
> > >  {
> > >     batch->screen = screen;
> > >     batch->vtbl = vtbl;
> > >     batch->dbg = dbg;
> > >     batch->name = name;
> > >  
> > > -   /* engine should be one of I915_EXEC_RENDER, I915_EXEC_BLT, etc. */
> > > -   assert((engine & ~I915_EXEC_RING_MASK) == 0);
> > > -   assert(util_bitcount(engine) == 1);
> > > -   batch->engine = engine;
> > > -
> > > -   batch->hw_ctx_id = iris_create_hw_context(screen->bufmgr);
> > > -   assert(batch->hw_ctx_id);
> > > -
> > > -   iris_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
> > > +   batch->hw_ctx_id = hw_id;
> > > +   batch->engine = name;
> > 
> > I think this should fall back to I915_EXEC_RENDER if the old-style
> > contexts are created.
> 
> It uses the legacy uABI to map both name=COMPUTE to the render ring
> and name=RENDER to the render ring.

Oh, what legacy uABI sets that? I thought if engines weren't set then
I915_EXEC_RENDER would be needed for the two contexts to execute on
the rcs. But, after this patch, aren't the names 0 and 1?

I guess I915_EXEC_DEFAULT is 0 and I915_EXEC_RENDER is 1. So, will
I915_EXEC_DEFAULT execute on rcs too? If so, I'm not sure I agree with
taking advantage of this coincidence without even a comment.
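
Something along these lines is the comment I have in mind (my sketch;
values as in the quoted header and iris_batch.h):

/* Why "batch->engine = name" lands on the rcs either way:
 *
 *   enum iris_batch_name: IRIS_BATCH_RENDER = 0, IRIS_BATCH_COMPUTE = 1
 *
 *   Legacy path (no engines[] set):
 *     I915_EXEC_DEFAULT (0) -> aliased to the render ring
 *     I915_EXEC_RENDER  (1) -> the render ring
 *
 *   New path (ctx->engines[] set):
 *     flags 0 -> engines[0], flags 1 -> engines[1]
 */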

-Jordan

> Should we have more or either, or
> start using the many video engines, ctx->engines[] will be the only way
> to define the execbuf mapping. So name == engine, as used today, can
> remain invariant across the legacy I915_EXEC_RING API and future
> ctx->engines[] for the remaining forseeable future of GEM_EXECBUFFER2.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2019-03-31 10:41 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-25 10:58 [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning Chris Wilson
2019-03-25 10:58 ` [PATCH 2/3] iris: Create a composite context for both compute and render pipelines Chris Wilson
2019-03-26  5:52   ` Kenneth Graunke
2019-03-26  7:16     ` Chris Wilson
2019-03-26 17:01       ` Kenneth Graunke
2019-03-26 17:15         ` Chris Wilson
2019-03-31  9:57   ` [Intel-gfx] " Jordan Justen
2019-03-31 10:02     ` Chris Wilson
2019-03-31 10:41       ` Jordan Justen
2019-03-25 10:59 ` [PATCH 3/3] iris: Handle GPU recovery Chris Wilson
2019-03-31  9:53 ` [Mesa-dev] [PATCH 1/3] drm-uapi: Pull i915_drm.h changes for context cloning Jordan Justen
2019-03-31  9:55   ` Chris Wilson
