All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT
@ 2021-01-10 15:03 ` Chris Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:03 UTC (permalink / raw)
  To: intel-gfx
  Cc: Chris Wilson, Mika Kuoppala, Prathap Kumar Valsan,
	Akeem G Abodunrin, Jon Bloomfield, Rodrigo Vivi, Randy Wright,
	stable

MEDIA_VFE_STATE only accepts the 'maximum number of threads' in the
range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
based on platform and the number of EU based on the number of slices and
subslices. This is a fixed number per platform/gt, so appropriately
limit the number of threads we spawn to match the device.

v2: Oversaturate the system with tasks to force execution on every HW
thread; if the thread idles it is returned to the pool and may be reused
again before an unused thread.

v3: Fix more state commands, which were causing Baytrail to barf.
v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Randy Wright <rwright@hpe.com>
Cc: stable@vger.kernel.org # v5.7+
---
 drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++---------
 1 file changed, 94 insertions(+), 63 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
index d93d85cd3027..f32a8e8040b2 100644
--- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c
+++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
@@ -7,8 +7,6 @@
 #include "i915_drv.h"
 #include "intel_gpu_commands.h"
 
-#define MAX_URB_ENTRIES 64
-#define STATE_SIZE (4 * 1024)
 #define GT3_INLINE_DATA_DELAYS 0x1E00
 #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
 
@@ -34,38 +32,59 @@ struct batch_chunk {
 };
 
 struct batch_vals {
-	u32 max_primitives;
-	u32 max_urb_entries;
-	u32 cmd_size;
-	u32 state_size;
+	u32 max_threads;
 	u32 state_start;
-	u32 batch_size;
+	u32 surface_start;
 	u32 surface_height;
 	u32 surface_width;
-	u32 scratch_size;
-	u32 max_size;
+	u32 size;
 };
 
+static inline int num_primitives(const struct batch_vals *bv)
+{
+	/*
+	 * We need to saturate the GPU with work in order to dispatch
+	 * a shader on every HW thread, and clear the thread-local registers.
+	 * In short, we have to dispatch work faster than the shaders can
+	 * run in order to occupy each HW thread.
+	 */
+	return bv->max_threads;
+}
+
 static void
 batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
 {
 	if (IS_HASWELL(i915)) {
-		bv->max_primitives = 280;
-		bv->max_urb_entries = MAX_URB_ENTRIES;
+		switch (INTEL_INFO(i915)->gt) {
+		default:
+		case 1:
+			bv->max_threads = 70;
+			break;
+		case 2:
+			bv->max_threads = 140;
+			break;
+		case 3:
+			bv->max_threads = 280;
+			break;
+		}
 		bv->surface_height = 16 * 16;
 		bv->surface_width = 32 * 2 * 16;
 	} else {
-		bv->max_primitives = 128;
-		bv->max_urb_entries = MAX_URB_ENTRIES / 2;
+		switch (INTEL_INFO(i915)->gt) {
+		default:
+		case 1: /* including vlv */
+			bv->max_threads = 36;
+			break;
+		case 2:
+			bv->max_threads = 128;
+			break;
+		}
 		bv->surface_height = 16 * 8;
 		bv->surface_width = 32 * 16;
 	}
-	bv->cmd_size = bv->max_primitives * 4096;
-	bv->state_size = STATE_SIZE;
-	bv->state_start = bv->cmd_size;
-	bv->batch_size = bv->cmd_size + bv->state_size;
-	bv->scratch_size = bv->surface_height * bv->surface_width;
-	bv->max_size = bv->batch_size + bv->scratch_size;
+	bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
+	bv->surface_start = bv->state_start + SZ_4K;
+	bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
 }
 
 static void batch_init(struct batch_chunk *bc,
@@ -155,7 +174,8 @@ static u32
 gen7_fill_binding_table(struct batch_chunk *state,
 			const struct batch_vals *bv)
 {
-	u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv);
+	u32 surface_start =
+		gen7_fill_surface_state(state, bv->surface_start, bv);
 	u32 *cs = batch_alloc_items(state, 32, 8);
 	u32 offset = batch_offset(state, cs);
 
@@ -214,9 +234,9 @@ static void
 gen7_emit_state_base_address(struct batch_chunk *batch,
 			     u32 surface_state_base)
 {
-	u32 *cs = batch_alloc_items(batch, 0, 12);
+	u32 *cs = batch_alloc_items(batch, 0, 10);
 
-	*cs++ = STATE_BASE_ADDRESS | (12 - 2);
+	*cs++ = STATE_BASE_ADDRESS | (10 - 2);
 	/* general */
 	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
 	/* surface */
@@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
 	*cs++ = BASE_ADDRESS_MODIFY;
 	*cs++ = 0;
 	*cs++ = BASE_ADDRESS_MODIFY;
-	*cs++ = 0;
-	*cs++ = 0;
 	batch_advance(batch, cs);
 }
 
@@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
 		    u32 urb_size, u32 curbe_size,
 		    u32 mode)
 {
-	u32 urb_entries = bv->max_urb_entries;
-	u32 threads = bv->max_primitives - 1;
+	u32 threads = bv->max_threads - 1;
 	u32 *cs = batch_alloc_items(batch, 32, 8);
 
 	*cs++ = MEDIA_VFE_STATE | (8 - 2);
@@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
 	*cs++ = 0;
 
 	/* number of threads & urb entries for GPGPU vs Media Mode */
-	*cs++ = threads << 16 | urb_entries << 8 | mode << 2;
+	*cs++ = threads << 16 | 1 << 8 | mode << 2;
 
 	*cs++ = 0;
 
@@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
 {
 	unsigned int x_offset = (media_object_index % 16) * 64;
 	unsigned int y_offset = (media_object_index / 16) * 16;
-	unsigned int inline_data_size;
-	unsigned int media_batch_size;
-	unsigned int i;
+	unsigned int pkt = 6 + 3;
 	u32 *cs;
 
-	inline_data_size = 112 * 8;
-	media_batch_size = inline_data_size + 6;
+	cs = batch_alloc_items(batch, 8, pkt);
 
-	cs = batch_alloc_items(batch, 8, media_batch_size);
-
-	*cs++ = MEDIA_OBJECT | (media_batch_size - 2);
+	*cs++ = MEDIA_OBJECT | (pkt - 2);
 
 	/* interface descriptor offset */
 	*cs++ = 0;
@@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
 	*cs++ = 0;
 
 	/* inline */
-	*cs++ = (y_offset << 16) | (x_offset);
+	*cs++ = y_offset << 16 | x_offset;
 	*cs++ = 0;
 	*cs++ = GT3_INLINE_DATA_DELAYS;
-	for (i = 3; i < inline_data_size; i++)
-		*cs++ = 0;
 
 	batch_advance(batch, cs);
 }
 
 static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
 {
-	u32 *cs = batch_alloc_items(batch, 0, 5);
+	u32 *cs = batch_alloc_items(batch, 0, 4);
 
-	*cs++ = GFX_OP_PIPE_CONTROL(5);
-	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
-		PIPE_CONTROL_GLOBAL_GTT_IVB;
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+		PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+		PIPE_CONTROL_DC_FLUSH_ENABLE |
+		PIPE_CONTROL_CS_STALL;
 	*cs++ = 0;
 	*cs++ = 0;
+
+	batch_advance(batch, cs);
+}
+
+static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
+{
+	u32 *cs = batch_alloc_items(batch, 0, 8);
+
+	/* ivb: Stall before STATE_CACHE_INVALIDATE */
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
+		PIPE_CONTROL_CS_STALL;
 	*cs++ = 0;
+	*cs++ = 0;
+
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+	*cs++ = 0;
+	*cs++ = 0;
+
 	batch_advance(batch, cs);
 }
 
@@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
 		       const struct batch_vals *bv)
 {
 	struct drm_i915_private *i915 = vma->vm->i915;
-	unsigned int desc_count = 64;
-	const u32 urb_size = 112;
+	const unsigned int desc_count = 1;
+	const unsigned int urb_size = 1;
 	struct batch_chunk cmds, state;
-	u32 interface_descriptor;
+	u32 descriptors;
 	unsigned int i;
 
-	batch_init(&cmds, vma, start, 0, bv->cmd_size);
-	batch_init(&state, vma, start, bv->state_start, bv->state_size);
+	batch_init(&cmds, vma, start, 0, bv->state_start);
+	batch_init(&state, vma, start, bv->state_start, SZ_4K);
 
-	interface_descriptor =
-		gen7_fill_interface_descriptor(&state, bv,
-					       IS_HASWELL(i915) ?
-					       &cb_kernel_hsw :
-					       &cb_kernel_ivb,
-					       desc_count);
-	gen7_emit_pipeline_flush(&cmds);
+	descriptors = gen7_fill_interface_descriptor(&state, bv,
+						     IS_HASWELL(i915) ?
+						     &cb_kernel_hsw :
+						     &cb_kernel_ivb,
+						     desc_count);
+
+	gen7_emit_pipeline_invalidate(&cmds);
 	batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
 	batch_add(&cmds, MI_NOOP);
-	gen7_emit_state_base_address(&cmds, interface_descriptor);
+	gen7_emit_pipeline_invalidate(&cmds);
+
 	gen7_emit_pipeline_flush(&cmds);
+	gen7_emit_state_base_address(&cmds, descriptors);
+	gen7_emit_pipeline_invalidate(&cmds);
 
 	gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0);
+	gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count);
 
-	gen7_emit_interface_descriptor_load(&cmds,
-					    interface_descriptor,
-					    desc_count);
-
-	for (i = 0; i < bv->max_primitives; i++)
+	for (i = 0; i < num_primitives(bv); i++)
 		gen7_emit_media_object(&cmds, i);
 
 	batch_add(&cmds, MI_BATCH_BUFFER_END);
@@ -385,15 +416,15 @@ int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
 
 	batch_get_defaults(engine->i915, &bv);
 	if (!vma)
-		return bv.max_size;
+		return bv.size;
 
-	GEM_BUG_ON(vma->obj->base.size < bv.max_size);
+	GEM_BUG_ON(vma->obj->base.size < bv.size);
 
 	batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
 	if (IS_ERR(batch))
 		return PTR_ERR(batch);
 
-	emit_batch(vma, memset(batch, 0, bv.max_size), &bv);
+	emit_batch(vma, memset(batch, 0, bv.size), &bv);
 
 	i915_gem_object_flush_map(vma->obj);
 	__i915_gem_object_release_map(vma->obj);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT
@ 2021-01-10 15:03 ` Chris Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:03 UTC (permalink / raw)
  To: intel-gfx; +Cc: Randy Wright, Chris Wilson, stable

MEDIA_VFE_STATE only accepts the 'maximum number of threads' in the
range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
based on platform and the number of EU based on the number of slices and
subslices. This is a fixed number per platform/gt, so appropriately
limit the number of threads we spawn to match the device.

v2: Oversaturate the system with tasks to force execution on every HW
thread; if the thread idles it is returned to the pool and may be reused
again before an unused thread.

v3: Fix more state commands, which were causing Baytrail to barf.
v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Randy Wright <rwright@hpe.com>
Cc: stable@vger.kernel.org # v5.7+
---
 drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++---------
 1 file changed, 94 insertions(+), 63 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
index d93d85cd3027..f32a8e8040b2 100644
--- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c
+++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
@@ -7,8 +7,6 @@
 #include "i915_drv.h"
 #include "intel_gpu_commands.h"
 
-#define MAX_URB_ENTRIES 64
-#define STATE_SIZE (4 * 1024)
 #define GT3_INLINE_DATA_DELAYS 0x1E00
 #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
 
@@ -34,38 +32,59 @@ struct batch_chunk {
 };
 
 struct batch_vals {
-	u32 max_primitives;
-	u32 max_urb_entries;
-	u32 cmd_size;
-	u32 state_size;
+	u32 max_threads;
 	u32 state_start;
-	u32 batch_size;
+	u32 surface_start;
 	u32 surface_height;
 	u32 surface_width;
-	u32 scratch_size;
-	u32 max_size;
+	u32 size;
 };
 
+static inline int num_primitives(const struct batch_vals *bv)
+{
+	/*
+	 * We need to saturate the GPU with work in order to dispatch
+	 * a shader on every HW thread, and clear the thread-local registers.
+	 * In short, we have to dispatch work faster than the shaders can
+	 * run in order to occupy each HW thread.
+	 */
+	return bv->max_threads;
+}
+
 static void
 batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
 {
 	if (IS_HASWELL(i915)) {
-		bv->max_primitives = 280;
-		bv->max_urb_entries = MAX_URB_ENTRIES;
+		switch (INTEL_INFO(i915)->gt) {
+		default:
+		case 1:
+			bv->max_threads = 70;
+			break;
+		case 2:
+			bv->max_threads = 140;
+			break;
+		case 3:
+			bv->max_threads = 280;
+			break;
+		}
 		bv->surface_height = 16 * 16;
 		bv->surface_width = 32 * 2 * 16;
 	} else {
-		bv->max_primitives = 128;
-		bv->max_urb_entries = MAX_URB_ENTRIES / 2;
+		switch (INTEL_INFO(i915)->gt) {
+		default:
+		case 1: /* including vlv */
+			bv->max_threads = 36;
+			break;
+		case 2:
+			bv->max_threads = 128;
+			break;
+		}
 		bv->surface_height = 16 * 8;
 		bv->surface_width = 32 * 16;
 	}
-	bv->cmd_size = bv->max_primitives * 4096;
-	bv->state_size = STATE_SIZE;
-	bv->state_start = bv->cmd_size;
-	bv->batch_size = bv->cmd_size + bv->state_size;
-	bv->scratch_size = bv->surface_height * bv->surface_width;
-	bv->max_size = bv->batch_size + bv->scratch_size;
+	bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
+	bv->surface_start = bv->state_start + SZ_4K;
+	bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
 }
 
 static void batch_init(struct batch_chunk *bc,
@@ -155,7 +174,8 @@ static u32
 gen7_fill_binding_table(struct batch_chunk *state,
 			const struct batch_vals *bv)
 {
-	u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv);
+	u32 surface_start =
+		gen7_fill_surface_state(state, bv->surface_start, bv);
 	u32 *cs = batch_alloc_items(state, 32, 8);
 	u32 offset = batch_offset(state, cs);
 
@@ -214,9 +234,9 @@ static void
 gen7_emit_state_base_address(struct batch_chunk *batch,
 			     u32 surface_state_base)
 {
-	u32 *cs = batch_alloc_items(batch, 0, 12);
+	u32 *cs = batch_alloc_items(batch, 0, 10);
 
-	*cs++ = STATE_BASE_ADDRESS | (12 - 2);
+	*cs++ = STATE_BASE_ADDRESS | (10 - 2);
 	/* general */
 	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
 	/* surface */
@@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
 	*cs++ = BASE_ADDRESS_MODIFY;
 	*cs++ = 0;
 	*cs++ = BASE_ADDRESS_MODIFY;
-	*cs++ = 0;
-	*cs++ = 0;
 	batch_advance(batch, cs);
 }
 
@@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
 		    u32 urb_size, u32 curbe_size,
 		    u32 mode)
 {
-	u32 urb_entries = bv->max_urb_entries;
-	u32 threads = bv->max_primitives - 1;
+	u32 threads = bv->max_threads - 1;
 	u32 *cs = batch_alloc_items(batch, 32, 8);
 
 	*cs++ = MEDIA_VFE_STATE | (8 - 2);
@@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
 	*cs++ = 0;
 
 	/* number of threads & urb entries for GPGPU vs Media Mode */
-	*cs++ = threads << 16 | urb_entries << 8 | mode << 2;
+	*cs++ = threads << 16 | 1 << 8 | mode << 2;
 
 	*cs++ = 0;
 
@@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
 {
 	unsigned int x_offset = (media_object_index % 16) * 64;
 	unsigned int y_offset = (media_object_index / 16) * 16;
-	unsigned int inline_data_size;
-	unsigned int media_batch_size;
-	unsigned int i;
+	unsigned int pkt = 6 + 3;
 	u32 *cs;
 
-	inline_data_size = 112 * 8;
-	media_batch_size = inline_data_size + 6;
+	cs = batch_alloc_items(batch, 8, pkt);
 
-	cs = batch_alloc_items(batch, 8, media_batch_size);
-
-	*cs++ = MEDIA_OBJECT | (media_batch_size - 2);
+	*cs++ = MEDIA_OBJECT | (pkt - 2);
 
 	/* interface descriptor offset */
 	*cs++ = 0;
@@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
 	*cs++ = 0;
 
 	/* inline */
-	*cs++ = (y_offset << 16) | (x_offset);
+	*cs++ = y_offset << 16 | x_offset;
 	*cs++ = 0;
 	*cs++ = GT3_INLINE_DATA_DELAYS;
-	for (i = 3; i < inline_data_size; i++)
-		*cs++ = 0;
 
 	batch_advance(batch, cs);
 }
 
 static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
 {
-	u32 *cs = batch_alloc_items(batch, 0, 5);
+	u32 *cs = batch_alloc_items(batch, 0, 4);
 
-	*cs++ = GFX_OP_PIPE_CONTROL(5);
-	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
-		PIPE_CONTROL_GLOBAL_GTT_IVB;
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+		PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+		PIPE_CONTROL_DC_FLUSH_ENABLE |
+		PIPE_CONTROL_CS_STALL;
 	*cs++ = 0;
 	*cs++ = 0;
+
+	batch_advance(batch, cs);
+}
+
+static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
+{
+	u32 *cs = batch_alloc_items(batch, 0, 8);
+
+	/* ivb: Stall before STATE_CACHE_INVALIDATE */
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
+		PIPE_CONTROL_CS_STALL;
 	*cs++ = 0;
+	*cs++ = 0;
+
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+	*cs++ = 0;
+	*cs++ = 0;
+
 	batch_advance(batch, cs);
 }
 
@@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
 		       const struct batch_vals *bv)
 {
 	struct drm_i915_private *i915 = vma->vm->i915;
-	unsigned int desc_count = 64;
-	const u32 urb_size = 112;
+	const unsigned int desc_count = 1;
+	const unsigned int urb_size = 1;
 	struct batch_chunk cmds, state;
-	u32 interface_descriptor;
+	u32 descriptors;
 	unsigned int i;
 
-	batch_init(&cmds, vma, start, 0, bv->cmd_size);
-	batch_init(&state, vma, start, bv->state_start, bv->state_size);
+	batch_init(&cmds, vma, start, 0, bv->state_start);
+	batch_init(&state, vma, start, bv->state_start, SZ_4K);
 
-	interface_descriptor =
-		gen7_fill_interface_descriptor(&state, bv,
-					       IS_HASWELL(i915) ?
-					       &cb_kernel_hsw :
-					       &cb_kernel_ivb,
-					       desc_count);
-	gen7_emit_pipeline_flush(&cmds);
+	descriptors = gen7_fill_interface_descriptor(&state, bv,
+						     IS_HASWELL(i915) ?
+						     &cb_kernel_hsw :
+						     &cb_kernel_ivb,
+						     desc_count);
+
+	gen7_emit_pipeline_invalidate(&cmds);
 	batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
 	batch_add(&cmds, MI_NOOP);
-	gen7_emit_state_base_address(&cmds, interface_descriptor);
+	gen7_emit_pipeline_invalidate(&cmds);
+
 	gen7_emit_pipeline_flush(&cmds);
+	gen7_emit_state_base_address(&cmds, descriptors);
+	gen7_emit_pipeline_invalidate(&cmds);
 
 	gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0);
+	gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count);
 
-	gen7_emit_interface_descriptor_load(&cmds,
-					    interface_descriptor,
-					    desc_count);
-
-	for (i = 0; i < bv->max_primitives; i++)
+	for (i = 0; i < num_primitives(bv); i++)
 		gen7_emit_media_object(&cmds, i);
 
 	batch_add(&cmds, MI_BATCH_BUFFER_END);
@@ -385,15 +416,15 @@ int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
 
 	batch_get_defaults(engine->i915, &bv);
 	if (!vma)
-		return bv.max_size;
+		return bv.size;
 
-	GEM_BUG_ON(vma->obj->base.size < bv.max_size);
+	GEM_BUG_ON(vma->obj->base.size < bv.size);
 
 	batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
 	if (IS_ERR(batch))
 		return PTR_ERR(batch);
 
-	emit_batch(vma, memset(batch, 0, bv.max_size), &bv);
+	emit_batch(vma, memset(batch, 0, bv.size), &bv);
 
 	i915_gem_object_flush_map(vma->obj);
 	__i915_gem_object_release_map(vma->obj);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 02/11] drm/i915/gt: Restore clear-residual mitigations for Ivybridge, Baytrail
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
  (?)
@ 2021-01-10 15:03 ` Chris Wilson
  2021-01-11 17:35   ` Rodrigo Vivi
  -1 siblings, 1 reply; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:03 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

The mitigation is required for all gen7 platforms; now that it does not
cause GPU hangs, restore it for Ivybridge and Baytrail.

Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Cc: Bloomfield Jon <jon.bloomfield@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_ring_submission.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 1c6d421f6fe5..724d56c9583d 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -1324,7 +1324,7 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine)
 
 	GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma);
 
-	if (IS_HASWELL(engine->i915) && engine->class == RENDER_CLASS) {
+	if (IS_GEN(engine->i915, 7) && engine->class == RENDER_CLASS) {
 		err = gen7_ctx_switch_bb_init(engine);
 		if (err)
 			goto err_ring_unpin;
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
@ 2021-01-10 15:03   ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:03 UTC (permalink / raw)
  To: intel-gfx
  Cc: Chris Wilson, Joonas Lahtinen, Jon Bloomfield, Rodrigo Vivi, stable

The clear-residuals mitigation is a relatively heavy hammer and under some
circumstances the user may wish to forgo the context isolation in order
to meet some performance requirement. Introduce a generic module
parameter to allow selectively enabling/disabling different mitigations.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858
Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: stable@vger.kernel.org # v5.7
---
 drivers/gpu/drm/i915/Makefile                 |   1 +
 .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
 drivers/gpu/drm/i915/i915_mitigations.c       | 148 ++++++++++++++++++
 drivers/gpu/drm/i915/i915_mitigations.h       |  13 ++
 4 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/i915/i915_mitigations.c
 create mode 100644 drivers/gpu/drm/i915/i915_mitigations.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 4074d8cb0d6e..48f82c354611 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -38,6 +38,7 @@ i915-y += i915_drv.o \
 	  i915_config.o \
 	  i915_irq.o \
 	  i915_getparam.o \
+	  i915_mitigations.o \
 	  i915_params.o \
 	  i915_pci.o \
 	  i915_scatterlist.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 724d56c9583d..657afd8ebc14 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -32,6 +32,7 @@
 #include "gen6_ppgtt.h"
 #include "gen7_renderclear.h"
 #include "i915_drv.h"
+#include "i915_mitigations.h"
 #include "intel_breadcrumbs.h"
 #include "intel_context.h"
 #include "intel_gt.h"
@@ -918,7 +919,8 @@ static int switch_context(struct i915_request *rq)
 	GEM_BUG_ON(HAS_EXECLISTS(engine->i915));
 
 	if (engine->wa_ctx.vma && ce != engine->kernel_context) {
-		if (engine->wa_ctx.vma->private != ce) {
+		if (engine->wa_ctx.vma->private != ce &&
+		    i915_mitigate_clear_residuals()) {
 			ret = clear_residuals(rq);
 			if (ret)
 				return ret;
diff --git a/drivers/gpu/drm/i915/i915_mitigations.c b/drivers/gpu/drm/i915/i915_mitigations.c
new file mode 100644
index 000000000000..8d5637cfa734
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_mitigations.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2021 Intel Corporation
+ */
+
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "i915_drv.h"
+#include "i915_mitigations.h"
+
+static unsigned long mitigations = ~0UL;
+
+enum {
+	CLEAR_RESIDUALS = 0,
+};
+
+static const char * const names[] = {
+	[CLEAR_RESIDUALS] = "residuals",
+};
+
+bool i915_mitigate_clear_residuals(void)
+{
+	return READ_ONCE(mitigations) & BIT(CLEAR_RESIDUALS);
+}
+
+static int mitigations_set(const char *val, const struct kernel_param *kp)
+{
+	unsigned long new = ~0UL;
+	char *str, *sep, *tok;
+	bool first = true;
+	int err = 0;
+
+	BUILD_BUG_ON(ARRAY_SIZE(names) >= BITS_PER_TYPE(mitigations));
+
+	str = kstrdup(val, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	for (sep = str; (tok = strsep(&sep, ","));) {
+		bool enable = true;
+		int i;
+
+		/* Be tolerant of leading/trailing whitespace */
+		tok = strim(tok);
+
+		if (first) {
+			first = false;
+
+			if (!strcmp(tok, "auto")) {
+				new = ~0UL;
+				continue;
+			}
+
+			new = 0;
+			if (!strcmp(tok, "off"))
+				continue;
+		}
+
+		if (*tok == '!') {
+			enable = !enable;
+			tok++;
+		}
+
+		if (!strncmp(tok, "no", 2)) {
+			enable = !enable;
+			tok += 2;
+		}
+
+		if (*tok == '\0')
+			continue;
+
+		for (i = 0; i < ARRAY_SIZE(names); i++) {
+			if (!strcmp(tok, names[i])) {
+				if (enable)
+					new |= BIT(i);
+				else
+					new &= ~BIT(i);
+				break;
+			}
+		}
+		if (i == ARRAY_SIZE(names)) {
+			pr_err("Bad %s.mitigations=%s, '%s' is unknown\n",
+			       DRIVER_NAME, val, tok);
+			err = -EINVAL;
+			break;
+		}
+	}
+	kfree(str);
+	if (err)
+		return err;
+
+	WRITE_ONCE(mitigations, new);
+	return 0;
+}
+
+static int mitigations_get(char *buffer, const struct kernel_param *kp)
+{
+	unsigned long local = READ_ONCE(mitigations);
+	int count, i;
+	bool enable;
+
+	if (!local)
+		return scnprintf(buffer, PAGE_SIZE, "%s\n", "off");
+
+	if (local & BIT(BITS_PER_LONG - 1)) {
+		count = scnprintf(buffer, PAGE_SIZE, "%s,", "auto");
+		enable = false;
+	} else {
+		enable = true;
+		count = 0;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(names); i++) {
+		if ((local & BIT(i)) != enable)
+			continue;
+
+		count += scnprintf(buffer + count, PAGE_SIZE - count,
+				   "%s%s,", enable ? "" : "!", names[i]);
+	}
+
+	buffer[count - 1] = '\n';
+	return count;
+}
+
+static const struct kernel_param_ops ops = {
+	.set = mitigations_set,
+	.get = mitigations_get,
+};
+
+module_param_cb_unsafe(mitigations, &ops, NULL, 0600);
+MODULE_PARM_DESC(mitigations,
+"Selectively enable security mitigations for all Intel® GPUs in the system.\n"
+"\n"
+"  auto -- enables all mitigations required for the platform [default]\n"
+"  off  -- disables all mitigations\n"
+"\n"
+"Individual mitigations can be enabled by passing a comma-separated string,\n"
+"e.g. mitigations=residuals to enable only clearing residuals or\n"
+"mitigations=auto,noresiduals to disable only the clear residual mitigation.\n"
+"Either '!' or 'no' may be used to switch from enabling the mitigation to\n"
+"disabling it.\n"
+"\n"
+"Active mitigations for Ivybridge, Baytrail, Haswell:\n"
+"  residuals -- clear all thread-local registers between contexts"
+);
diff --git a/drivers/gpu/drm/i915/i915_mitigations.h b/drivers/gpu/drm/i915/i915_mitigations.h
new file mode 100644
index 000000000000..1359d8135287
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_mitigations.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2021 Intel Corporation
+ */
+
+#ifndef __I915_MITIGATIONS_H__
+#define __I915_MITIGATIONS_H__
+
+#include <linux/types.h>
+
+bool i915_mitigate_clear_residuals(void);
+
+#endif /* __I915_MITIGATIONS_H__ */
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
@ 2021-01-10 15:03   ` Chris Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:03 UTC (permalink / raw)
  To: intel-gfx; +Cc: stable, Chris Wilson

The clear-residuals mitigation is a relatively heavy hammer and under some
circumstances the user may wish to forgo the context isolation in order
to meet some performance requirement. Introduce a generic module
parameter to allow selectively enabling/disabling different mitigations.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858
Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: stable@vger.kernel.org # v5.7
---
 drivers/gpu/drm/i915/Makefile                 |   1 +
 .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
 drivers/gpu/drm/i915/i915_mitigations.c       | 148 ++++++++++++++++++
 drivers/gpu/drm/i915/i915_mitigations.h       |  13 ++
 4 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/i915/i915_mitigations.c
 create mode 100644 drivers/gpu/drm/i915/i915_mitigations.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 4074d8cb0d6e..48f82c354611 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -38,6 +38,7 @@ i915-y += i915_drv.o \
 	  i915_config.o \
 	  i915_irq.o \
 	  i915_getparam.o \
+	  i915_mitigations.o \
 	  i915_params.o \
 	  i915_pci.o \
 	  i915_scatterlist.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 724d56c9583d..657afd8ebc14 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -32,6 +32,7 @@
 #include "gen6_ppgtt.h"
 #include "gen7_renderclear.h"
 #include "i915_drv.h"
+#include "i915_mitigations.h"
 #include "intel_breadcrumbs.h"
 #include "intel_context.h"
 #include "intel_gt.h"
@@ -918,7 +919,8 @@ static int switch_context(struct i915_request *rq)
 	GEM_BUG_ON(HAS_EXECLISTS(engine->i915));
 
 	if (engine->wa_ctx.vma && ce != engine->kernel_context) {
-		if (engine->wa_ctx.vma->private != ce) {
+		if (engine->wa_ctx.vma->private != ce &&
+		    i915_mitigate_clear_residuals()) {
 			ret = clear_residuals(rq);
 			if (ret)
 				return ret;
diff --git a/drivers/gpu/drm/i915/i915_mitigations.c b/drivers/gpu/drm/i915/i915_mitigations.c
new file mode 100644
index 000000000000..8d5637cfa734
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_mitigations.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2021 Intel Corporation
+ */
+
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "i915_drv.h"
+#include "i915_mitigations.h"
+
+static unsigned long mitigations = ~0UL;
+
+enum {
+	CLEAR_RESIDUALS = 0,
+};
+
+static const char * const names[] = {
+	[CLEAR_RESIDUALS] = "residuals",
+};
+
+bool i915_mitigate_clear_residuals(void)
+{
+	return READ_ONCE(mitigations) & BIT(CLEAR_RESIDUALS);
+}
+
+static int mitigations_set(const char *val, const struct kernel_param *kp)
+{
+	unsigned long new = ~0UL;
+	char *str, *sep, *tok;
+	bool first = true;
+	int err = 0;
+
+	BUILD_BUG_ON(ARRAY_SIZE(names) >= BITS_PER_TYPE(mitigations));
+
+	str = kstrdup(val, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	for (sep = str; (tok = strsep(&sep, ","));) {
+		bool enable = true;
+		int i;
+
+		/* Be tolerant of leading/trailing whitespace */
+		tok = strim(tok);
+
+		if (first) {
+			first = false;
+
+			if (!strcmp(tok, "auto")) {
+				new = ~0UL;
+				continue;
+			}
+
+			new = 0;
+			if (!strcmp(tok, "off"))
+				continue;
+		}
+
+		if (*tok == '!') {
+			enable = !enable;
+			tok++;
+		}
+
+		if (!strncmp(tok, "no", 2)) {
+			enable = !enable;
+			tok += 2;
+		}
+
+		if (*tok == '\0')
+			continue;
+
+		for (i = 0; i < ARRAY_SIZE(names); i++) {
+			if (!strcmp(tok, names[i])) {
+				if (enable)
+					new |= BIT(i);
+				else
+					new &= ~BIT(i);
+				break;
+			}
+		}
+		if (i == ARRAY_SIZE(names)) {
+			pr_err("Bad %s.mitigations=%s, '%s' is unknown\n",
+			       DRIVER_NAME, val, tok);
+			err = -EINVAL;
+			break;
+		}
+	}
+	kfree(str);
+	if (err)
+		return err;
+
+	WRITE_ONCE(mitigations, new);
+	return 0;
+}
+
+static int mitigations_get(char *buffer, const struct kernel_param *kp)
+{
+	unsigned long local = READ_ONCE(mitigations);
+	int count, i;
+	bool enable;
+
+	if (!local)
+		return scnprintf(buffer, PAGE_SIZE, "%s\n", "off");
+
+	if (local & BIT(BITS_PER_LONG - 1)) {
+		count = scnprintf(buffer, PAGE_SIZE, "%s,", "auto");
+		enable = false;
+	} else {
+		enable = true;
+		count = 0;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(names); i++) {
+		if ((local & BIT(i)) != enable)
+			continue;
+
+		count += scnprintf(buffer + count, PAGE_SIZE - count,
+				   "%s%s,", enable ? "" : "!", names[i]);
+	}
+
+	buffer[count - 1] = '\n';
+	return count;
+}
+
+static const struct kernel_param_ops ops = {
+	.set = mitigations_set,
+	.get = mitigations_get,
+};
+
+module_param_cb_unsafe(mitigations, &ops, NULL, 0600);
+MODULE_PARM_DESC(mitigations,
+"Selectively enable security mitigations for all Intel® GPUs in the system.\n"
+"\n"
+"  auto -- enables all mitigations required for the platform [default]\n"
+"  off  -- disables all mitigations\n"
+"\n"
+"Individual mitigations can be enabled by passing a comma-separated string,\n"
+"e.g. mitigations=residuals to enable only clearing residuals or\n"
+"mitigations=auto,noresiduals to disable only the clear residual mitigation.\n"
+"Either '!' or 'no' may be used to switch from enabling the mitigation to\n"
+"disabling it.\n"
+"\n"
+"Active mitigations for Ivybridge, Baytrail, Haswell:\n"
+"  residuals -- clear all thread-local registers between contexts"
+);
diff --git a/drivers/gpu/drm/i915/i915_mitigations.h b/drivers/gpu/drm/i915/i915_mitigations.h
new file mode 100644
index 000000000000..1359d8135287
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_mitigations.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2021 Intel Corporation
+ */
+
+#ifndef __I915_MITIGATIONS_H__
+#define __I915_MITIGATIONS_H__
+
+#include <linux/types.h>
+
+bool i915_mitigate_clear_residuals(void);
+
+#endif /* __I915_MITIGATIONS_H__ */
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 04/11] drm/i915/gt: Rearrange vlv workarounds
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (2 preceding siblings ...)
  (?)
@ 2021-01-10 15:03 ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:03 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

Some rcs0 workarounds were being incorrectly applied to the GT, and so
we failed to restore the expected register settings after a reset.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_workarounds.c | 95 +++++++++++----------
 1 file changed, 51 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index c52433914d52..8006fd526100 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -889,53 +889,9 @@ ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 static void
 vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 {
-	/* WaDisableEarlyCull:vlv */
-	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
-
-	/* WaPsdDispatchEnable:vlv */
-	/* WaDisablePSDDualDispatchEnable:vlv */
-	wa_masked_en(wal,
-		     GEN7_HALF_SLICE_CHICKEN1,
-		     GEN7_MAX_PS_THREAD_DEP |
-		     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
-
-	/* WaDisable_RenderCache_OperationalFlush:vlv */
-	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
-
 	/* WaForceL3Serialization:vlv */
 	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
 
-	/*
-	 * WaVSThreadDispatchOverride:ivb,vlv
-	 *
-	 * This actually overrides the dispatch
-	 * mode for all thread types.
-	 */
-	wa_write_clr_set(wal,
-			 GEN7_FF_THREAD_MODE,
-			 GEN7_FF_SCHED_MASK,
-			 GEN7_FF_TS_SCHED_HW |
-			 GEN7_FF_VS_SCHED_HW |
-			 GEN7_FF_DS_SCHED_HW);
-
-	/*
-	 * BSpec says this must be set, even though
-	 * WaDisable4x2SubspanOptimization isn't listed for VLV.
-	 */
-	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
-
-	/*
-	 * BSpec recommends 8x4 when MSAA is used,
-	 * however in practice 16x4 seems fastest.
-	 *
-	 * Note that PS/WM thread counts depend on the WIZ hashing
-	 * disable bit, which we don't touch here, but it's good
-	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
-	 */
-	wa_add(wal, GEN7_GT_MODE, 0,
-	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
-	       GEN6_WIZ_HASHING_16x4);
-
 	/*
 	 * WaIncreaseL3CreditsForVLVB0:vlv
 	 * This is the hardware default actually.
@@ -1953,6 +1909,57 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 		       GEN6_WIZ_HASHING_16x4);
 	}
 
+	if (IS_VALLEYVIEW(i915)) {
+		/* WaDisableEarlyCull:vlv */
+		wa_masked_en(wal,
+			     _3D_CHICKEN3,
+			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
+
+		/*
+		 * WaVSThreadDispatchOverride:ivb,vlv
+		 *
+		 * This actually overrides the dispatch
+		 * mode for all thread types.
+		 */
+		wa_write_clr_set(wal,
+				 GEN7_FF_THREAD_MODE,
+				 GEN7_FF_SCHED_MASK,
+				 GEN7_FF_TS_SCHED_HW |
+				 GEN7_FF_VS_SCHED_HW |
+				 GEN7_FF_DS_SCHED_HW);
+
+		/* WaDisable_RenderCache_OperationalFlush:vlv */
+		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
+
+		/*
+		 * BSpec says this must be set, even though
+		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
+		 */
+		wa_masked_en(wal,
+			     CACHE_MODE_1,
+			     PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
+
+		/*
+		 * BSpec recommends 8x4 when MSAA is used,
+		 * however in practice 16x4 seems fastest.
+		 *
+		 * Note that PS/WM thread counts depend on the WIZ hashing
+		 * disable bit, which we don't touch here, but it's good
+		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+		 */
+		wa_add(wal, GEN7_GT_MODE, 0,
+		       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK,
+				     GEN6_WIZ_HASHING_16x4),
+		       GEN6_WIZ_HASHING_16x4);
+
+		/* WaPsdDispatchEnable:vlv */
+		/* WaDisablePSDDualDispatchEnable:vlv */
+		wa_masked_en(wal,
+			     GEN7_HALF_SLICE_CHICKEN1,
+			     GEN7_MAX_PS_THREAD_DEP |
+			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
+	}
+
 	if (IS_GEN(i915, 7))
 		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
 		wa_masked_en(wal,
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 05/11] drm/i915/gt: Rearrange ivb workarounds
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (3 preceding siblings ...)
  (?)
@ 2021-01-10 15:03 ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:03 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

Some rcs0 workarounds were being incorrectly applied to the GT, and so
we failed to restore the expected register settings after a reset.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_workarounds.c | 122 ++++++++------------
 1 file changed, 49 insertions(+), 73 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index 8006fd526100..d99773e6776e 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -829,18 +829,6 @@ snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 static void
 ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 {
-	/* WaDisableEarlyCull:ivb */
-	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
-
-	/* WaDisablePSDDualDispatchEnable:ivb */
-	if (IS_IVB_GT1(i915))
-		wa_masked_en(wal,
-			     GEN7_HALF_SLICE_CHICKEN1,
-			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
-
-	/* WaDisable_RenderCache_OperationalFlush:ivb */
-	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
-
 	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
 	wa_masked_dis(wal,
 		      GEN7_COMMON_SLICE_CHICKEN1,
@@ -852,38 +840,6 @@ ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 
 	/* WaForceL3Serialization:ivb */
 	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
-
-	/*
-	 * WaVSThreadDispatchOverride:ivb,vlv
-	 *
-	 * This actually overrides the dispatch
-	 * mode for all thread types.
-	 */
-	wa_write_clr_set(wal, GEN7_FF_THREAD_MODE,
-			 GEN7_FF_SCHED_MASK,
-			 GEN7_FF_TS_SCHED_HW |
-			 GEN7_FF_VS_SCHED_HW |
-			 GEN7_FF_DS_SCHED_HW);
-
-	if (0) { /* causes HiZ corruption on ivb:gt1 */
-		/* enable HiZ Raw Stall Optimization */
-		wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
-	}
-
-	/* WaDisable4x2SubspanOptimization:ivb */
-	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
-
-	/*
-	 * BSpec recommends 8x4 when MSAA is used,
-	 * however in practice 16x4 seems fastest.
-	 *
-	 * Note that PS/WM thread counts depend on the WIZ hashing
-	 * disable bit, which we don't touch here, but it's good
-	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
-	 */
-	wa_add(wal, GEN7_GT_MODE, 0,
-	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
-	       GEN6_WIZ_HASHING_16x4);
 }
 
 static void
@@ -1887,26 +1843,11 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 
 		wa_masked_dis(wal,
 			      CACHE_MODE_0_GEN7,
-			      /* WaDisable_RenderCache_OperationalFlush:hsw */
-			      RC_OP_FLUSH_ENABLE |
 			      /* enable HiZ Raw Stall Optimization */
 			      HIZ_RAW_STALL_OPT_DISABLE);
 
 		/* WaDisable4x2SubspanOptimization:hsw */
 		wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
-
-		/*
-		 * BSpec recommends 8x4 when MSAA is used,
-		 * however in practice 16x4 seems fastest.
-		 *
-		 * Note that PS/WM thread counts depend on the WIZ hashing
-		 * disable bit, which we don't touch here, but it's good
-		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
-		 */
-		wa_add(wal, GEN7_GT_MODE, 0,
-		       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK,
-				     GEN6_WIZ_HASHING_16x4),
-		       GEN6_WIZ_HASHING_16x4);
 	}
 
 	if (IS_VALLEYVIEW(i915)) {
@@ -1928,11 +1869,59 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 				 GEN7_FF_VS_SCHED_HW |
 				 GEN7_FF_DS_SCHED_HW);
 
-		/* WaDisable_RenderCache_OperationalFlush:vlv */
+		/* WaPsdDispatchEnable:vlv */
+		/* WaDisablePSDDualDispatchEnable:vlv */
+		wa_masked_en(wal,
+			     GEN7_HALF_SLICE_CHICKEN1,
+			     GEN7_MAX_PS_THREAD_DEP |
+			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
+	}
+
+	if (IS_IVYBRIDGE(i915)) {
+		/* WaDisableEarlyCull:ivb */
+		wa_masked_en(wal,
+			     _3D_CHICKEN3,
+			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
+
+		if (0) { /* causes HiZ corruption on ivb:gt1 */
+			/* enable HiZ Raw Stall Optimization */
+			wa_masked_dis(wal,
+				      CACHE_MODE_0_GEN7,
+				      HIZ_RAW_STALL_OPT_DISABLE);
+		}
+
+		/*
+		 * WaVSThreadDispatchOverride:ivb,vlv
+		 *
+		 * This actually overrides the dispatch
+		 * mode for all thread types.
+		 */
+		wa_write_clr_set(wal,
+				 GEN7_FF_THREAD_MODE,
+				 GEN7_FF_SCHED_MASK,
+				 GEN7_FF_TS_SCHED_HW |
+				 GEN7_FF_VS_SCHED_HW |
+				 GEN7_FF_DS_SCHED_HW);
+
+		/* WaDisablePSDDualDispatchEnable:ivb */
+		if (IS_IVB_GT1(i915))
+			wa_masked_en(wal,
+				     GEN7_HALF_SLICE_CHICKEN1,
+				     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
+	}
+
+	if (IS_GEN(i915, 7)) {
+		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
+		wa_masked_en(wal,
+			     GFX_MODE_GEN7,
+			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
+
+		/* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
 		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
 
 		/*
 		 * BSpec says this must be set, even though
+		 * WaDisable4x2SubspanOptimization:ivb,hsw
 		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
 		 */
 		wa_masked_en(wal,
@@ -1951,21 +1940,8 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 		       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK,
 				     GEN6_WIZ_HASHING_16x4),
 		       GEN6_WIZ_HASHING_16x4);
-
-		/* WaPsdDispatchEnable:vlv */
-		/* WaDisablePSDDualDispatchEnable:vlv */
-		wa_masked_en(wal,
-			     GEN7_HALF_SLICE_CHICKEN1,
-			     GEN7_MAX_PS_THREAD_DEP |
-			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
 	}
 
-	if (IS_GEN(i915, 7))
-		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
-		wa_masked_en(wal,
-			     GFX_MODE_GEN7,
-			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
-
 	if (IS_GEN_RANGE(i915, 6, 7))
 		/*
 		 * We need to disable the AsyncFlip performance optimisations in
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 06/11] drm/i915/gt: Replace open-coded intel_engine_stop_cs()
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (4 preceding siblings ...)
  (?)
@ 2021-01-10 15:03 ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:03 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

In the legacy ringbuffer submission, we still had an open-coded version
of intel_engine_stop_cs() with one additional verification step. Transfer
that verification to intel_engine_stop_cs() itself, and call it.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     | 15 +++++++++--
 .../gpu/drm/i915/gt/intel_ring_submission.c   | 25 +------------------
 2 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 1847d3c2ea99..58c900a12c13 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1048,8 +1048,19 @@ int intel_engine_stop_cs(struct intel_engine_cs *engine)
 
 	ENGINE_TRACE(engine, "\n");
 	if (__intel_engine_stop_cs(engine, 1000, stop_timeout(engine))) {
-		ENGINE_TRACE(engine, "timed out on STOP_RING -> IDLE\n");
-		err = -ETIMEDOUT;
+		ENGINE_TRACE(engine,
+			     "timed out on STOP_RING -> IDLE; HEAD:%04x, TAIL:%04x\n",
+			     ENGINE_READ_FW(engine, RING_HEAD) & HEAD_ADDR,
+			     ENGINE_READ_FW(engine, RING_TAIL) & TAIL_ADDR);
+
+		/*
+		 * Sometimes we observe that the idle flag is not
+		 * set even though the ring is empty. So double
+		 * check before giving up.
+		 */
+		if ((ENGINE_READ_FW(engine, RING_HEAD) & HEAD_ADDR) !=
+		    (ENGINE_READ_FW(engine, RING_TAIL) & TAIL_ADDR))
+			err = -ETIMEDOUT;
 	}
 
 	return err;
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 657afd8ebc14..20f42722be8b 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -159,30 +159,7 @@ static void ring_setup_status_page(struct intel_engine_cs *engine)
 
 static bool stop_ring(struct intel_engine_cs *engine)
 {
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	if (INTEL_GEN(dev_priv) > 2) {
-		ENGINE_WRITE(engine,
-			     RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING));
-		if (intel_wait_for_register(engine->uncore,
-					    RING_MI_MODE(engine->mmio_base),
-					    MODE_IDLE,
-					    MODE_IDLE,
-					    1000)) {
-			drm_err(&dev_priv->drm,
-				"%s : timed out trying to stop ring\n",
-				engine->name);
-
-			/*
-			 * Sometimes we observe that the idle flag is not
-			 * set even though the ring is empty. So double
-			 * check before giving up.
-			 */
-			if (ENGINE_READ(engine, RING_HEAD) !=
-			    ENGINE_READ(engine, RING_TAIL))
-				return false;
-		}
-	}
+	intel_engine_stop_cs(engine);
 
 	ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 07/11] drm/i915/gt: Reapply ppgtt enabling after engine resets
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (5 preceding siblings ...)
  (?)
@ 2021-01-10 15:04 ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

The GFX_MODE is reset along with the engine, turning off ppGTT. We need
to re-enable it upon resume.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c            |  9 ---------
 drivers/gpu/drm/i915/gt/intel_ring_submission.c | 13 ++++++++++---
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
index 680bd9442eb0..0f02afe7f43a 100644
--- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
@@ -27,8 +27,6 @@ void gen7_ppgtt_enable(struct intel_gt *gt)
 {
 	struct drm_i915_private *i915 = gt->i915;
 	struct intel_uncore *uncore = gt->uncore;
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
 	u32 ecochk;
 
 	intel_uncore_rmw(uncore, GAC_ECO_BITS, 0, ECOBITS_PPGTT_CACHE64B);
@@ -41,13 +39,6 @@ void gen7_ppgtt_enable(struct intel_gt *gt)
 		ecochk &= ~ECOCHK_PPGTT_GFDT_IVB;
 	}
 	intel_uncore_write(uncore, GAM_ECOCHK, ecochk);
-
-	for_each_engine(engine, gt, id) {
-		/* GFX_MODE is per-ring on gen7+ */
-		ENGINE_WRITE(engine,
-			     RING_MODE_GEN7,
-			     _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
-	}
 }
 
 void gen6_ppgtt_enable(struct intel_gt *gt)
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 20f42722be8b..01553f029ac1 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -189,9 +189,16 @@ static void set_pp_dir(struct intel_engine_cs *engine)
 {
 	struct i915_address_space *vm = vm_alias(engine->gt->vm);
 
-	if (vm) {
-		ENGINE_WRITE(engine, RING_PP_DIR_DCLV, PP_DIR_DCLV_2G);
-		ENGINE_WRITE(engine, RING_PP_DIR_BASE, pp_dir(vm));
+	if (!vm)
+		return;
+
+	ENGINE_WRITE(engine, RING_PP_DIR_DCLV, PP_DIR_DCLV_2G);
+	ENGINE_WRITE(engine, RING_PP_DIR_BASE, pp_dir(vm));
+
+	if (INTEL_GEN(engine->i915) >= 7) {
+		ENGINE_WRITE(engine,
+			     RING_MODE_GEN7,
+			     _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
 	}
 }
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 08/11] drm/i915/gt: Lift stop_ring() to reset_prepare
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (6 preceding siblings ...)
  (?)
@ 2021-01-10 15:04 ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

Push the sleeping stop_ring() out of the reset resume function to reset
prepare; we are not allowed to sleep in the former.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../gpu/drm/i915/gt/intel_ring_submission.c   | 97 +++++++------------
 1 file changed, 36 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 01553f029ac1..d7eabed01616 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -157,21 +157,6 @@ static void ring_setup_status_page(struct intel_engine_cs *engine)
 	flush_cs_tlb(engine);
 }
 
-static bool stop_ring(struct intel_engine_cs *engine)
-{
-	intel_engine_stop_cs(engine);
-
-	ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
-
-	ENGINE_WRITE(engine, RING_HEAD, 0);
-	ENGINE_WRITE(engine, RING_TAIL, 0);
-
-	/* The ring must be empty before it is disabled */
-	ENGINE_WRITE(engine, RING_CTL, 0);
-
-	return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0;
-}
-
 static struct i915_address_space *vm_alias(struct i915_address_space *vm)
 {
 	if (i915_is_ggtt(vm))
@@ -213,31 +198,6 @@ static int xcs_resume(struct intel_engine_cs *engine)
 
 	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
 
-	/* WaClearRingBufHeadRegAtInit:ctg,elk */
-	if (!stop_ring(engine)) {
-		/* G45 ring initialization often fails to reset head to zero */
-		drm_dbg(&dev_priv->drm, "%s head not reset to zero "
-			"ctl %08x head %08x tail %08x start %08x\n",
-			engine->name,
-			ENGINE_READ(engine, RING_CTL),
-			ENGINE_READ(engine, RING_HEAD),
-			ENGINE_READ(engine, RING_TAIL),
-			ENGINE_READ(engine, RING_START));
-
-		if (!stop_ring(engine)) {
-			drm_err(&dev_priv->drm,
-				"failed to set %s head to zero "
-				"ctl %08x head %08x tail %08x start %08x\n",
-				engine->name,
-				ENGINE_READ(engine, RING_CTL),
-				ENGINE_READ(engine, RING_HEAD),
-				ENGINE_READ(engine, RING_TAIL),
-				ENGINE_READ(engine, RING_START));
-			ret = -EIO;
-			goto out;
-		}
-	}
-
 	if (HWS_NEEDS_PHYSICAL(dev_priv))
 		ring_setup_phys_status_page(engine);
 	else
@@ -339,11 +299,21 @@ static void xcs_sanitize(struct intel_engine_cs *engine)
 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
 }
 
+static bool stop_ring(struct intel_engine_cs *engine)
+{
+	ENGINE_WRITE_FW(engine, RING_HEAD, ENGINE_READ_FW(engine, RING_TAIL));
+
+	ENGINE_WRITE_FW(engine, RING_HEAD, 0);
+	ENGINE_WRITE_FW(engine, RING_TAIL, 0);
+
+	/* The ring must be empty before it is disabled */
+	ENGINE_WRITE_FW(engine, RING_CTL, 0);
+
+	return (ENGINE_READ_FW(engine, RING_HEAD) & HEAD_ADDR) == 0;
+}
+
 static void reset_prepare(struct intel_engine_cs *engine)
 {
-	struct intel_uncore *uncore = engine->uncore;
-	const u32 base = engine->mmio_base;
-
 	/*
 	 * We stop engines, otherwise we might get failed reset and a
 	 * dead gpu (on elk). Also as modern gpu as kbl can suffer
@@ -355,30 +325,35 @@ static void reset_prepare(struct intel_engine_cs *engine)
 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
 	 *
 	 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
+	 * WaClearRingBufHeadRegAtInit:ctg,elk
 	 *
 	 * FIXME: Wa for more modern gens needs to be validated
 	 */
 	ENGINE_TRACE(engine, "\n");
+	intel_engine_stop_cs(engine);
 
-	if (intel_engine_stop_cs(engine))
-		ENGINE_TRACE(engine, "timed out on STOP_RING\n");
+	if (!stop_ring(engine)) {
+		/* G45 ring initialization often fails to reset head to zero */
+		drm_dbg(&engine->i915->drm,
+			"%s head not reset to zero "
+			"ctl %08x head %08x tail %08x start %08x\n",
+			engine->name,
+			ENGINE_READ_FW(engine, RING_CTL),
+			ENGINE_READ_FW(engine, RING_HEAD),
+			ENGINE_READ_FW(engine, RING_TAIL),
+			ENGINE_READ_FW(engine, RING_START));
+	}
 
-	intel_uncore_write_fw(uncore,
-			      RING_HEAD(base),
-			      intel_uncore_read_fw(uncore, RING_TAIL(base)));
-	intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */
-
-	intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
-	intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
-	intel_uncore_posting_read_fw(uncore, RING_TAIL(base));
-
-	/* The ring must be empty before it is disabled */
-	intel_uncore_write_fw(uncore, RING_CTL(base), 0);
-
-	/* Check acts as a post */
-	if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
-		ENGINE_TRACE(engine, "ring head [%x] not parked\n",
-			     intel_uncore_read_fw(uncore, RING_HEAD(base)));
+	if (!stop_ring(engine)) {
+		drm_err(&engine->i915->drm,
+			"failed to set %s head to zero "
+			"ctl %08x head %08x tail %08x start %08x\n",
+			engine->name,
+			ENGINE_READ_FW(engine, RING_CTL),
+			ENGINE_READ_FW(engine, RING_HEAD),
+			ENGINE_READ_FW(engine, RING_TAIL),
+			ENGINE_READ_FW(engine, RING_START));
+	}
 }
 
 static void reset_rewind(struct intel_engine_cs *engine, bool stalled)
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 09/11] drm/i915/gt: Pull ring submission resume under its caller forcewake
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (7 preceding siblings ...)
  (?)
@ 2021-01-10 15:04 ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

Take advantage of calling xcs_resume under a forcewake by using direct
mmio access. In particular, we can avoid the sleeping variants to allow
resume to be called from softirq context, required for engine resets.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../gpu/drm/i915/gt/intel_ring_submission.c   | 96 ++++++++-----------
 1 file changed, 42 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index d7eabed01616..c57f34bdd178 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -122,31 +122,27 @@ static void set_hwsp(struct intel_engine_cs *engine, u32 offset)
 		hwsp = RING_HWS_PGA(engine->mmio_base);
 	}
 
-	intel_uncore_write(engine->uncore, hwsp, offset);
-	intel_uncore_posting_read(engine->uncore, hwsp);
+	intel_uncore_write_fw(engine->uncore, hwsp, offset);
+	intel_uncore_posting_read_fw(engine->uncore, hwsp);
 }
 
 static void flush_cs_tlb(struct intel_engine_cs *engine)
 {
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	if (!IS_GEN_RANGE(dev_priv, 6, 7))
+	if (!IS_GEN_RANGE(engine->i915, 6, 7))
 		return;
 
 	/* ring should be idle before issuing a sync flush*/
-	drm_WARN_ON(&dev_priv->drm,
-		    (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
+	GEM_DEBUG_WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
 
-	ENGINE_WRITE(engine, RING_INSTPM,
-		     _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
-					INSTPM_SYNC_FLUSH));
-	if (intel_wait_for_register(engine->uncore,
-				    RING_INSTPM(engine->mmio_base),
-				    INSTPM_SYNC_FLUSH, 0,
-				    1000))
-		drm_err(&dev_priv->drm,
-			"%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
-			engine->name);
+	ENGINE_WRITE_FW(engine, RING_INSTPM,
+			_MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
+					   INSTPM_SYNC_FLUSH));
+	if (__intel_wait_for_register_fw(engine->uncore,
+					 RING_INSTPM(engine->mmio_base),
+					 INSTPM_SYNC_FLUSH, 0,
+					 2000, 0, NULL))
+		ENGINE_TRACE(engine,
+			     "wait for SyncFlush to complete for TLB invalidation timed out\n");
 }
 
 static void ring_setup_status_page(struct intel_engine_cs *engine)
@@ -177,13 +173,13 @@ static void set_pp_dir(struct intel_engine_cs *engine)
 	if (!vm)
 		return;
 
-	ENGINE_WRITE(engine, RING_PP_DIR_DCLV, PP_DIR_DCLV_2G);
-	ENGINE_WRITE(engine, RING_PP_DIR_BASE, pp_dir(vm));
+	ENGINE_WRITE_FW(engine, RING_PP_DIR_DCLV, PP_DIR_DCLV_2G);
+	ENGINE_WRITE_FW(engine, RING_PP_DIR_BASE, pp_dir(vm));
 
 	if (INTEL_GEN(engine->i915) >= 7) {
-		ENGINE_WRITE(engine,
-			     RING_MODE_GEN7,
-			     _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
+		ENGINE_WRITE_FW(engine,
+				RING_MODE_GEN7,
+				_MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
 	}
 }
 
@@ -191,13 +187,10 @@ static int xcs_resume(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 	struct intel_ring *ring = engine->legacy.ring;
-	int ret = 0;
 
 	ENGINE_TRACE(engine, "ring:{HEAD:%04x, TAIL:%04x}\n",
 		     ring->head, ring->tail);
 
-	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
-
 	if (HWS_NEEDS_PHYSICAL(dev_priv))
 		ring_setup_phys_status_page(engine);
 	else
@@ -205,16 +198,13 @@ static int xcs_resume(struct intel_engine_cs *engine)
 
 	intel_breadcrumbs_reset(engine->breadcrumbs);
 
-	/* Enforce ordering by reading HEAD register back */
-	ENGINE_POSTING_READ(engine, RING_HEAD);
-
 	/*
 	 * Initialize the ring. This must happen _after_ we've cleared the ring
 	 * registers with the above sequence (the readback of the HEAD registers
 	 * also enforces ordering), otherwise the hw might lose the new ring
 	 * register values.
 	 */
-	ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma));
+	ENGINE_WRITE_FW(engine, RING_START, i915_ggtt_offset(ring->vma));
 
 	/* Check that the ring offsets point within the ring! */
 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
@@ -224,46 +214,44 @@ static int xcs_resume(struct intel_engine_cs *engine)
 	set_pp_dir(engine);
 
 	/* First wake the ring up to an empty/idle ring */
-	ENGINE_WRITE(engine, RING_HEAD, ring->head);
-	ENGINE_WRITE(engine, RING_TAIL, ring->head);
+	ENGINE_WRITE_FW(engine, RING_HEAD, ring->head);
+	ENGINE_WRITE_FW(engine, RING_TAIL, ring->head);
 	ENGINE_POSTING_READ(engine, RING_TAIL);
 
-	ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID);
+	ENGINE_WRITE_FW(engine, RING_CTL,
+			RING_CTL_SIZE(ring->size) | RING_VALID);
 
 	/* If the head is still not zero, the ring is dead */
-	if (intel_wait_for_register(engine->uncore,
-				    RING_CTL(engine->mmio_base),
-				    RING_VALID, RING_VALID,
-				    50)) {
-		drm_err(&dev_priv->drm, "%s initialization failed "
-			  "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
-			  engine->name,
-			  ENGINE_READ(engine, RING_CTL),
-			  ENGINE_READ(engine, RING_CTL) & RING_VALID,
-			  ENGINE_READ(engine, RING_HEAD), ring->head,
-			  ENGINE_READ(engine, RING_TAIL), ring->tail,
-			  ENGINE_READ(engine, RING_START),
-			  i915_ggtt_offset(ring->vma));
-		ret = -EIO;
-		goto out;
+	if (__intel_wait_for_register_fw(engine->uncore,
+					 RING_CTL(engine->mmio_base),
+					 RING_VALID, RING_VALID,
+					 5000, 0, NULL)) {
+		drm_err(&dev_priv->drm,
+			"%s initialization failed; "
+			"ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
+			engine->name,
+			ENGINE_READ(engine, RING_CTL),
+			ENGINE_READ(engine, RING_CTL) & RING_VALID,
+			ENGINE_READ(engine, RING_HEAD), ring->head,
+			ENGINE_READ(engine, RING_TAIL), ring->tail,
+			ENGINE_READ(engine, RING_START),
+			i915_ggtt_offset(ring->vma));
+		return -EIO;
 	}
 
 	if (INTEL_GEN(dev_priv) > 2)
-		ENGINE_WRITE(engine,
-			     RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
+		ENGINE_WRITE_FW(engine,
+				RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
 
 	/* Now awake, let it get started */
 	if (ring->tail != ring->head) {
-		ENGINE_WRITE(engine, RING_TAIL, ring->tail);
+		ENGINE_WRITE_FW(engine, RING_TAIL, ring->tail);
 		ENGINE_POSTING_READ(engine, RING_TAIL);
 	}
 
 	/* Papering over lost _interrupts_ immediately following the restart */
 	intel_engine_signal_breadcrumbs(engine);
-out:
-	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
-
-	return ret;
+	return 0;
 }
 
 static void sanitize_hwsp(struct intel_engine_cs *engine)
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 10/11] drm/i915/selftests: Prepare the selftests for engine resets with ring submission
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (8 preceding siblings ...)
  (?)
@ 2021-01-10 15:04 ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

The engine reset selftests kick the tasklets, which was safe up until now as only
execlists supported engine resets.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 18 ++++++++++++++----
 drivers/gpu/drm/i915/gt/selftest_reset.c     | 11 ++++++++---
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
index c28d1fcad673..28f71cc2004d 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -560,6 +560,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
 
 	for_each_engine(engine, gt, id) {
 		unsigned int reset_count, reset_engine_count;
+		unsigned long count;
 		IGT_TIMEOUT(end_time);
 
 		if (active && !intel_engine_can_store_dword(engine))
@@ -577,6 +578,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
 
 		st_engine_heartbeat_disable(engine);
 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+		count = 0;
 		do {
 			if (active) {
 				struct i915_request *rq;
@@ -625,9 +627,13 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
 				err = -EINVAL;
 				break;
 			}
+
+			count++;
 		} while (time_before(jiffies, end_time));
 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 		st_engine_heartbeat_enable(engine);
+		pr_info("%s: Completed %lu %s resets\n",
+			engine->name, count, active ? "active" : "idle");
 
 		if (err)
 			break;
@@ -1478,7 +1484,8 @@ static int igt_reset_queue(void *arg)
 			prev = rq;
 			count++;
 		} while (time_before(jiffies, end_time));
-		pr_info("%s: Completed %d resets\n", engine->name, count);
+		pr_info("%s: Completed %d queued resets\n",
+			engine->name, count);
 
 		*h.batch = MI_BATCH_BUFFER_END;
 		intel_gt_chipset_flush(engine->gt);
@@ -1575,7 +1582,8 @@ static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
 	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
 		  engine->name, mode, p->name);
 
-	tasklet_disable(t);
+	if (t->func)
+		tasklet_disable(t);
 	if (strcmp(p->name, "softirq"))
 		local_bh_disable();
 	p->critical_section_begin();
@@ -1585,8 +1593,10 @@ static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
 	p->critical_section_end();
 	if (strcmp(p->name, "softirq"))
 		local_bh_enable();
-	tasklet_enable(t);
-	tasklet_hi_schedule(t);
+	if (t->func) {
+		tasklet_enable(t);
+		tasklet_hi_schedule(t);
+	}
 
 	if (err)
 		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c
index b7befcfbdcde..8784257ec808 100644
--- a/drivers/gpu/drm/i915/gt/selftest_reset.c
+++ b/drivers/gpu/drm/i915/gt/selftest_reset.c
@@ -321,7 +321,10 @@ static int igt_atomic_engine_reset(void *arg)
 		goto out_unlock;
 
 	for_each_engine(engine, gt, id) {
-		tasklet_disable(&engine->execlists.tasklet);
+		struct tasklet_struct *t = &engine->execlists.tasklet;
+
+		if (t->func)
+			tasklet_disable(t);
 		intel_engine_pm_get(engine);
 
 		for (p = igt_atomic_phases; p->name; p++) {
@@ -345,8 +348,10 @@ static int igt_atomic_engine_reset(void *arg)
 		}
 
 		intel_engine_pm_put(engine);
-		tasklet_enable(&engine->execlists.tasklet);
-		tasklet_hi_schedule(&engine->execlists.tasklet);
+		if (t->func) {
+			tasklet_enable(t);
+			tasklet_hi_schedule(t);
+		}
 		if (err)
 			break;
 	}
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH 11/11] drm/i915: Mark per-engine-reset as supported on gen7
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (9 preceding siblings ...)
  (?)
@ 2021-01-10 15:04 ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-10 15:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

The benefit of only resetting a single engine is that we leave other
streams of userspace work intact across a hang; vital for process
isolation. We had wired up individual engine resets for gen6, but only
enabled it from gen8; now let's turn it on for the forgotten gen7. gen6
is still a mystery as how to unravel some global state that appears to
be reset along with an engine (in particular the ppgtt enabling in
GFX_MODE).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_pci.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index 39608381b4a4..020b5f561f07 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -455,6 +455,7 @@ static const struct intel_device_info snb_m_gt2_info = {
 	.has_llc = 1, \
 	.has_rc6 = 1, \
 	.has_rc6p = 1, \
+	.has_reset_engine = true, \
 	.has_rps = true, \
 	.dma_mask_size = 40, \
 	.ppgtt_type = INTEL_PPGTT_ALIASING, \
@@ -513,6 +514,7 @@ static const struct intel_device_info vlv_info = {
 	.cpu_transcoder_mask = BIT(TRANSCODER_A) | BIT(TRANSCODER_B),
 	.has_runtime_pm = 1,
 	.has_rc6 = 1,
+	.has_reset_engine = true,
 	.has_rps = true,
 	.display.has_gmch = 1,
 	.display.has_hotplug = 1,
@@ -571,8 +573,7 @@ static const struct intel_device_info hsw_gt3_info = {
 	.dma_mask_size = 39, \
 	.ppgtt_type = INTEL_PPGTT_FULL, \
 	.ppgtt_size = 48, \
-	.has_64bit_reloc = 1, \
-	.has_reset_engine = 1
+	.has_64bit_reloc = 1
 
 #define BDW_PLATFORM \
 	GEN8_FEATURES, \
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/11] drm/i915/gt: Limit VFE threads based on GT
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (10 preceding siblings ...)
  (?)
@ 2021-01-10 15:35 ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2021-01-10 15:35 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/11] drm/i915/gt: Limit VFE threads based on GT
URL   : https://patchwork.freedesktop.org/series/85682/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
b57630875319 drm/i915/gt: Limit VFE threads based on GT
3de2a3eaddac drm/i915/gt: Restore clear-residual mitigations for Ivybridge, Baytrail
caa80494f198 drm/i915: Allow the sysadmin to override security mitigations
-:54: WARNING:FILE_PATH_CHANGES: added, moved or deleted file(s), does MAINTAINERS need updating?
#54: 
new file mode 100644

-:193: CHECK:PARENTHESIS_ALIGNMENT: Alignment should match open parenthesis
#193: FILE: drivers/gpu/drm/i915/i915_mitigations.c:135:
+MODULE_PARM_DESC(mitigations,
+"Selectively enable security mitigations for all Intel® GPUs in the system.\n"

total: 0 errors, 1 warnings, 1 checks, 184 lines checked
46082f463e50 drm/i915/gt: Rearrange vlv workarounds
db29438bd0f1 drm/i915/gt: Rearrange ivb workarounds
66c741851381 drm/i915/gt: Replace open-coded intel_engine_stop_cs()
1695ce6b5260 drm/i915/gt: Reapply ppgtt enabling after engine resets
d42ad5b8ac19 drm/i915/gt: Lift stop_ring() to reset_prepare
04d6143b4820 drm/i915/gt: Pull ring submission resume under its caller forcewake
92e8b37223ae drm/i915/selftests: Prepare the selftests for engine resets with ring submission
0dee39e5ddf4 drm/i915: Mark per-engine-reset as supported on gen7


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [Intel-gfx] ✗ Fi.CI.SPARSE: warning for series starting with [01/11] drm/i915/gt: Limit VFE threads based on GT
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (11 preceding siblings ...)
  (?)
@ 2021-01-10 15:35 ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2021-01-10 15:35 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/11] drm/i915/gt: Limit VFE threads based on GT
URL   : https://patchwork.freedesktop.org/series/85682/
State : warning

== Summary ==

$ dim sparse --fast origin/drm-tip
Sparse version: v0.6.2
Fast mode used, each commit won't be checked separately.
-
+drivers/gpu/drm/i915/gt/intel_reset.c:1329:5: warning: context imbalance in 'intel_gt_reset_trylock' - different lock contexts for basic block
+./include/linux/seqlock.h:843:24: warning: trying to copy expression type 31
+./include/linux/seqlock.h:843:24: warning: trying to copy expression type 31
+./include/linux/seqlock.h:869:16: warning: trying to copy expression type 31


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [Intel-gfx] ✗ Fi.CI.BAT: failure for series starting with [01/11] drm/i915/gt: Limit VFE threads based on GT
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
                   ` (12 preceding siblings ...)
  (?)
@ 2021-01-10 16:05 ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2021-01-10 16:05 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 6435 bytes --]

== Series Details ==

Series: series starting with [01/11] drm/i915/gt: Limit VFE threads based on GT
URL   : https://patchwork.freedesktop.org/series/85682/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_9573 -> Patchwork_19311
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_19311 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_19311, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_19311:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live@execlists:
    - fi-bsw-n3050:       [PASS][1] -> [INCOMPLETE][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9573/fi-bsw-n3050/igt@i915_selftest@live@execlists.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-bsw-n3050/igt@i915_selftest@live@execlists.html

  * igt@i915_selftest@live@hangcheck:
    - fi-snb-2520m:       [PASS][3] -> [INCOMPLETE][4]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9573/fi-snb-2520m/igt@i915_selftest@live@hangcheck.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-snb-2520m/igt@i915_selftest@live@hangcheck.html

  
Known issues
------------

  Here are the changes found in Patchwork_19311 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@amdgpu/amd_basic@userptr:
    - fi-byt-j1900:       NOTRUN -> [SKIP][5] ([fdo#109271]) +17 similar issues
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-byt-j1900/igt@amdgpu/amd_basic@userptr.html

  * igt@i915_selftest@live@gt_lrc:
    - fi-bsw-n3050:       [PASS][6] -> [DMESG-FAIL][7] ([i915#2675])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9573/fi-bsw-n3050/igt@i915_selftest@live@gt_lrc.html
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-bsw-n3050/igt@i915_selftest@live@gt_lrc.html

  * igt@prime_self_import@basic-with_one_bo_two_files:
    - fi-tgl-y:           [PASS][8] -> [DMESG-WARN][9] ([i915#402]) +2 similar issues
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9573/fi-tgl-y/igt@prime_self_import@basic-with_one_bo_two_files.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-tgl-y/igt@prime_self_import@basic-with_one_bo_two_files.html

  * igt@runner@aborted:
    - fi-bdw-5557u:       NOTRUN -> [FAIL][10] ([i915#2029])
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-bdw-5557u/igt@runner@aborted.html
    - fi-bsw-n3050:       NOTRUN -> [FAIL][11] ([i915#1436])
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-bsw-n3050/igt@runner@aborted.html
    - fi-tgl-u2:          NOTRUN -> [FAIL][12] ([i915#1602] / [i915#2029] / [i915#456])
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-tgl-u2/igt@runner@aborted.html

  
#### Possible fixes ####

  * igt@gem_ringfill@basic-all:
    - fi-tgl-y:           [DMESG-WARN][13] ([i915#402]) -> [PASS][14] +1 similar issue
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9573/fi-tgl-y/igt@gem_ringfill@basic-all.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-tgl-y/igt@gem_ringfill@basic-all.html

  * igt@i915_pm_rpm@module-reload:
    - fi-byt-j1900:       [INCOMPLETE][15] ([i915#142] / [i915#2405]) -> [PASS][16]
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9573/fi-byt-j1900/igt@i915_pm_rpm@module-reload.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-byt-j1900/igt@i915_pm_rpm@module-reload.html

  * igt@kms_chamelium@dp-crc-fast:
    - fi-kbl-7500u:       [FAIL][17] ([i915#1161] / [i915#262]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9573/fi-kbl-7500u/igt@kms_chamelium@dp-crc-fast.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/fi-kbl-7500u/igt@kms_chamelium@dp-crc-fast.html

  
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [i915#1161]: https://gitlab.freedesktop.org/drm/intel/issues/1161
  [i915#142]: https://gitlab.freedesktop.org/drm/intel/issues/142
  [i915#1436]: https://gitlab.freedesktop.org/drm/intel/issues/1436
  [i915#1602]: https://gitlab.freedesktop.org/drm/intel/issues/1602
  [i915#2029]: https://gitlab.freedesktop.org/drm/intel/issues/2029
  [i915#2405]: https://gitlab.freedesktop.org/drm/intel/issues/2405
  [i915#262]: https://gitlab.freedesktop.org/drm/intel/issues/262
  [i915#2675]: https://gitlab.freedesktop.org/drm/intel/issues/2675
  [i915#402]: https://gitlab.freedesktop.org/drm/intel/issues/402
  [i915#456]: https://gitlab.freedesktop.org/drm/intel/issues/456


Participating hosts (43 -> 38)
------------------------------

  Missing    (5): fi-ilk-m540 fi-hsw-4200u fi-bsw-cyan fi-ctg-p8600 fi-bdw-samus 


Build changes
-------------

  * Linux: CI_DRM_9573 -> Patchwork_19311

  CI-20190529: 20190529
  CI_DRM_9573: cd0df21e28c36de80356344ff8683be2813c6ff2 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5953: 65c5eea699141e6f942ce0a8fc85db76ce53cd19 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_19311: 0dee39e5ddf4f8bed9fbac4d4c4a7bed38900990 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

0dee39e5ddf4 drm/i915: Mark per-engine-reset as supported on gen7
92e8b37223ae drm/i915/selftests: Prepare the selftests for engine resets with ring submission
04d6143b4820 drm/i915/gt: Pull ring submission resume under its caller forcewake
d42ad5b8ac19 drm/i915/gt: Lift stop_ring() to reset_prepare
1695ce6b5260 drm/i915/gt: Reapply ppgtt enabling after engine resets
66c741851381 drm/i915/gt: Replace open-coded intel_engine_stop_cs()
db29438bd0f1 drm/i915/gt: Rearrange ivb workarounds
46082f463e50 drm/i915/gt: Rearrange vlv workarounds
caa80494f198 drm/i915: Allow the sysadmin to override security mitigations
3de2a3eaddac drm/i915/gt: Restore clear-residual mitigations for Ivybridge, Baytrail
b57630875319 drm/i915/gt: Limit VFE threads based on GT

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19311/index.html

[-- Attachment #1.2: Type: text/html, Size: 7499 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
  2021-01-10 15:03   ` [Intel-gfx] " Chris Wilson
@ 2021-01-11 17:31     ` Bloomfield, Jon
  -1 siblings, 0 replies; 30+ messages in thread
From: Bloomfield, Jon @ 2021-01-11 17:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Joonas Lahtinen, Vivi, Rodrigo, stable

> -----Original Message-----
> From: Chris Wilson <chris@chris-wilson.co.uk>
> Sent: Sunday, January 10, 2021 7:04 AM
> To: intel-gfx@lists.freedesktop.org
> Cc: Chris Wilson <chris@chris-wilson.co.uk>; Joonas Lahtinen
> <joonas.lahtinen@linux.intel.com>; Bloomfield, Jon
> <jon.bloomfield@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>;
> stable@vger.kernel.org
> Subject: [PATCH 03/11] drm/i915: Allow the sysadmin to override security
> mitigations
> 
> The clear-residuals mitigation is a relatively heavy hammer and under some
> circumstances the user may wish to forgo the context isolation in order
> to meet some performance requirement. Introduce a generic module
> parameter to allow selectively enabling/disabling different mitigations.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858
> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: stable@vger.kernel.org # v5.7
> ---

Reviewed-by: Jon Bloomfield <jon.bloomfield@intel.com>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
@ 2021-01-11 17:31     ` Bloomfield, Jon
  0 siblings, 0 replies; 30+ messages in thread
From: Bloomfield, Jon @ 2021-01-11 17:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: stable

> -----Original Message-----
> From: Chris Wilson <chris@chris-wilson.co.uk>
> Sent: Sunday, January 10, 2021 7:04 AM
> To: intel-gfx@lists.freedesktop.org
> Cc: Chris Wilson <chris@chris-wilson.co.uk>; Joonas Lahtinen
> <joonas.lahtinen@linux.intel.com>; Bloomfield, Jon
> <jon.bloomfield@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>;
> stable@vger.kernel.org
> Subject: [PATCH 03/11] drm/i915: Allow the sysadmin to override security
> mitigations
> 
> The clear-residuals mitigation is a relatively heavy hammer and under some
> circumstances the user may wish to forgo the context isolation in order
> to meet some performance requirement. Introduce a generic module
> parameter to allow selectively enabling/disabling different mitigations.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858
> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: stable@vger.kernel.org # v5.7
> ---

Reviewed-by: Jon Bloomfield <jon.bloomfield@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT
  2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
@ 2021-01-11 17:35   ` Rodrigo Vivi
  -1 siblings, 0 replies; 30+ messages in thread
From: Rodrigo Vivi @ 2021-01-11 17:35 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, Randy Wright, stable

On Sun, Jan 10, 2021 at 03:03:54PM +0000, Chris Wilson wrote:
> MEDIA_STATE_VFE only accepts the 'maximum number of threads' in the
> range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
> based on platform and the number of EU based on the number of slices and
> subslices. This is a fixed number per platform/gt, so appropriately
> limit the number of threads we spawn to match the device.
> 
> v2: Oversaturate the system with tasks to force execution on every HW
> thread; if the thread idles it is returned to the pool and may be reused
> again before an unused thread.
> 
> v3: Fix more state commands, which was causing Baytrail to barf.

CI is still not happy with byt, right? Or is that a false positive?

> v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: Randy Wright <rwright@hpe.com>
> Cc: stable@vger.kernel.org # v5.7+
> ---
>  drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++---------
>  1 file changed, 94 insertions(+), 63 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> index d93d85cd3027..f32a8e8040b2 100644
> --- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> @@ -7,8 +7,6 @@
>  #include "i915_drv.h"
>  #include "intel_gpu_commands.h"
>  
> -#define MAX_URB_ENTRIES 64
> -#define STATE_SIZE (4 * 1024)
>  #define GT3_INLINE_DATA_DELAYS 0x1E00
>  #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
>  
> @@ -34,38 +32,59 @@ struct batch_chunk {
>  };
>  
>  struct batch_vals {
> -	u32 max_primitives;
> -	u32 max_urb_entries;
> -	u32 cmd_size;
> -	u32 state_size;
> +	u32 max_threads;
>  	u32 state_start;
> -	u32 batch_size;
> +	u32 surface_start;
>  	u32 surface_height;
>  	u32 surface_width;
> -	u32 scratch_size;
> -	u32 max_size;
> +	u32 size;
>  };
>  
> +static inline int num_primitives(const struct batch_vals *bv)
> +{
> +	/*
> +	 * We need to saturate the GPU with work in order to dispatch
> +	 * a shader on every HW thread, and clear the thread-local registers.
> +	 * In short, we have to dispatch work faster than the shaders can
> +	 * run in order to fill occupy each HW thread.
> +	 */
> +	return bv->max_threads;
> +}
> +
>  static void
>  batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
>  {
>  	if (IS_HASWELL(i915)) {
> -		bv->max_primitives = 280;
> -		bv->max_urb_entries = MAX_URB_ENTRIES;
> +		switch (INTEL_INFO(i915)->gt) {
> +		default:
> +		case 1:
> +			bv->max_threads = 70;
> +			break;
> +		case 2:
> +			bv->max_threads = 140;
> +			break;
> +		case 3:
> +			bv->max_threads = 280;
> +			break;
> +		}
>  		bv->surface_height = 16 * 16;
>  		bv->surface_width = 32 * 2 * 16;
>  	} else {
> -		bv->max_primitives = 128;
> -		bv->max_urb_entries = MAX_URB_ENTRIES / 2;
> +		switch (INTEL_INFO(i915)->gt) {
> +		default:
> +		case 1: /* including vlv */
> +			bv->max_threads = 36;
> +			break;
> +		case 2:
> +			bv->max_threads = 128;
> +			break;
> +		}
>  		bv->surface_height = 16 * 8;
>  		bv->surface_width = 32 * 16;

All the values above match the spec.

>  	}
> -	bv->cmd_size = bv->max_primitives * 4096;
> -	bv->state_size = STATE_SIZE;
> -	bv->state_start = bv->cmd_size;
> -	bv->batch_size = bv->cmd_size + bv->state_size;
> -	bv->scratch_size = bv->surface_height * bv->surface_width;
> -	bv->max_size = bv->batch_size + bv->scratch_size;
> +	bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
> +	bv->surface_start = bv->state_start + SZ_4K;
> +	bv->size = bv->surface_start + bv->surface_height * bv->surface_width;

I liked this batch values simplification...

>  }
>  
>  static void batch_init(struct batch_chunk *bc,
> @@ -155,7 +174,8 @@ static u32
>  gen7_fill_binding_table(struct batch_chunk *state,
>  			const struct batch_vals *bv)
>  {
> -	u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv);
> +	u32 surface_start =
> +		gen7_fill_surface_state(state, bv->surface_start, bv);
>  	u32 *cs = batch_alloc_items(state, 32, 8);
>  	u32 offset = batch_offset(state, cs);
>  
> @@ -214,9 +234,9 @@ static void
>  gen7_emit_state_base_address(struct batch_chunk *batch,
>  			     u32 surface_state_base)
>  {
> -	u32 *cs = batch_alloc_items(batch, 0, 12);
> +	u32 *cs = batch_alloc_items(batch, 0, 10);
>  
> -	*cs++ = STATE_BASE_ADDRESS | (12 - 2);
> +	*cs++ = STATE_BASE_ADDRESS | (10 - 2);
>  	/* general */
>  	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
>  	/* surface */
> @@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
>  	*cs++ = BASE_ADDRESS_MODIFY;
>  	*cs++ = 0;
>  	*cs++ = BASE_ADDRESS_MODIFY;
> -	*cs++ = 0;
> -	*cs++ = 0;

why don't we need this anymore?

>  	batch_advance(batch, cs);
>  }
>  
> @@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
>  		    u32 urb_size, u32 curbe_size,
>  		    u32 mode)
>  {
> -	u32 urb_entries = bv->max_urb_entries;
> -	u32 threads = bv->max_primitives - 1;
> +	u32 threads = bv->max_threads - 1;
>  	u32 *cs = batch_alloc_items(batch, 32, 8);
>  
>  	*cs++ = MEDIA_VFE_STATE | (8 - 2);
> @@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
>  	*cs++ = 0;
>  
>  	/* number of threads & urb entries for GPGPU vs Media Mode */
> -	*cs++ = threads << 16 | urb_entries << 8 | mode << 2;
> +	*cs++ = threads << 16 | 1 << 8 | mode << 2;
>  
>  	*cs++ = 0;
>  
> @@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
>  {
>  	unsigned int x_offset = (media_object_index % 16) * 64;
>  	unsigned int y_offset = (media_object_index / 16) * 16;
> -	unsigned int inline_data_size;
> -	unsigned int media_batch_size;
> -	unsigned int i;
> +	unsigned int pkt = 6 + 3;
>  	u32 *cs;
>  
> -	inline_data_size = 112 * 8;
> -	media_batch_size = inline_data_size + 6;
> +	cs = batch_alloc_items(batch, 8, pkt);
>  
> -	cs = batch_alloc_items(batch, 8, media_batch_size);
> -
> -	*cs++ = MEDIA_OBJECT | (media_batch_size - 2);
> +	*cs++ = MEDIA_OBJECT | (pkt - 2);
>  
>  	/* interface descriptor offset */
>  	*cs++ = 0;
> @@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
>  	*cs++ = 0;
>  
>  	/* inline */
> -	*cs++ = (y_offset << 16) | (x_offset);
> +	*cs++ = y_offset << 16 | x_offset;
>  	*cs++ = 0;
>  	*cs++ = GT3_INLINE_DATA_DELAYS;
> -	for (i = 3; i < inline_data_size; i++)
> -		*cs++ = 0;

why?

>  
>  	batch_advance(batch, cs);
>  }
>  
>  static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
>  {
> -	u32 *cs = batch_alloc_items(batch, 0, 5);
> +	u32 *cs = batch_alloc_items(batch, 0, 4);
>  
> -	*cs++ = GFX_OP_PIPE_CONTROL(5);
> -	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
> -		PIPE_CONTROL_GLOBAL_GTT_IVB;
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> +		PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +		PIPE_CONTROL_DC_FLUSH_ENABLE |
> +		PIPE_CONTROL_CS_STALL;
>  	*cs++ = 0;
>  	*cs++ = 0;
> +
> +	batch_advance(batch, cs);
> +}
> +
> +static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
> +{
> +	u32 *cs = batch_alloc_items(batch, 0, 8);
> +
> +	/* ivb: Stall before STATE_CACHE_INVALIDATE */
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
> +		PIPE_CONTROL_CS_STALL;
>  	*cs++ = 0;
> +	*cs++ = 0;
> +
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +
>  	batch_advance(batch, cs);
>  }
>  
> @@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
>  		       const struct batch_vals *bv)
>  {
>  	struct drm_i915_private *i915 = vma->vm->i915;
> -	unsigned int desc_count = 64;
> -	const u32 urb_size = 112;
> +	const unsigned int desc_count = 1;
> +	const unsigned int urb_size = 1;
>  	struct batch_chunk cmds, state;
> -	u32 interface_descriptor;
> +	u32 descriptors;
>  	unsigned int i;
>  
> -	batch_init(&cmds, vma, start, 0, bv->cmd_size);
> -	batch_init(&state, vma, start, bv->state_start, bv->state_size);
> +	batch_init(&cmds, vma, start, 0, bv->state_start);
> +	batch_init(&state, vma, start, bv->state_start, SZ_4K);
>  
> -	interface_descriptor =
> -		gen7_fill_interface_descriptor(&state, bv,
> -					       IS_HASWELL(i915) ?
> -					       &cb_kernel_hsw :
> -					       &cb_kernel_ivb,
> -					       desc_count);
> -	gen7_emit_pipeline_flush(&cmds);
> +	descriptors = gen7_fill_interface_descriptor(&state, bv,
> +						     IS_HASWELL(i915) ?
> +						     &cb_kernel_hsw :
> +						     &cb_kernel_ivb,
> +						     desc_count);
> +
> +	gen7_emit_pipeline_invalidate(&cmds);
>  	batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
>  	batch_add(&cmds, MI_NOOP);
> -	gen7_emit_state_base_address(&cmds, interface_descriptor);
> +	gen7_emit_pipeline_invalidate(&cmds);
> +
>  	gen7_emit_pipeline_flush(&cmds);
> +	gen7_emit_state_base_address(&cmds, descriptors);
> +	gen7_emit_pipeline_invalidate(&cmds);

why do we need double invalidate?

>  
>  	gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0);
> +	gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count);
>  
> -	gen7_emit_interface_descriptor_load(&cmds,
> -					    interface_descriptor,
> -					    desc_count);
> -
> -	for (i = 0; i < bv->max_primitives; i++)
> +	for (i = 0; i < num_primitives(bv); i++)
>  		gen7_emit_media_object(&cmds, i);
>  
>  	batch_add(&cmds, MI_BATCH_BUFFER_END);
> @@ -385,15 +416,15 @@ int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
>  
>  	batch_get_defaults(engine->i915, &bv);
>  	if (!vma)
> -		return bv.max_size;
> +		return bv.size;
>  
> -	GEM_BUG_ON(vma->obj->base.size < bv.max_size);
> +	GEM_BUG_ON(vma->obj->base.size < bv.size);
>  
>  	batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
>  	if (IS_ERR(batch))
>  		return PTR_ERR(batch);
>  
> -	emit_batch(vma, memset(batch, 0, bv.max_size), &bv);
> +	emit_batch(vma, memset(batch, 0, bv.size), &bv);
>  
>  	i915_gem_object_flush_map(vma->obj);
>  	__i915_gem_object_release_map(vma->obj);
> -- 
> 2.20.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT
@ 2021-01-11 17:35   ` Rodrigo Vivi
  0 siblings, 0 replies; 30+ messages in thread
From: Rodrigo Vivi @ 2021-01-11 17:35 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, stable, Randy Wright

On Sun, Jan 10, 2021 at 03:03:54PM +0000, Chris Wilson wrote:
> MEDIA_STATE_VFE only accepts the 'maximum number of threads' in the
> range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
> based on plaform and the number of EU based on the number of slices and
> subslices. This is a fixed number per platform/gt, so appropriately
> limit the number of threads we spawn to match the device.
> 
> v2: Oversaturate the system with tasks to force execution on every HW
> thread; if the thread idles it is returned to the pool and may be reused
> again before an unused thread.
> 
> v3: Fix more state commands, which was causing Baytrail to barf.

CI is still not happy with byt, right? Or is that a false positive?

> v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: Randy Wright <rwright@hpe.com>
> Cc: stable@vger.kernel.org # v5.7+
> ---
>  drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++---------
>  1 file changed, 94 insertions(+), 63 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> index d93d85cd3027..f32a8e8040b2 100644
> --- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> @@ -7,8 +7,6 @@
>  #include "i915_drv.h"
>  #include "intel_gpu_commands.h"
>  
> -#define MAX_URB_ENTRIES 64
> -#define STATE_SIZE (4 * 1024)
>  #define GT3_INLINE_DATA_DELAYS 0x1E00
>  #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
>  
> @@ -34,38 +32,59 @@ struct batch_chunk {
>  };
>  
>  struct batch_vals {
> -	u32 max_primitives;
> -	u32 max_urb_entries;
> -	u32 cmd_size;
> -	u32 state_size;
> +	u32 max_threads;
>  	u32 state_start;
> -	u32 batch_size;
> +	u32 surface_start;
>  	u32 surface_height;
>  	u32 surface_width;
> -	u32 scratch_size;
> -	u32 max_size;
> +	u32 size;
>  };
>  
> +static inline int num_primitives(const struct batch_vals *bv)
> +{
> +	/*
> +	 * We need to saturate the GPU with work in order to dispatch
> +	 * a shader on every HW thread, and clear the thread-local registers.
> +	 * In short, we have to dispatch work faster than the shaders can
> +	 * run in order to fill occupy each HW thread.
> +	 */
> +	return bv->max_threads;
> +}
> +
>  static void
>  batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
>  {
>  	if (IS_HASWELL(i915)) {
> -		bv->max_primitives = 280;
> -		bv->max_urb_entries = MAX_URB_ENTRIES;
> +		switch (INTEL_INFO(i915)->gt) {
> +		default:
> +		case 1:
> +			bv->max_threads = 70;
> +			break;
> +		case 2:
> +			bv->max_threads = 140;
> +			break;
> +		case 3:
> +			bv->max_threads = 280;
> +			break;
> +		}
>  		bv->surface_height = 16 * 16;
>  		bv->surface_width = 32 * 2 * 16;
>  	} else {
> -		bv->max_primitives = 128;
> -		bv->max_urb_entries = MAX_URB_ENTRIES / 2;
> +		switch (INTEL_INFO(i915)->gt) {
> +		default:
> +		case 1: /* including vlv */
> +			bv->max_threads = 36;
> +			break;
> +		case 2:
> +			bv->max_threads = 128;
> +			break;
> +		}
>  		bv->surface_height = 16 * 8;
>  		bv->surface_width = 32 * 16;

all the values above match the spec.

>  	}
> -	bv->cmd_size = bv->max_primitives * 4096;
> -	bv->state_size = STATE_SIZE;
> -	bv->state_start = bv->cmd_size;
> -	bv->batch_size = bv->cmd_size + bv->state_size;
> -	bv->scratch_size = bv->surface_height * bv->surface_width;
> -	bv->max_size = bv->batch_size + bv->scratch_size;
> +	bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
> +	bv->surface_start = bv->state_start + SZ_4K;
> +	bv->size = bv->surface_start + bv->surface_height * bv->surface_width;

I liked this batch values simplification...

>  }
>  
>  static void batch_init(struct batch_chunk *bc,
> @@ -155,7 +174,8 @@ static u32
>  gen7_fill_binding_table(struct batch_chunk *state,
>  			const struct batch_vals *bv)
>  {
> -	u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv);
> +	u32 surface_start =
> +		gen7_fill_surface_state(state, bv->surface_start, bv);
>  	u32 *cs = batch_alloc_items(state, 32, 8);
>  	u32 offset = batch_offset(state, cs);
>  
> @@ -214,9 +234,9 @@ static void
>  gen7_emit_state_base_address(struct batch_chunk *batch,
>  			     u32 surface_state_base)
>  {
> -	u32 *cs = batch_alloc_items(batch, 0, 12);
> +	u32 *cs = batch_alloc_items(batch, 0, 10);
>  
> -	*cs++ = STATE_BASE_ADDRESS | (12 - 2);
> +	*cs++ = STATE_BASE_ADDRESS | (10 - 2);
>  	/* general */
>  	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
>  	/* surface */
> @@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
>  	*cs++ = BASE_ADDRESS_MODIFY;
>  	*cs++ = 0;
>  	*cs++ = BASE_ADDRESS_MODIFY;
> -	*cs++ = 0;
> -	*cs++ = 0;

why don't we need this anymore?

>  	batch_advance(batch, cs);
>  }
>  
> @@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
>  		    u32 urb_size, u32 curbe_size,
>  		    u32 mode)
>  {
> -	u32 urb_entries = bv->max_urb_entries;
> -	u32 threads = bv->max_primitives - 1;
> +	u32 threads = bv->max_threads - 1;
>  	u32 *cs = batch_alloc_items(batch, 32, 8);
>  
>  	*cs++ = MEDIA_VFE_STATE | (8 - 2);
> @@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
>  	*cs++ = 0;
>  
>  	/* number of threads & urb entries for GPGPU vs Media Mode */
> -	*cs++ = threads << 16 | urb_entries << 8 | mode << 2;
> +	*cs++ = threads << 16 | 1 << 8 | mode << 2;
>  
>  	*cs++ = 0;
>  
> @@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
>  {
>  	unsigned int x_offset = (media_object_index % 16) * 64;
>  	unsigned int y_offset = (media_object_index / 16) * 16;
> -	unsigned int inline_data_size;
> -	unsigned int media_batch_size;
> -	unsigned int i;
> +	unsigned int pkt = 6 + 3;
>  	u32 *cs;
>  
> -	inline_data_size = 112 * 8;
> -	media_batch_size = inline_data_size + 6;
> +	cs = batch_alloc_items(batch, 8, pkt);
>  
> -	cs = batch_alloc_items(batch, 8, media_batch_size);
> -
> -	*cs++ = MEDIA_OBJECT | (media_batch_size - 2);
> +	*cs++ = MEDIA_OBJECT | (pkt - 2);
>  
>  	/* interface descriptor offset */
>  	*cs++ = 0;
> @@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
>  	*cs++ = 0;
>  
>  	/* inline */
> -	*cs++ = (y_offset << 16) | (x_offset);
> +	*cs++ = y_offset << 16 | x_offset;
>  	*cs++ = 0;
>  	*cs++ = GT3_INLINE_DATA_DELAYS;
> -	for (i = 3; i < inline_data_size; i++)
> -		*cs++ = 0;

why?

>  
>  	batch_advance(batch, cs);
>  }
>  
>  static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
>  {
> -	u32 *cs = batch_alloc_items(batch, 0, 5);
> +	u32 *cs = batch_alloc_items(batch, 0, 4);
>  
> -	*cs++ = GFX_OP_PIPE_CONTROL(5);
> -	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
> -		PIPE_CONTROL_GLOBAL_GTT_IVB;
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> +		PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +		PIPE_CONTROL_DC_FLUSH_ENABLE |
> +		PIPE_CONTROL_CS_STALL;
>  	*cs++ = 0;
>  	*cs++ = 0;
> +
> +	batch_advance(batch, cs);
> +}
> +
> +static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
> +{
> +	u32 *cs = batch_alloc_items(batch, 0, 8);
> +
> +	/* ivb: Stall before STATE_CACHE_INVALIDATE */
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
> +		PIPE_CONTROL_CS_STALL;
>  	*cs++ = 0;
> +	*cs++ = 0;
> +
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +
>  	batch_advance(batch, cs);
>  }
>  
> @@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
>  		       const struct batch_vals *bv)
>  {
>  	struct drm_i915_private *i915 = vma->vm->i915;
> -	unsigned int desc_count = 64;
> -	const u32 urb_size = 112;
> +	const unsigned int desc_count = 1;
> +	const unsigned int urb_size = 1;
>  	struct batch_chunk cmds, state;
> -	u32 interface_descriptor;
> +	u32 descriptors;
>  	unsigned int i;
>  
> -	batch_init(&cmds, vma, start, 0, bv->cmd_size);
> -	batch_init(&state, vma, start, bv->state_start, bv->state_size);
> +	batch_init(&cmds, vma, start, 0, bv->state_start);
> +	batch_init(&state, vma, start, bv->state_start, SZ_4K);
>  
> -	interface_descriptor =
> -		gen7_fill_interface_descriptor(&state, bv,
> -					       IS_HASWELL(i915) ?
> -					       &cb_kernel_hsw :
> -					       &cb_kernel_ivb,
> -					       desc_count);
> -	gen7_emit_pipeline_flush(&cmds);
> +	descriptors = gen7_fill_interface_descriptor(&state, bv,
> +						     IS_HASWELL(i915) ?
> +						     &cb_kernel_hsw :
> +						     &cb_kernel_ivb,
> +						     desc_count);
> +
> +	gen7_emit_pipeline_invalidate(&cmds);
>  	batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
>  	batch_add(&cmds, MI_NOOP);
> -	gen7_emit_state_base_address(&cmds, interface_descriptor);
> +	gen7_emit_pipeline_invalidate(&cmds);
> +
>  	gen7_emit_pipeline_flush(&cmds);
> +	gen7_emit_state_base_address(&cmds, descriptors);
> +	gen7_emit_pipeline_invalidate(&cmds);

why do we need double invalidate?

>  
>  	gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0);
> +	gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count);
>  
> -	gen7_emit_interface_descriptor_load(&cmds,
> -					    interface_descriptor,
> -					    desc_count);
> -
> -	for (i = 0; i < bv->max_primitives; i++)
> +	for (i = 0; i < num_primitives(bv); i++)
>  		gen7_emit_media_object(&cmds, i);
>  
>  	batch_add(&cmds, MI_BATCH_BUFFER_END);
> @@ -385,15 +416,15 @@ int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
>  
>  	batch_get_defaults(engine->i915, &bv);
>  	if (!vma)
> -		return bv.max_size;
> +		return bv.size;
>  
> -	GEM_BUG_ON(vma->obj->base.size < bv.max_size);
> +	GEM_BUG_ON(vma->obj->base.size < bv.size);
>  
>  	batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
>  	if (IS_ERR(batch))
>  		return PTR_ERR(batch);
>  
> -	emit_batch(vma, memset(batch, 0, bv.max_size), &bv);
> +	emit_batch(vma, memset(batch, 0, bv.size), &bv);
>  
>  	i915_gem_object_flush_map(vma->obj);
>  	__i915_gem_object_release_map(vma->obj);
> -- 
> 2.20.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 02/11] drm/i915/gt: Restore clear-residual mitigations for Ivybridge, Baytrail
  2021-01-10 15:03 ` [Intel-gfx] [PATCH 02/11] drm/i915/gt: Restore clear-residual mitigations for Ivybridge, Baytrail Chris Wilson
@ 2021-01-11 17:35   ` Rodrigo Vivi
  0 siblings, 0 replies; 30+ messages in thread
From: Rodrigo Vivi @ 2021-01-11 17:35 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Sun, Jan 10, 2021 at 03:03:55PM +0000, Chris Wilson wrote:
> The mitigation is required for all gen7 platforms, now that it does not
> cause GPU hangs, restore it for Ivybridge and Baytrail.
> 
> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
> Cc: Bloomfield Jon <jon.bloomfield@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_ring_submission.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 1c6d421f6fe5..724d56c9583d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -1324,7 +1324,7 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine)
>  
>  	GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma);
>  
> -	if (IS_HASWELL(engine->i915) && engine->class == RENDER_CLASS) {
> +	if (IS_GEN(engine->i915, 7) && engine->class == RENDER_CLASS) {

when CI is really happy

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>


>  		err = gen7_ctx_switch_bb_init(engine);
>  		if (err)
>  			goto err_ring_unpin;
> -- 
> 2.20.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
  2021-01-10 15:03   ` [Intel-gfx] " Chris Wilson
@ 2021-01-11 17:48     ` Rodrigo Vivi
  -1 siblings, 0 replies; 30+ messages in thread
From: Rodrigo Vivi @ 2021-01-11 17:48 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, stable

On Sun, Jan 10, 2021 at 03:03:56PM +0000, Chris Wilson wrote:
> The clear-residuals mitigation is a relatively heavy hammer and under some
> circumstances the user may wish to forgo the context isolation in order
> to meet some performance requirement. Introduce a generic module
> parameter to allow selectively enabling/disabling different mitigations.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858

I'm afraid this will have the same fate as the rc6 and the validation impact :/

> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: stable@vger.kernel.org # v5.7
> ---
>  drivers/gpu/drm/i915/Makefile                 |   1 +
>  .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
>  drivers/gpu/drm/i915/i915_mitigations.c       | 148 ++++++++++++++++++
>  drivers/gpu/drm/i915/i915_mitigations.h       |  13 ++
>  4 files changed, 165 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/i915/i915_mitigations.c
>  create mode 100644 drivers/gpu/drm/i915/i915_mitigations.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 4074d8cb0d6e..48f82c354611 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -38,6 +38,7 @@ i915-y += i915_drv.o \
>  	  i915_config.o \
>  	  i915_irq.o \
>  	  i915_getparam.o \
> +	  i915_mitigations.o \
>  	  i915_params.o \
>  	  i915_pci.o \
>  	  i915_scatterlist.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 724d56c9583d..657afd8ebc14 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -32,6 +32,7 @@
>  #include "gen6_ppgtt.h"
>  #include "gen7_renderclear.h"
>  #include "i915_drv.h"
> +#include "i915_mitigations.h"
>  #include "intel_breadcrumbs.h"
>  #include "intel_context.h"
>  #include "intel_gt.h"
> @@ -918,7 +919,8 @@ static int switch_context(struct i915_request *rq)
>  	GEM_BUG_ON(HAS_EXECLISTS(engine->i915));
>  
>  	if (engine->wa_ctx.vma && ce != engine->kernel_context) {
> -		if (engine->wa_ctx.vma->private != ce) {
> +		if (engine->wa_ctx.vma->private != ce &&
> +		    i915_mitigate_clear_residuals()) {
>  			ret = clear_residuals(rq);
>  			if (ret)
>  				return ret;
> diff --git a/drivers/gpu/drm/i915/i915_mitigations.c b/drivers/gpu/drm/i915/i915_mitigations.c
> new file mode 100644
> index 000000000000..8d5637cfa734
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_mitigations.c
> @@ -0,0 +1,148 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/moduleparam.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +
> +#include "i915_drv.h"
> +#include "i915_mitigations.h"
> +
> +static unsigned long mitigations = ~0UL;
> +
> +enum {
> +	CLEAR_RESIDUALS = 0,

especially worse if this list grows...

> +};
> +
> +static const char * const names[] = {
> +	[CLEAR_RESIDUALS] = "residuals",
> +};
> +
> +bool i915_mitigate_clear_residuals(void)
> +{
> +	return READ_ONCE(mitigations) & BIT(CLEAR_RESIDUALS);
> +}
> +
> +static int mitigations_set(const char *val, const struct kernel_param *kp)
> +{
> +	unsigned long new = ~0UL;
> +	char *str, *sep, *tok;
> +	bool first = true;
> +	int err = 0;
> +
> +	BUILD_BUG_ON(ARRAY_SIZE(names) >= BITS_PER_TYPE(mitigations));
> +
> +	str = kstrdup(val, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	for (sep = str; (tok = strsep(&sep, ","));) {
> +		bool enable = true;
> +		int i;
> +
> +		/* Be tolerant of leading/trailing whitespace */
> +		tok = strim(tok);
> +
> +		if (first) {
> +			first = false;
> +
> +			if (!strcmp(tok, "auto")) {
> +				new = ~0UL;
> +				continue;
> +			}
> +
> +			new = 0;
> +			if (!strcmp(tok, "off"))
> +				continue;
> +		}
> +
> +		if (*tok == '!') {
> +			enable = !enable;
> +			tok++;
> +		}
> +
> +		if (!strncmp(tok, "no", 2)) {
> +			enable = !enable;
> +			tok += 2;
> +		}
> +
> +		if (*tok == '\0')
> +			continue;
> +
> +		for (i = 0; i < ARRAY_SIZE(names); i++) {
> +			if (!strcmp(tok, names[i])) {
> +				if (enable)
> +					new |= BIT(i);
> +				else
> +					new &= ~BIT(i);
> +				break;
> +			}
> +		}
> +		if (i == ARRAY_SIZE(names)) {
> +			pr_err("Bad %s.mitigations=%s, '%s' is unknown\n",
> +			       DRIVER_NAME, val, tok);
> +			err = -EINVAL;
> +			break;
> +		}
> +	}
> +	kfree(str);
> +	if (err)
> +		return err;
> +
> +	WRITE_ONCE(mitigations, new);
> +	return 0;
> +}
> +
> +static int mitigations_get(char *buffer, const struct kernel_param *kp)
> +{
> +	unsigned long local = READ_ONCE(mitigations);
> +	int count, i;
> +	bool enable;
> +
> +	if (!local)
> +		return scnprintf(buffer, PAGE_SIZE, "%s\n", "off");
> +
> +	if (local & BIT(BITS_PER_LONG - 1)) {
> +		count = scnprintf(buffer, PAGE_SIZE, "%s,", "auto");
> +		enable = false;
> +	} else {
> +		enable = true;
> +		count = 0;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(names); i++) {
> +		if ((local & BIT(i)) != enable)
> +			continue;
> +
> +		count += scnprintf(buffer + count, PAGE_SIZE - count,
> +				   "%s%s,", enable ? "" : "!", names[i]);
> +	}
> +
> +	buffer[count - 1] = '\n';
> +	return count;
> +}
> +
> +static const struct kernel_param_ops ops = {
> +	.set = mitigations_set,
> +	.get = mitigations_get,
> +};
> +
> +module_param_cb_unsafe(mitigations, &ops, NULL, 0600);
> +MODULE_PARM_DESC(mitigations,
> +"Selectively enable security mitigations for all Intel® GPUs in the system.\n"
> +"\n"
> +"  auto -- enables all mitigations required for the platform [default]\n"
> +"  off  -- disables all mitigations\n"
> +"\n"
> +"Individual mitigations can be enabled by passing a comma-separated string,\n"
> +"e.g. mitigations=residuals to enable only clearing residuals or\n"
> +"mitigations=auto,noresiduals to disable only the clear residual mitigation.\n"
> +"Either '!' or 'no' may be used to switch from enabling the mitigation to\n"
> +"disabling it.\n"

but I liked this structure to at least stop the growing of the params...

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

> +"\n"
> +"Active mitigations for Ivybridge, Baytrail, Haswell:\n"
> +"  residuals -- clear all thread-local registers between contexts"
> +);
> diff --git a/drivers/gpu/drm/i915/i915_mitigations.h b/drivers/gpu/drm/i915/i915_mitigations.h
> new file mode 100644
> index 000000000000..1359d8135287
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_mitigations.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +#ifndef __I915_MITIGATIONS_H__
> +#define __I915_MITIGATIONS_H__
> +
> +#include <linux/types.h>
> +
> +bool i915_mitigate_clear_residuals(void);
> +
> +#endif /* __I915_MITIGATIONS_H__ */
> -- 
> 2.20.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
@ 2021-01-11 17:48     ` Rodrigo Vivi
  0 siblings, 0 replies; 30+ messages in thread
From: Rodrigo Vivi @ 2021-01-11 17:48 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, stable

On Sun, Jan 10, 2021 at 03:03:56PM +0000, Chris Wilson wrote:
> The clear-residuals mitigation is a relatively heavy hammer and under some
> circumstances the user may wish to forgo the context isolation in order
> to meet some performance requirement. Introduce a generic module
> parameter to allow selectively enabling/disabling different mitigations.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858

I'm afraid this will have the same fate as the rc6 and the validation impact :/

> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: stable@vger.kernel.org # v5.7
> ---
>  drivers/gpu/drm/i915/Makefile                 |   1 +
>  .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
>  drivers/gpu/drm/i915/i915_mitigations.c       | 148 ++++++++++++++++++
>  drivers/gpu/drm/i915/i915_mitigations.h       |  13 ++
>  4 files changed, 165 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/i915/i915_mitigations.c
>  create mode 100644 drivers/gpu/drm/i915/i915_mitigations.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 4074d8cb0d6e..48f82c354611 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -38,6 +38,7 @@ i915-y += i915_drv.o \
>  	  i915_config.o \
>  	  i915_irq.o \
>  	  i915_getparam.o \
> +	  i915_mitigations.o \
>  	  i915_params.o \
>  	  i915_pci.o \
>  	  i915_scatterlist.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 724d56c9583d..657afd8ebc14 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -32,6 +32,7 @@
>  #include "gen6_ppgtt.h"
>  #include "gen7_renderclear.h"
>  #include "i915_drv.h"
> +#include "i915_mitigations.h"
>  #include "intel_breadcrumbs.h"
>  #include "intel_context.h"
>  #include "intel_gt.h"
> @@ -918,7 +919,8 @@ static int switch_context(struct i915_request *rq)
>  	GEM_BUG_ON(HAS_EXECLISTS(engine->i915));
>  
>  	if (engine->wa_ctx.vma && ce != engine->kernel_context) {
> -		if (engine->wa_ctx.vma->private != ce) {
> +		if (engine->wa_ctx.vma->private != ce &&
> +		    i915_mitigate_clear_residuals()) {
>  			ret = clear_residuals(rq);
>  			if (ret)
>  				return ret;
> diff --git a/drivers/gpu/drm/i915/i915_mitigations.c b/drivers/gpu/drm/i915/i915_mitigations.c
> new file mode 100644
> index 000000000000..8d5637cfa734
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_mitigations.c
> @@ -0,0 +1,148 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/moduleparam.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +
> +#include "i915_drv.h"
> +#include "i915_mitigations.h"
> +
> +static unsigned long mitigations = ~0UL;
> +
> +enum {
> +	CLEAR_RESIDUALS = 0,

especially worse if this list grows...

> +};
> +
> +static const char * const names[] = {
> +	[CLEAR_RESIDUALS] = "residuals",
> +};
> +
> +bool i915_mitigate_clear_residuals(void)
> +{
> +	return READ_ONCE(mitigations) & BIT(CLEAR_RESIDUALS);
> +}
> +
> +static int mitigations_set(const char *val, const struct kernel_param *kp)
> +{
> +	unsigned long new = ~0UL;
> +	char *str, *sep, *tok;
> +	bool first = true;
> +	int err = 0;
> +
> +	BUILD_BUG_ON(ARRAY_SIZE(names) >= BITS_PER_TYPE(mitigations));
> +
> +	str = kstrdup(val, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	for (sep = str; (tok = strsep(&sep, ","));) {
> +		bool enable = true;
> +		int i;
> +
> +		/* Be tolerant of leading/trailing whitespace */
> +		tok = strim(tok);
> +
> +		if (first) {
> +			first = false;
> +
> +			if (!strcmp(tok, "auto")) {
> +				new = ~0UL;
> +				continue;
> +			}
> +
> +			new = 0;
> +			if (!strcmp(tok, "off"))
> +				continue;
> +		}
> +
> +		if (*tok == '!') {
> +			enable = !enable;
> +			tok++;
> +		}
> +
> +		if (!strncmp(tok, "no", 2)) {
> +			enable = !enable;
> +			tok += 2;
> +		}
> +
> +		if (*tok == '\0')
> +			continue;
> +
> +		for (i = 0; i < ARRAY_SIZE(names); i++) {
> +			if (!strcmp(tok, names[i])) {
> +				if (enable)
> +					new |= BIT(i);
> +				else
> +					new &= ~BIT(i);
> +				break;
> +			}
> +		}
> +		if (i == ARRAY_SIZE(names)) {
> +			pr_err("Bad %s.mitigations=%s, '%s' is unknown\n",
> +			       DRIVER_NAME, val, tok);
> +			err = -EINVAL;
> +			break;
> +		}
> +	}
> +	kfree(str);
> +	if (err)
> +		return err;
> +
> +	WRITE_ONCE(mitigations, new);
> +	return 0;
> +}
> +
> +static int mitigations_get(char *buffer, const struct kernel_param *kp)
> +{
> +	unsigned long local = READ_ONCE(mitigations);
> +	int count, i;
> +	bool enable;
> +
> +	if (!local)
> +		return scnprintf(buffer, PAGE_SIZE, "%s\n", "off");
> +
> +	if (local & BIT(BITS_PER_LONG - 1)) {
> +		count = scnprintf(buffer, PAGE_SIZE, "%s,", "auto");
> +		enable = false;
> +	} else {
> +		enable = true;
> +		count = 0;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(names); i++) {
> +		if ((local & BIT(i)) != enable)
> +			continue;
> +
> +		count += scnprintf(buffer + count, PAGE_SIZE - count,
> +				   "%s%s,", enable ? "" : "!", names[i]);
> +	}
> +
> +	buffer[count - 1] = '\n';
> +	return count;
> +}
> +
> +static const struct kernel_param_ops ops = {
> +	.set = mitigations_set,
> +	.get = mitigations_get,
> +};
> +
> +module_param_cb_unsafe(mitigations, &ops, NULL, 0600);
> +MODULE_PARM_DESC(mitigations,
> +"Selectively enable security mitigations for all Intel® GPUs in the system.\n"
> +"\n"
> +"  auto -- enables all mitigations required for the platform [default]\n"
> +"  off  -- disables all mitigations\n"
> +"\n"
> +"Individual mitigations can be enabled by passing a comma-separated string,\n"
> +"e.g. mitigations=residuals to enable only clearing residuals or\n"
> +"mitigations=auto,noresiduals to disable only the clear residual mitigation.\n"
> +"Either '!' or 'no' may be used to switch from enabling the mitigation to\n"
> +"disabling it.\n"

but I liked this structure to at least stop the growing of the params...

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

> +"\n"
> +"Active mitigations for Ivybridge, Baytrail, Haswell:\n"
> +"  residuals -- clear all thread-local registers between contexts"
> +);
> diff --git a/drivers/gpu/drm/i915/i915_mitigations.h b/drivers/gpu/drm/i915/i915_mitigations.h
> new file mode 100644
> index 000000000000..1359d8135287
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_mitigations.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +#ifndef __I915_MITIGATIONS_H__
> +#define __I915_MITIGATIONS_H__
> +
> +#include <linux/types.h>
> +
> +bool i915_mitigate_clear_residuals(void);
> +
> +#endif /* __I915_MITIGATIONS_H__ */
> -- 
> 2.20.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT
  2021-01-11 17:35   ` Rodrigo Vivi
@ 2021-01-11 20:51     ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-11 20:51 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: intel-gfx, stable, Randy Wright

Quoting Rodrigo Vivi (2021-01-11 17:35:12)
> On Sun, Jan 10, 2021 at 03:03:54PM +0000, Chris Wilson wrote:
> > MEDIA_STATE_VFE only accepts the 'maximum number of threads' in the
> > range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
> > based on plaform and the number of EU based on the number of slices and
> > subslices. This is a fixed number per platform/gt, so appropriately
> > limit the number of threads we spawn to match the device.
> > 
> > v2: Oversaturate the system with tasks to force execution on every HW
> > thread; if the thread idles it is returned to the pool and may be reused
> > again before an unused thread.
> > 
> > v3: Fix more state commands, which was causing Baytrail to barf.
> 
> CI is still not happy with byt right? or is that false positive?

After v3, ivb still failed.
 
> > v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge

Right now with the multiple pipecontrols around the PIPELINE_SELECT *and*
STATE_BASE, CI has been happy for multiple runs. I was able to reproduce
the same selftest failures and confirm that we do not see any of those
failures in a thousand iterations. High level of confidence, but since
we are dealing with empirical results with cross-referencing to mesa who
also have seen similar undocumented failures, there's still an element
of doubt as to whether it is truly watertight.

The CI results for this series passed on the all important ivb,byt,hsw.

> > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
> > Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> > Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
> > Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > Cc: Randy Wright <rwright@hpe.com>
> > Cc: stable@vger.kernel.org # v5.7+
> > ---
> >  drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++---------
> >  1 file changed, 94 insertions(+), 63 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > index d93d85cd3027..f32a8e8040b2 100644
> > --- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > @@ -7,8 +7,6 @@
> >  #include "i915_drv.h"
> >  #include "intel_gpu_commands.h"
> >  
> > -#define MAX_URB_ENTRIES 64
> > -#define STATE_SIZE (4 * 1024)
> >  #define GT3_INLINE_DATA_DELAYS 0x1E00
> >  #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
> >  
> > @@ -34,38 +32,59 @@ struct batch_chunk {
> >  };
> >  
> >  struct batch_vals {
> > -     u32 max_primitives;
> > -     u32 max_urb_entries;
> > -     u32 cmd_size;
> > -     u32 state_size;
> > +     u32 max_threads;
> >       u32 state_start;
> > -     u32 batch_size;
> > +     u32 surface_start;
> >       u32 surface_height;
> >       u32 surface_width;
> > -     u32 scratch_size;
> > -     u32 max_size;
> > +     u32 size;
> >  };
> >  
> > +static inline int num_primitives(const struct batch_vals *bv)
> > +{
> > +     /*
> > +      * We need to saturate the GPU with work in order to dispatch
> > +      * a shader on every HW thread, and clear the thread-local registers.
> > +      * In short, we have to dispatch work faster than the shaders can
> > +      * run in order to fill occupy each HW thread.
> > +      */
> > +     return bv->max_threads;
> > +}
> > +
> >  static void
> >  batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
> >  {
> >       if (IS_HASWELL(i915)) {
> > -             bv->max_primitives = 280;
> > -             bv->max_urb_entries = MAX_URB_ENTRIES;
> > +             switch (INTEL_INFO(i915)->gt) {
> > +             default:
> > +             case 1:
> > +                     bv->max_threads = 70;
> > +                     break;
> > +             case 2:
> > +                     bv->max_threads = 140;
> > +                     break;
> > +             case 3:
> > +                     bv->max_threads = 280;
> > +                     break;
> > +             }
> >               bv->surface_height = 16 * 16;
> >               bv->surface_width = 32 * 2 * 16;
> >       } else {
> > -             bv->max_primitives = 128;
> > -             bv->max_urb_entries = MAX_URB_ENTRIES / 2;
> > +             switch (INTEL_INFO(i915)->gt) {
> > +             default:
> > +             case 1: /* including vlv */
> > +                     bv->max_threads = 36;
> > +                     break;
> > +             case 2:
> > +                     bv->max_threads = 128;
> > +                     break;
> > +             }
> >               bv->surface_height = 16 * 8;
> >               bv->surface_width = 32 * 16;
> 
> all the values above matches the spec.
> 
> >       }
> > -     bv->cmd_size = bv->max_primitives * 4096;
> > -     bv->state_size = STATE_SIZE;
> > -     bv->state_start = bv->cmd_size;
> > -     bv->batch_size = bv->cmd_size + bv->state_size;
> > -     bv->scratch_size = bv->surface_height * bv->surface_width;
> > -     bv->max_size = bv->batch_size + bv->scratch_size;
> > +     bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
> > +     bv->surface_start = bv->state_start + SZ_4K;
> > +     bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
> 
> I liked this batch values simplification...
> 
> >  }
> >  
> >  static void batch_init(struct batch_chunk *bc,
> > @@ -155,7 +174,8 @@ static u32
> >  gen7_fill_binding_table(struct batch_chunk *state,
> >                       const struct batch_vals *bv)
> >  {
> > -     u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv);
> > +     u32 surface_start =
> > +             gen7_fill_surface_state(state, bv->surface_start, bv);
> >       u32 *cs = batch_alloc_items(state, 32, 8);
> >       u32 offset = batch_offset(state, cs);
> >  
> > @@ -214,9 +234,9 @@ static void
> >  gen7_emit_state_base_address(struct batch_chunk *batch,
> >                            u32 surface_state_base)
> >  {
> > -     u32 *cs = batch_alloc_items(batch, 0, 12);
> > +     u32 *cs = batch_alloc_items(batch, 0, 10);
> >  
> > -     *cs++ = STATE_BASE_ADDRESS | (12 - 2);
> > +     *cs++ = STATE_BASE_ADDRESS | (10 - 2);
> >       /* general */
> >       *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
> >       /* surface */
> > @@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
> >       *cs++ = BASE_ADDRESS_MODIFY;
> >       *cs++ = 0;
> >       *cs++ = BASE_ADDRESS_MODIFY;
> > -     *cs++ = 0;
> > -     *cs++ = 0;
> 
> why don't we need this anymore?

It was incorrect, gen7 is just (10-2). The last two were extraneous
padding.

> >       batch_advance(batch, cs);
> >  }
> >  
> > @@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
> >                   u32 urb_size, u32 curbe_size,
> >                   u32 mode)
> >  {
> > -     u32 urb_entries = bv->max_urb_entries;
> > -     u32 threads = bv->max_primitives - 1;
> > +     u32 threads = bv->max_threads - 1;
> >       u32 *cs = batch_alloc_items(batch, 32, 8);
> >  
> >       *cs++ = MEDIA_VFE_STATE | (8 - 2);
> > @@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
> >       *cs++ = 0;
> >  
> >       /* number of threads & urb entries for GPGPU vs Media Mode */
> > -     *cs++ = threads << 16 | urb_entries << 8 | mode << 2;
> > +     *cs++ = threads << 16 | 1 << 8 | mode << 2;
> >  
> >       *cs++ = 0;
> >  
> > @@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
> >  {
> >       unsigned int x_offset = (media_object_index % 16) * 64;
> >       unsigned int y_offset = (media_object_index / 16) * 16;
> > -     unsigned int inline_data_size;
> > -     unsigned int media_batch_size;
> > -     unsigned int i;
> > +     unsigned int pkt = 6 + 3;
> >       u32 *cs;
> >  
> > -     inline_data_size = 112 * 8;
> > -     media_batch_size = inline_data_size + 6;
> > +     cs = batch_alloc_items(batch, 8, pkt);
> >  
> > -     cs = batch_alloc_items(batch, 8, media_batch_size);
> > -
> > -     *cs++ = MEDIA_OBJECT | (media_batch_size - 2);
> > +     *cs++ = MEDIA_OBJECT | (pkt - 2);
> >  
> >       /* interface descriptor offset */
> >       *cs++ = 0;
> > @@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
> >       *cs++ = 0;
> >  
> >       /* inline */
> > -     *cs++ = (y_offset << 16) | (x_offset);
> > +     *cs++ = y_offset << 16 | x_offset;
> >       *cs++ = 0;
> >       *cs++ = GT3_INLINE_DATA_DELAYS;
> > -     for (i = 3; i < inline_data_size; i++)
> > -             *cs++ = 0;
> 
> why?

We don't use the extra urb data, and worse the extra inline data slows
down the CP to be slower than the thread dispatch. That was causing the 
issue that the same HW thread was servicing multiple MEDIA_OBJECTS, and
we did not then clear all the thread-local registers across the EU (as
some threads never executed our shader). And that was the cause of the
validation failures in v1.

[The first clue was that if we submitted a few more objects than
threads with v1, it took twice as long, and passed the validation test.
Now, touch wood, it appears that we are able to saturate the HW threads
with an equal number of objects, so every HW thread does exactly one
iteration of the shader.]

> >       batch_advance(batch, cs);
> >  }
> >  
> >  static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
> >  {
> > -     u32 *cs = batch_alloc_items(batch, 0, 5);
> > +     u32 *cs = batch_alloc_items(batch, 0, 4);
> >  
> > -     *cs++ = GFX_OP_PIPE_CONTROL(5);
> > -     *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
> > -             PIPE_CONTROL_GLOBAL_GTT_IVB;
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> > +             PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > +             PIPE_CONTROL_DC_FLUSH_ENABLE |
> > +             PIPE_CONTROL_CS_STALL;
> >       *cs++ = 0;
> >       *cs++ = 0;
> > +
> > +     batch_advance(batch, cs);
> > +}
> > +
> > +static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
> > +{
> > +     u32 *cs = batch_alloc_items(batch, 0, 8);
> > +
> > +     /* ivb: Stall before STATE_CACHE_INVALIDATE */
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
> > +             PIPE_CONTROL_CS_STALL;
> >       *cs++ = 0;
> > +     *cs++ = 0;
> > +
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
> > +     *cs++ = 0;
> > +     *cs++ = 0;
> > +
> >       batch_advance(batch, cs);
> >  }
> >  
> > @@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
> >                      const struct batch_vals *bv)
> >  {
> >       struct drm_i915_private *i915 = vma->vm->i915;
> > -     unsigned int desc_count = 64;
> > -     const u32 urb_size = 112;
> > +     const unsigned int desc_count = 1;
> > +     const unsigned int urb_size = 1;
> >       struct batch_chunk cmds, state;
> > -     u32 interface_descriptor;
> > +     u32 descriptors;
> >       unsigned int i;
> >  
> > -     batch_init(&cmds, vma, start, 0, bv->cmd_size);
> > -     batch_init(&state, vma, start, bv->state_start, bv->state_size);
> > +     batch_init(&cmds, vma, start, 0, bv->state_start);
> > +     batch_init(&state, vma, start, bv->state_start, SZ_4K);
> >  
> > -     interface_descriptor =
> > -             gen7_fill_interface_descriptor(&state, bv,
> > -                                            IS_HASWELL(i915) ?
> > -                                            &cb_kernel_hsw :
> > -                                            &cb_kernel_ivb,
> > -                                            desc_count);
> > -     gen7_emit_pipeline_flush(&cmds);
> > +     descriptors = gen7_fill_interface_descriptor(&state, bv,
> > +                                                  IS_HASWELL(i915) ?
> > +                                                  &cb_kernel_hsw :
> > +                                                  &cb_kernel_ivb,
> > +                                                  desc_count);
> > +
> > +     gen7_emit_pipeline_invalidate(&cmds);
> >       batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
> >       batch_add(&cmds, MI_NOOP);
> > -     gen7_emit_state_base_address(&cmds, interface_descriptor);
> > +     gen7_emit_pipeline_invalidate(&cmds);
> > +
> >       gen7_emit_pipeline_flush(&cmds);
> > +     gen7_emit_state_base_address(&cmds, descriptors);
> > +     gen7_emit_pipeline_invalidate(&cmds);
> 
> why do we need double invalidate?

Empirical results. We need the flush before STATE_BASE otherwise there
were lost writes; mesa has had a similar experience with needing a
magical flush before. The invalidate afterwards is similarly required by
the HW.

The invalidate before the PIPELINE_SELECT is mandatory in bspec for MEDIA,
and vouched for by our CI results. The one after the PIPELINE_SELECT does
not appear in the docs, yet preferred by CI.

It's this combination of flush/invalidate that finally worked on all
three gen7 platforms, but there's almost definitely a more optimal set of
pipecontrols.
-Chris

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT
@ 2021-01-11 20:51     ` Chris Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-11 20:51 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: intel-gfx, Randy Wright, stable

Quoting Rodrigo Vivi (2021-01-11 17:35:12)
> On Sun, Jan 10, 2021 at 03:03:54PM +0000, Chris Wilson wrote:
> > MEDIA_STATE_VFE only accepts the 'maximum number of threads' in the
> > range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
> > based on plaform and the number of EU based on the number of slices and
> > subslices. This is a fixed number per platform/gt, so appropriately
> > limit the number of threads we spawn to match the device.
> > 
> > v2: Oversaturate the system with tasks to force execution on every HW
> > thread; if the thread idles it is returned to the pool and may be reused
> > again before an unused thread.
> > 
> > v3: Fix more state commands, which was causing Baytrail to barf.
> 
> CI is still not happy with byt right? or is that false positive?

After v3, ivb still failed.
 
> > v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge

Right now with the multiple pipecontrols around the PIPELINE_SELECT *and*
STATE_BASE, CI has been happy for multiple runs. I was able to reproduce
the same selftest failures and confirm that we do not see any of those
failures in a thousand iterations. High level of confidence, but since
we are dealing with empirical results with cross-referencing to mesa who
also have seen similar undocumented failures, there's still an element
of doubt as to whether it is truly watertight.

The CI results for this series passed on the all important ivb,byt,hsw.

> > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
> > Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> > Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
> > Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > Cc: Randy Wright <rwright@hpe.com>
> > Cc: stable@vger.kernel.org # v5.7+
> > ---
> >  drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++---------
> >  1 file changed, 94 insertions(+), 63 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > index d93d85cd3027..f32a8e8040b2 100644
> > --- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > @@ -7,8 +7,6 @@
> >  #include "i915_drv.h"
> >  #include "intel_gpu_commands.h"
> >  
> > -#define MAX_URB_ENTRIES 64
> > -#define STATE_SIZE (4 * 1024)
> >  #define GT3_INLINE_DATA_DELAYS 0x1E00
> >  #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
> >  
> > @@ -34,38 +32,59 @@ struct batch_chunk {
> >  };
> >  
> >  struct batch_vals {
> > -     u32 max_primitives;
> > -     u32 max_urb_entries;
> > -     u32 cmd_size;
> > -     u32 state_size;
> > +     u32 max_threads;
> >       u32 state_start;
> > -     u32 batch_size;
> > +     u32 surface_start;
> >       u32 surface_height;
> >       u32 surface_width;
> > -     u32 scratch_size;
> > -     u32 max_size;
> > +     u32 size;
> >  };
> >  
> > +static inline int num_primitives(const struct batch_vals *bv)
> > +{
> > +     /*
> > +      * We need to saturate the GPU with work in order to dispatch
> > +      * a shader on every HW thread, and clear the thread-local registers.
> > +      * In short, we have to dispatch work faster than the shaders can
> > +      * run in order to fill occupy each HW thread.
> > +      */
> > +     return bv->max_threads;
> > +}
> > +
> >  static void
> >  batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
> >  {
> >       if (IS_HASWELL(i915)) {
> > -             bv->max_primitives = 280;
> > -             bv->max_urb_entries = MAX_URB_ENTRIES;
> > +             switch (INTEL_INFO(i915)->gt) {
> > +             default:
> > +             case 1:
> > +                     bv->max_threads = 70;
> > +                     break;
> > +             case 2:
> > +                     bv->max_threads = 140;
> > +                     break;
> > +             case 3:
> > +                     bv->max_threads = 280;
> > +                     break;
> > +             }
> >               bv->surface_height = 16 * 16;
> >               bv->surface_width = 32 * 2 * 16;
> >       } else {
> > -             bv->max_primitives = 128;
> > -             bv->max_urb_entries = MAX_URB_ENTRIES / 2;
> > +             switch (INTEL_INFO(i915)->gt) {
> > +             default:
> > +             case 1: /* including vlv */
> > +                     bv->max_threads = 36;
> > +                     break;
> > +             case 2:
> > +                     bv->max_threads = 128;
> > +                     break;
> > +             }
> >               bv->surface_height = 16 * 8;
> >               bv->surface_width = 32 * 16;
> 
> all the values above matches the spec.
> 
> >       }
> > -     bv->cmd_size = bv->max_primitives * 4096;
> > -     bv->state_size = STATE_SIZE;
> > -     bv->state_start = bv->cmd_size;
> > -     bv->batch_size = bv->cmd_size + bv->state_size;
> > -     bv->scratch_size = bv->surface_height * bv->surface_width;
> > -     bv->max_size = bv->batch_size + bv->scratch_size;
> > +     bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
> > +     bv->surface_start = bv->state_start + SZ_4K;
> > +     bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
> 
> I liked this batch values simplification...
> 
> >  }
> >  
> >  static void batch_init(struct batch_chunk *bc,
> > @@ -155,7 +174,8 @@ static u32
> >  gen7_fill_binding_table(struct batch_chunk *state,
> >                       const struct batch_vals *bv)
> >  {
> > -     u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv);
> > +     u32 surface_start =
> > +             gen7_fill_surface_state(state, bv->surface_start, bv);
> >       u32 *cs = batch_alloc_items(state, 32, 8);
> >       u32 offset = batch_offset(state, cs);
> >  
> > @@ -214,9 +234,9 @@ static void
> >  gen7_emit_state_base_address(struct batch_chunk *batch,
> >                            u32 surface_state_base)
> >  {
> > -     u32 *cs = batch_alloc_items(batch, 0, 12);
> > +     u32 *cs = batch_alloc_items(batch, 0, 10);
> >  
> > -     *cs++ = STATE_BASE_ADDRESS | (12 - 2);
> > +     *cs++ = STATE_BASE_ADDRESS | (10 - 2);
> >       /* general */
> >       *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
> >       /* surface */
> > @@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
> >       *cs++ = BASE_ADDRESS_MODIFY;
> >       *cs++ = 0;
> >       *cs++ = BASE_ADDRESS_MODIFY;
> > -     *cs++ = 0;
> > -     *cs++ = 0;
> 
> why don't we need this anymore?

It was incorrect, gen7 is just (10-2). The last two were extraneous
padding.

> >       batch_advance(batch, cs);
> >  }
> >  
> > @@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
> >                   u32 urb_size, u32 curbe_size,
> >                   u32 mode)
> >  {
> > -     u32 urb_entries = bv->max_urb_entries;
> > -     u32 threads = bv->max_primitives - 1;
> > +     u32 threads = bv->max_threads - 1;
> >       u32 *cs = batch_alloc_items(batch, 32, 8);
> >  
> >       *cs++ = MEDIA_VFE_STATE | (8 - 2);
> > @@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
> >       *cs++ = 0;
> >  
> >       /* number of threads & urb entries for GPGPU vs Media Mode */
> > -     *cs++ = threads << 16 | urb_entries << 8 | mode << 2;
> > +     *cs++ = threads << 16 | 1 << 8 | mode << 2;
> >  
> >       *cs++ = 0;
> >  
> > @@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
> >  {
> >       unsigned int x_offset = (media_object_index % 16) * 64;
> >       unsigned int y_offset = (media_object_index / 16) * 16;
> > -     unsigned int inline_data_size;
> > -     unsigned int media_batch_size;
> > -     unsigned int i;
> > +     unsigned int pkt = 6 + 3;
> >       u32 *cs;
> >  
> > -     inline_data_size = 112 * 8;
> > -     media_batch_size = inline_data_size + 6;
> > +     cs = batch_alloc_items(batch, 8, pkt);
> >  
> > -     cs = batch_alloc_items(batch, 8, media_batch_size);
> > -
> > -     *cs++ = MEDIA_OBJECT | (media_batch_size - 2);
> > +     *cs++ = MEDIA_OBJECT | (pkt - 2);
> >  
> >       /* interface descriptor offset */
> >       *cs++ = 0;
> > @@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
> >       *cs++ = 0;
> >  
> >       /* inline */
> > -     *cs++ = (y_offset << 16) | (x_offset);
> > +     *cs++ = y_offset << 16 | x_offset;
> >       *cs++ = 0;
> >       *cs++ = GT3_INLINE_DATA_DELAYS;
> > -     for (i = 3; i < inline_data_size; i++)
> > -             *cs++ = 0;
> 
> why?

We don't use the extra urb data, and worse the extra inline data slows
down the CP to be slower than the thread dispatch. That was causing the 
issue that the same HW thread was servicing multiple MEDIA_OBJECTS, and
we did not then clear all the thread-local registers across the EU (as
some threads never executed our shader). And that was the cause of the
validation failures in v1.

[The first clue was that if we submitted a few more objects than
threads with v1, it took twice as long, and passed the validation test.
Now, touch wood, it appears that we are able to saturate the HW threads
with an equal number of objects, so every HW thread does exactly one
iteration of the shader.]

> >       batch_advance(batch, cs);
> >  }
> >  
> >  static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
> >  {
> > -     u32 *cs = batch_alloc_items(batch, 0, 5);
> > +     u32 *cs = batch_alloc_items(batch, 0, 4);
> >  
> > -     *cs++ = GFX_OP_PIPE_CONTROL(5);
> > -     *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
> > -             PIPE_CONTROL_GLOBAL_GTT_IVB;
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> > +             PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > +             PIPE_CONTROL_DC_FLUSH_ENABLE |
> > +             PIPE_CONTROL_CS_STALL;
> >       *cs++ = 0;
> >       *cs++ = 0;
> > +
> > +     batch_advance(batch, cs);
> > +}
> > +
> > +static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
> > +{
> > +     u32 *cs = batch_alloc_items(batch, 0, 8);
> > +
> > +     /* ivb: Stall before STATE_CACHE_INVALIDATE */
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
> > +             PIPE_CONTROL_CS_STALL;
> >       *cs++ = 0;
> > +     *cs++ = 0;
> > +
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
> > +     *cs++ = 0;
> > +     *cs++ = 0;
> > +
> >       batch_advance(batch, cs);
> >  }
> >  
> > @@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
> >                      const struct batch_vals *bv)
> >  {
> >       struct drm_i915_private *i915 = vma->vm->i915;
> > -     unsigned int desc_count = 64;
> > -     const u32 urb_size = 112;
> > +     const unsigned int desc_count = 1;
> > +     const unsigned int urb_size = 1;
> >       struct batch_chunk cmds, state;
> > -     u32 interface_descriptor;
> > +     u32 descriptors;
> >       unsigned int i;
> >  
> > -     batch_init(&cmds, vma, start, 0, bv->cmd_size);
> > -     batch_init(&state, vma, start, bv->state_start, bv->state_size);
> > +     batch_init(&cmds, vma, start, 0, bv->state_start);
> > +     batch_init(&state, vma, start, bv->state_start, SZ_4K);
> >  
> > -     interface_descriptor =
> > -             gen7_fill_interface_descriptor(&state, bv,
> > -                                            IS_HASWELL(i915) ?
> > -                                            &cb_kernel_hsw :
> > -                                            &cb_kernel_ivb,
> > -                                            desc_count);
> > -     gen7_emit_pipeline_flush(&cmds);
> > +     descriptors = gen7_fill_interface_descriptor(&state, bv,
> > +                                                  IS_HASWELL(i915) ?
> > +                                                  &cb_kernel_hsw :
> > +                                                  &cb_kernel_ivb,
> > +                                                  desc_count);
> > +
> > +     gen7_emit_pipeline_invalidate(&cmds);
> >       batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
> >       batch_add(&cmds, MI_NOOP);
> > -     gen7_emit_state_base_address(&cmds, interface_descriptor);
> > +     gen7_emit_pipeline_invalidate(&cmds);
> > +
> >       gen7_emit_pipeline_flush(&cmds);
> > +     gen7_emit_state_base_address(&cmds, descriptors);
> > +     gen7_emit_pipeline_invalidate(&cmds);
> 
> why do we need double invalidate?

Empirical results. We need the flush before STATE_BASE otherwise there
were lost writes; mesa has had a similar experience with needing a
magical flush before. The invalidate afterwards is similarly required by
the HW.

The invalidate before the PIPELINE_SELECT is mandatory in bspec for MEDIA,
and vouched for by our CI results. The one after the PIPELINE_SELECT does
not appear in the docs, yet preferred by CI.

It's this combination of flush/invalidate that finally worked on all
three gen7 platforms, but there's almost definitely a more optimal set of
pipecontrols.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
  2021-01-10 15:03   ` [Intel-gfx] " Chris Wilson
@ 2021-01-11 20:58     ` Abodunrin, Akeem G
  -1 siblings, 0 replies; 30+ messages in thread
From: Abodunrin, Akeem G @ 2021-01-11 20:58 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: stable



> -----Original Message-----
> From: Intel-gfx <intel-gfx-bounces@lists.freedesktop.org> On Behalf Of Chris
> Wilson
> Sent: Sunday, January 10, 2021 7:04 AM
> To: intel-gfx@lists.freedesktop.org
> Cc: stable@vger.kernel.org; Chris Wilson <chris@chris-wilson.co.uk>
> Subject: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override
> security mitigations
> 
> The clear-residuals mitigation is a relatively heavy hammer and under some
> circumstances the user may wish to forgo the context isolation in order to
> meet some performance requirement. Introduce a generic module parameter
> to allow selectively enabling/disabling different mitigations.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858
> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: stable@vger.kernel.org # v5.7
> ---
>  drivers/gpu/drm/i915/Makefile                 |   1 +
>  .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
>  drivers/gpu/drm/i915/i915_mitigations.c       | 148 ++++++++++++++++++
>  drivers/gpu/drm/i915/i915_mitigations.h       |  13 ++
>  4 files changed, 165 insertions(+), 1 deletion(-)  create mode 100644
> drivers/gpu/drm/i915/i915_mitigations.c
>  create mode 100644 drivers/gpu/drm/i915/i915_mitigations.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 4074d8cb0d6e..48f82c354611 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -38,6 +38,7 @@ i915-y += i915_drv.o \
>  	  i915_config.o \
>  	  i915_irq.o \
>  	  i915_getparam.o \
> +	  i915_mitigations.o \
>  	  i915_params.o \
>  	  i915_pci.o \
>  	  i915_scatterlist.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 724d56c9583d..657afd8ebc14 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -32,6 +32,7 @@
>  #include "gen6_ppgtt.h"
>  #include "gen7_renderclear.h"
>  #include "i915_drv.h"
> +#include "i915_mitigations.h"
>  #include "intel_breadcrumbs.h"
>  #include "intel_context.h"
>  #include "intel_gt.h"
> @@ -918,7 +919,8 @@ static int switch_context(struct i915_request *rq)
>  	GEM_BUG_ON(HAS_EXECLISTS(engine->i915));
> 
>  	if (engine->wa_ctx.vma && ce != engine->kernel_context) {
> -		if (engine->wa_ctx.vma->private != ce) {
> +		if (engine->wa_ctx.vma->private != ce &&
> +		    i915_mitigate_clear_residuals()) {
>  			ret = clear_residuals(rq);
>  			if (ret)
>  				return ret;
> diff --git a/drivers/gpu/drm/i915/i915_mitigations.c
> b/drivers/gpu/drm/i915/i915_mitigations.c
> new file mode 100644
> index 000000000000..8d5637cfa734
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_mitigations.c
> @@ -0,0 +1,148 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/moduleparam.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +
> +#include "i915_drv.h"
> +#include "i915_mitigations.h"
> +
> +static unsigned long mitigations = ~0UL;
> +
> +enum {
> +	CLEAR_RESIDUALS = 0,
> +};
> +
> +static const char * const names[] = {
> +	[CLEAR_RESIDUALS] = "residuals",
> +};
> +
> +bool i915_mitigate_clear_residuals(void)
> +{
> +	return READ_ONCE(mitigations) & BIT(CLEAR_RESIDUALS); }
> +
> +static int mitigations_set(const char *val, const struct kernel_param
> +*kp) {
> +	unsigned long new = ~0UL;
> +	char *str, *sep, *tok;
> +	bool first = true;
> +	int err = 0;
> +
> +	BUILD_BUG_ON(ARRAY_SIZE(names) >=
> BITS_PER_TYPE(mitigations));
> +
> +	str = kstrdup(val, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	for (sep = str; (tok = strsep(&sep, ","));) {
> +		bool enable = true;
> +		int i;
> +
> +		/* Be tolerant of leading/trailing whitespace */
> +		tok = strim(tok);
> +
> +		if (first) {
> +			first = false;
> +
> +			if (!strcmp(tok, "auto")) {
> +				new = ~0UL;
> +				continue;
> +			}
> +
> +			new = 0;
> +			if (!strcmp(tok, "off"))
> +				continue;
> +		}
> +
> +		if (*tok == '!') {
> +			enable = !enable;
> +			tok++;
> +		}
> +
> +		if (!strncmp(tok, "no", 2)) {
> +			enable = !enable;
> +			tok += 2;
> +		}
> +
> +		if (*tok == '\0')
> +			continue;
> +
> +		for (i = 0; i < ARRAY_SIZE(names); i++) {
> +			if (!strcmp(tok, names[i])) {
> +				if (enable)
> +					new |= BIT(i);
> +				else
> +					new &= ~BIT(i);
> +				break;
> +			}
> +		}
> +		if (i == ARRAY_SIZE(names)) {
> +			pr_err("Bad %s.mitigations=%s, '%s' is unknown\n",
> +			       DRIVER_NAME, val, tok);
> +			err = -EINVAL;
> +			break;
> +		}
> +	}
> +	kfree(str);
> +	if (err)
> +		return err;
> +
> +	WRITE_ONCE(mitigations, new);
> +	return 0;
> +}
> +
> +static int mitigations_get(char *buffer, const struct kernel_param *kp)
> +{
> +	unsigned long local = READ_ONCE(mitigations);
> +	int count, i;
> +	bool enable;
> +
> +	if (!local)
> +		return scnprintf(buffer, PAGE_SIZE, "%s\n", "off");
> +
> +	if (local & BIT(BITS_PER_LONG - 1)) {
> +		count = scnprintf(buffer, PAGE_SIZE, "%s,", "auto");
> +		enable = false;
> +	} else {
> +		enable = true;
> +		count = 0;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(names); i++) {
> +		if ((local & BIT(i)) != enable)
> +			continue;
> +
> +		count += scnprintf(buffer + count, PAGE_SIZE - count,
> +				   "%s%s,", enable ? "" : "!", names[i]);
> +	}
> +
> +	buffer[count - 1] = '\n';
> +	return count;
> +}
> +
> +static const struct kernel_param_ops ops = {
> +	.set = mitigations_set,
> +	.get = mitigations_get,
> +};
> +
> +module_param_cb_unsafe(mitigations, &ops, NULL, 0600);
> +MODULE_PARM_DESC(mitigations, "Selectively enable security mitigations
> +for all Intel® GPUs in the system.\n"
> +"\n"
> +"  auto -- enables all mitigations required for the platform [default]\n"
> +"  off  -- disables all mitigations\n"
> +"\n"
> +"Individual mitigations can be enabled by passing a comma-separated
> string,\n"
> +"e.g. mitigations=residuals to enable only clearing residuals or\n"
> +"mitigations=auto,noresiduals to disable only the clear residual
> mitigation.\n"
> +"Either '!' or 'no' may be used to switch from enabling the mitigation to\n"
> +"disabling it.\n"
> +"\n"
> +"Active mitigations for Ivybridge, Baytrail, Haswell:\n"
> +"  residuals -- clear all thread-local registers between contexts"
> +);
> diff --git a/drivers/gpu/drm/i915/i915_mitigations.h
> b/drivers/gpu/drm/i915/i915_mitigations.h
> new file mode 100644
> index 000000000000..1359d8135287
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_mitigations.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +#ifndef __I915_MITIGATIONS_H__
> +#define __I915_MITIGATIONS_H__
> +
> +#include <linux/types.h>
> +
> +bool i915_mitigate_clear_residuals(void);
> +
> +#endif /* __I915_MITIGATIONS_H__ */

Although this seems like an ideal solution - giving users the option to choose *potential* performance over security or vice versa - I would have expected this patch to add a DRM warning informing users of the consequences of their action whenever the module parameter is used to disable any kind of mitigation. Well, that is my own perspective, not that of a legal expert.

Thanks,
~Akeem

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
@ 2021-01-11 20:58     ` Abodunrin, Akeem G
  0 siblings, 0 replies; 30+ messages in thread
From: Abodunrin, Akeem G @ 2021-01-11 20:58 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: stable



> -----Original Message-----
> From: Intel-gfx <intel-gfx-bounces@lists.freedesktop.org> On Behalf Of Chris
> Wilson
> Sent: Sunday, January 10, 2021 7:04 AM
> To: intel-gfx@lists.freedesktop.org
> Cc: stable@vger.kernel.org; Chris Wilson <chris@chris-wilson.co.uk>
> Subject: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override
> security mitigations
> 
> The clear-residuals mitigation is a relatively heavy hammer and under some
> circumstances the user may wish to forgo the context isolation in order to
> meet some performance requirement. Introduce a generic module parameter
> to allow selectively enabling/disabling different mitigations.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858
> Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: stable@vger.kernel.org # v5.7
> ---
>  drivers/gpu/drm/i915/Makefile                 |   1 +
>  .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
>  drivers/gpu/drm/i915/i915_mitigations.c       | 148 ++++++++++++++++++
>  drivers/gpu/drm/i915/i915_mitigations.h       |  13 ++
>  4 files changed, 165 insertions(+), 1 deletion(-)  create mode 100644
> drivers/gpu/drm/i915/i915_mitigations.c
>  create mode 100644 drivers/gpu/drm/i915/i915_mitigations.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 4074d8cb0d6e..48f82c354611 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -38,6 +38,7 @@ i915-y += i915_drv.o \
>  	  i915_config.o \
>  	  i915_irq.o \
>  	  i915_getparam.o \
> +	  i915_mitigations.o \
>  	  i915_params.o \
>  	  i915_pci.o \
>  	  i915_scatterlist.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 724d56c9583d..657afd8ebc14 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -32,6 +32,7 @@
>  #include "gen6_ppgtt.h"
>  #include "gen7_renderclear.h"
>  #include "i915_drv.h"
> +#include "i915_mitigations.h"
>  #include "intel_breadcrumbs.h"
>  #include "intel_context.h"
>  #include "intel_gt.h"
> @@ -918,7 +919,8 @@ static int switch_context(struct i915_request *rq)
>  	GEM_BUG_ON(HAS_EXECLISTS(engine->i915));
> 
>  	if (engine->wa_ctx.vma && ce != engine->kernel_context) {
> -		if (engine->wa_ctx.vma->private != ce) {
> +		if (engine->wa_ctx.vma->private != ce &&
> +		    i915_mitigate_clear_residuals()) {
>  			ret = clear_residuals(rq);
>  			if (ret)
>  				return ret;
> diff --git a/drivers/gpu/drm/i915/i915_mitigations.c
> b/drivers/gpu/drm/i915/i915_mitigations.c
> new file mode 100644
> index 000000000000..8d5637cfa734
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_mitigations.c
> @@ -0,0 +1,148 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/moduleparam.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +
> +#include "i915_drv.h"
> +#include "i915_mitigations.h"
> +
> +static unsigned long mitigations = ~0UL;
> +
> +enum {
> +	CLEAR_RESIDUALS = 0,
> +};
> +
> +static const char * const names[] = {
> +	[CLEAR_RESIDUALS] = "residuals",
> +};
> +
> +bool i915_mitigate_clear_residuals(void)
> +{
> +	return READ_ONCE(mitigations) & BIT(CLEAR_RESIDUALS); }
> +
> +static int mitigations_set(const char *val, const struct kernel_param
> +*kp) {
> +	unsigned long new = ~0UL;
> +	char *str, *sep, *tok;
> +	bool first = true;
> +	int err = 0;
> +
> +	BUILD_BUG_ON(ARRAY_SIZE(names) >=
> BITS_PER_TYPE(mitigations));
> +
> +	str = kstrdup(val, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	for (sep = str; (tok = strsep(&sep, ","));) {
> +		bool enable = true;
> +		int i;
> +
> +		/* Be tolerant of leading/trailing whitespace */
> +		tok = strim(tok);
> +
> +		if (first) {
> +			first = false;
> +
> +			if (!strcmp(tok, "auto")) {
> +				new = ~0UL;
> +				continue;
> +			}
> +
> +			new = 0;
> +			if (!strcmp(tok, "off"))
> +				continue;
> +		}
> +
> +		if (*tok == '!') {
> +			enable = !enable;
> +			tok++;
> +		}
> +
> +		if (!strncmp(tok, "no", 2)) {
> +			enable = !enable;
> +			tok += 2;
> +		}
> +
> +		if (*tok == '\0')
> +			continue;
> +
> +		for (i = 0; i < ARRAY_SIZE(names); i++) {
> +			if (!strcmp(tok, names[i])) {
> +				if (enable)
> +					new |= BIT(i);
> +				else
> +					new &= ~BIT(i);
> +				break;
> +			}
> +		}
> +		if (i == ARRAY_SIZE(names)) {
> +			pr_err("Bad %s.mitigations=%s, '%s' is unknown\n",
> +			       DRIVER_NAME, val, tok);
> +			err = -EINVAL;
> +			break;
> +		}
> +	}
> +	kfree(str);
> +	if (err)
> +		return err;
> +
> +	WRITE_ONCE(mitigations, new);
> +	return 0;
> +}
> +
> +static int mitigations_get(char *buffer, const struct kernel_param *kp)
> +{
> +	unsigned long local = READ_ONCE(mitigations);
> +	int count, i;
> +	bool enable;
> +
> +	if (!local)
> +		return scnprintf(buffer, PAGE_SIZE, "%s\n", "off");
> +
> +	if (local & BIT(BITS_PER_LONG - 1)) {
> +		count = scnprintf(buffer, PAGE_SIZE, "%s,", "auto");
> +		enable = false;
> +	} else {
> +		enable = true;
> +		count = 0;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(names); i++) {
> +		if ((local & BIT(i)) != enable)
> +			continue;
> +
> +		count += scnprintf(buffer + count, PAGE_SIZE - count,
> +				   "%s%s,", enable ? "" : "!", names[i]);
> +	}
> +
> +	buffer[count - 1] = '\n';
> +	return count;
> +}
> +
> +static const struct kernel_param_ops ops = {
> +	.set = mitigations_set,
> +	.get = mitigations_get,
> +};
> +
> +module_param_cb_unsafe(mitigations, &ops, NULL, 0600);
> +MODULE_PARM_DESC(mitigations, "Selectively enable security mitigations
> +for all Intel® GPUs in the system.\n"
> +"\n"
> +"  auto -- enables all mitigations required for the platform [default]\n"
> +"  off  -- disables all mitigations\n"
> +"\n"
> +"Individual mitigations can be enabled by passing a comma-separated
> string,\n"
> +"e.g. mitigations=residuals to enable only clearing residuals or\n"
> +"mitigations=auto,noresiduals to disable only the clear residual
> mitigation.\n"
> +"Either '!' or 'no' may be used to switch from enabling the mitigation to\n"
> +"disabling it.\n"
> +"\n"
> +"Active mitigations for Ivybridge, Baytrail, Haswell:\n"
> +"  residuals -- clear all thread-local registers between contexts"
> +);
> diff --git a/drivers/gpu/drm/i915/i915_mitigations.h
> b/drivers/gpu/drm/i915/i915_mitigations.h
> new file mode 100644
> index 000000000000..1359d8135287
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_mitigations.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +#ifndef __I915_MITIGATIONS_H__
> +#define __I915_MITIGATIONS_H__
> +
> +#include <linux/types.h>
> +
> +bool i915_mitigate_clear_residuals(void);
> +
> +#endif /* __I915_MITIGATIONS_H__ */

Although this seems like an ideal solution - giving users the option to choose *potential* performance over security or vice-versa - I would have expected this patch to add a DRM warning to inform users of the consequences of their action whenever the module parameter is used to disable any kind of mitigation. Well, that is my own perspective, not as a legal expert.

Thanks,
~Akeem
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT
  2021-01-11 20:51     ` Chris Wilson
@ 2021-01-11 21:04       ` Rodrigo Vivi
  -1 siblings, 0 replies; 30+ messages in thread
From: Rodrigo Vivi @ 2021-01-11 21:04 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, stable, Randy Wright

On Mon, Jan 11, 2021 at 08:51:23PM +0000, Chris Wilson wrote:
> Quoting Rodrigo Vivi (2021-01-11 17:35:12)
> > On Sun, Jan 10, 2021 at 03:03:54PM +0000, Chris Wilson wrote:
> > > MEDIA_STATE_VFE only accepts the 'maximum number of threads' in the
> > > range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
> > > based on plaform and the number of EU based on the number of slices and
> > > subslices. This is a fixed number per platform/gt, so appropriately
> > > limit the number of threads we spawn to match the device.
> > > 
> > > v2: Oversaturate the system with tasks to force execution on every HW
> > > thread; if the thread idles it is returned to the pool and may be reused
> > > again before an unused thread.
> > > 
> > > v3: Fix more state commands, which was causing Baytrail to barf.
> > 
> > CI is still not happy with byt right? or is that false positive?
> 
> After v3, ivb still failed.
>  
> > > v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge
> 
> Right now with the multiple pipecontrols around the PIPELINE_SELECT *and*
> STATE_BASE, CI has been happy for multiple runs. I was able to reproduce
> the same selftests failures and confirm that we do not see any of those
> failures in a thousand iterations. High level of confidence, but since
> we are dealing with empirical results with cross-referencing to mesa who
> also have seen similar undocumented failures, there's still an element
> of doubt as to whether it is truly watertight.
> 
> The CI results for this series passed on the all important ivb,byt,hsw.

great!

> 
> > > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
> > > Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > > Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> > > Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
> > > Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > Cc: Randy Wright <rwright@hpe.com>
> > > Cc: stable@vger.kernel.org # v5.7+
> > > ---
> > >  drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++---------
> > >  1 file changed, 94 insertions(+), 63 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > > index d93d85cd3027..f32a8e8040b2 100644
> > > --- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > > +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > > @@ -7,8 +7,6 @@
> > >  #include "i915_drv.h"
> > >  #include "intel_gpu_commands.h"
> > >  
> > > -#define MAX_URB_ENTRIES 64
> > > -#define STATE_SIZE (4 * 1024)
> > >  #define GT3_INLINE_DATA_DELAYS 0x1E00
> > >  #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
> > >  
> > > @@ -34,38 +32,59 @@ struct batch_chunk {
> > >  };
> > >  
> > >  struct batch_vals {
> > > -     u32 max_primitives;
> > > -     u32 max_urb_entries;
> > > -     u32 cmd_size;
> > > -     u32 state_size;
> > > +     u32 max_threads;
> > >       u32 state_start;
> > > -     u32 batch_size;
> > > +     u32 surface_start;
> > >       u32 surface_height;
> > >       u32 surface_width;
> > > -     u32 scratch_size;
> > > -     u32 max_size;
> > > +     u32 size;
> > >  };
> > >  
> > > +static inline int num_primitives(const struct batch_vals *bv)
> > > +{
> > > +     /*
> > > +      * We need to saturate the GPU with work in order to dispatch
> > > +      * a shader on every HW thread, and clear the thread-local registers.
> > > +      * In short, we have to dispatch work faster than the shaders can
> > > +      * run in order to fill occupy each HW thread.
> > > +      */
> > > +     return bv->max_threads;
> > > +}
> > > +
> > >  static void
> > >  batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
> > >  {
> > >       if (IS_HASWELL(i915)) {
> > > -             bv->max_primitives = 280;
> > > -             bv->max_urb_entries = MAX_URB_ENTRIES;
> > > +             switch (INTEL_INFO(i915)->gt) {
> > > +             default:
> > > +             case 1:
> > > +                     bv->max_threads = 70;
> > > +                     break;
> > > +             case 2:
> > > +                     bv->max_threads = 140;
> > > +                     break;
> > > +             case 3:
> > > +                     bv->max_threads = 280;
> > > +                     break;
> > > +             }
> > >               bv->surface_height = 16 * 16;
> > >               bv->surface_width = 32 * 2 * 16;
> > >       } else {
> > > -             bv->max_primitives = 128;
> > > -             bv->max_urb_entries = MAX_URB_ENTRIES / 2;
> > > +             switch (INTEL_INFO(i915)->gt) {
> > > +             default:
> > > +             case 1: /* including vlv */
> > > +                     bv->max_threads = 36;
> > > +                     break;
> > > +             case 2:
> > > +                     bv->max_threads = 128;
> > > +                     break;
> > > +             }
> > >               bv->surface_height = 16 * 8;
> > >               bv->surface_width = 32 * 16;
> > 
> > all the values above match the spec.
> > 
> > >       }
> > > -     bv->cmd_size = bv->max_primitives * 4096;
> > > -     bv->state_size = STATE_SIZE;
> > > -     bv->state_start = bv->cmd_size;
> > > -     bv->batch_size = bv->cmd_size + bv->state_size;
> > > -     bv->scratch_size = bv->surface_height * bv->surface_width;
> > > -     bv->max_size = bv->batch_size + bv->scratch_size;
> > > +     bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
> > > +     bv->surface_start = bv->state_start + SZ_4K;
> > > +     bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
> > 
> > I liked this batch values simplification...
> > 
> > >  }
> > >  
> > >  static void batch_init(struct batch_chunk *bc,
> > > @@ -155,7 +174,8 @@ static u32
> > >  gen7_fill_binding_table(struct batch_chunk *state,
> > >                       const struct batch_vals *bv)
> > >  {
> > > -     u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv);
> > > +     u32 surface_start =
> > > +             gen7_fill_surface_state(state, bv->surface_start, bv);
> > >       u32 *cs = batch_alloc_items(state, 32, 8);
> > >       u32 offset = batch_offset(state, cs);
> > >  
> > > @@ -214,9 +234,9 @@ static void
> > >  gen7_emit_state_base_address(struct batch_chunk *batch,
> > >                            u32 surface_state_base)
> > >  {
> > > -     u32 *cs = batch_alloc_items(batch, 0, 12);
> > > +     u32 *cs = batch_alloc_items(batch, 0, 10);
> > >  
> > > -     *cs++ = STATE_BASE_ADDRESS | (12 - 2);
> > > +     *cs++ = STATE_BASE_ADDRESS | (10 - 2);
> > >       /* general */
> > >       *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
> > >       /* surface */
> > > @@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
> > >       *cs++ = BASE_ADDRESS_MODIFY;
> > >       *cs++ = 0;
> > >       *cs++ = BASE_ADDRESS_MODIFY;
> > > -     *cs++ = 0;
> > > -     *cs++ = 0;
> > 
> > why don't we need this anymore?
> 
> It was incorrect, gen7 is just (10-2). The last two were extraneous
> padding.
> 
> > >       batch_advance(batch, cs);
> > >  }
> > >  
> > > @@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
> > >                   u32 urb_size, u32 curbe_size,
> > >                   u32 mode)
> > >  {
> > > -     u32 urb_entries = bv->max_urb_entries;
> > > -     u32 threads = bv->max_primitives - 1;
> > > +     u32 threads = bv->max_threads - 1;
> > >       u32 *cs = batch_alloc_items(batch, 32, 8);
> > >  
> > >       *cs++ = MEDIA_VFE_STATE | (8 - 2);
> > > @@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
> > >       *cs++ = 0;
> > >  
> > >       /* number of threads & urb entries for GPGPU vs Media Mode */
> > > -     *cs++ = threads << 16 | urb_entries << 8 | mode << 2;
> > > +     *cs++ = threads << 16 | 1 << 8 | mode << 2;
> > >  
> > >       *cs++ = 0;
> > >  
> > > @@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
> > >  {
> > >       unsigned int x_offset = (media_object_index % 16) * 64;
> > >       unsigned int y_offset = (media_object_index / 16) * 16;
> > > -     unsigned int inline_data_size;
> > > -     unsigned int media_batch_size;
> > > -     unsigned int i;
> > > +     unsigned int pkt = 6 + 3;
> > >       u32 *cs;
> > >  
> > > -     inline_data_size = 112 * 8;
> > > -     media_batch_size = inline_data_size + 6;
> > > +     cs = batch_alloc_items(batch, 8, pkt);
> > >  
> > > -     cs = batch_alloc_items(batch, 8, media_batch_size);
> > > -
> > > -     *cs++ = MEDIA_OBJECT | (media_batch_size - 2);
> > > +     *cs++ = MEDIA_OBJECT | (pkt - 2);
> > >  
> > >       /* interface descriptor offset */
> > >       *cs++ = 0;
> > > @@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
> > >       *cs++ = 0;
> > >  
> > >       /* inline */
> > > -     *cs++ = (y_offset << 16) | (x_offset);
> > > +     *cs++ = y_offset << 16 | x_offset;
> > >       *cs++ = 0;
> > >       *cs++ = GT3_INLINE_DATA_DELAYS;
> > > -     for (i = 3; i < inline_data_size; i++)
> > > -             *cs++ = 0;
> > 
> > why?
> 
> We don't use the extra urb data, and worse the extra inline data slows
> down the CP to be slower than the thread dispatch. That was causing the 
> issue that the same HW thread was servicing multiple MEDIA_OBJECTS, and
> we did not then clear all the thread-local registers across the EU (as
> some threads never executed our shader). And that was the cause of the
> validation failures in v1.
> 
> [The first clue was that if we submitted a few more objects than
> threads with v1, it takes twice as long, and passes the validation test.
> Now, touch wood, it appears that we are able to saturate the HW threads
> with an equal number of objects, so every HW thread does exactly one
> iteration of the shader.]
> 
> > >       batch_advance(batch, cs);
> > >  }
> > >  
> > >  static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
> > >  {
> > > -     u32 *cs = batch_alloc_items(batch, 0, 5);
> > > +     u32 *cs = batch_alloc_items(batch, 0, 4);
> > >  
> > > -     *cs++ = GFX_OP_PIPE_CONTROL(5);
> > > -     *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
> > > -             PIPE_CONTROL_GLOBAL_GTT_IVB;
> > > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > > +     *cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> > > +             PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > > +             PIPE_CONTROL_DC_FLUSH_ENABLE |
> > > +             PIPE_CONTROL_CS_STALL;
> > >       *cs++ = 0;
> > >       *cs++ = 0;
> > > +
> > > +     batch_advance(batch, cs);
> > > +}
> > > +
> > > +static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
> > > +{
> > > +     u32 *cs = batch_alloc_items(batch, 0, 8);
> > > +
> > > +     /* ivb: Stall before STATE_CACHE_INVALIDATE */
> > > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > > +     *cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
> > > +             PIPE_CONTROL_CS_STALL;
> > >       *cs++ = 0;
> > > +     *cs++ = 0;
> > > +
> > > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > > +     *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
> > > +     *cs++ = 0;
> > > +     *cs++ = 0;
> > > +
> > >       batch_advance(batch, cs);
> > >  }
> > >  
> > > @@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
> > >                      const struct batch_vals *bv)
> > >  {
> > >       struct drm_i915_private *i915 = vma->vm->i915;
> > > -     unsigned int desc_count = 64;
> > > -     const u32 urb_size = 112;
> > > +     const unsigned int desc_count = 1;
> > > +     const unsigned int urb_size = 1;
> > >       struct batch_chunk cmds, state;
> > > -     u32 interface_descriptor;
> > > +     u32 descriptors;
> > >       unsigned int i;
> > >  
> > > -     batch_init(&cmds, vma, start, 0, bv->cmd_size);
> > > -     batch_init(&state, vma, start, bv->state_start, bv->state_size);
> > > +     batch_init(&cmds, vma, start, 0, bv->state_start);
> > > +     batch_init(&state, vma, start, bv->state_start, SZ_4K);
> > >  
> > > -     interface_descriptor =
> > > -             gen7_fill_interface_descriptor(&state, bv,
> > > -                                            IS_HASWELL(i915) ?
> > > -                                            &cb_kernel_hsw :
> > > -                                            &cb_kernel_ivb,
> > > -                                            desc_count);
> > > -     gen7_emit_pipeline_flush(&cmds);
> > > +     descriptors = gen7_fill_interface_descriptor(&state, bv,
> > > +                                                  IS_HASWELL(i915) ?
> > > +                                                  &cb_kernel_hsw :
> > > +                                                  &cb_kernel_ivb,
> > > +                                                  desc_count);
> > > +
> > > +     gen7_emit_pipeline_invalidate(&cmds);
> > >       batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
> > >       batch_add(&cmds, MI_NOOP);
> > > -     gen7_emit_state_base_address(&cmds, interface_descriptor);
> > > +     gen7_emit_pipeline_invalidate(&cmds);
> > > +
> > >       gen7_emit_pipeline_flush(&cmds);
> > > +     gen7_emit_state_base_address(&cmds, descriptors);
> > > +     gen7_emit_pipeline_invalidate(&cmds);
> > 
> > why do we need double invalidate?
> 
> Empirical results. We need the flush before STATE_BASE otherwise there
> were lost writes; mesa has had a similar experience with needing a
> magical flush before. The invalidate afterwards is similarly required by
> the HW.
> 
> The invalidate before the PIPELINE_SELECT is mandatory in bspec for MEDIA,
> and vouched for by our CI results. The one after the PIPELINE_SELECT does
> not appear in the docs, yet preferred by CI.
> 
> It's this combination of flush/invalidate that finally worked on all
> three gen7 platforms, but there's almost definitely a more optimal set of
> pipecontrols.

Okay. Let's move forward with this then. It is better than reverting the whole
mitigation fix, and we get something that unblocks users.


Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

> -Chris

Thanks for all the clarifications,
Rodrigo.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT
@ 2021-01-11 21:04       ` Rodrigo Vivi
  0 siblings, 0 replies; 30+ messages in thread
From: Rodrigo Vivi @ 2021-01-11 21:04 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, Randy Wright, stable

On Mon, Jan 11, 2021 at 08:51:23PM +0000, Chris Wilson wrote:
> Quoting Rodrigo Vivi (2021-01-11 17:35:12)
> > On Sun, Jan 10, 2021 at 03:03:54PM +0000, Chris Wilson wrote:
> > > MEDIA_STATE_VFE only accepts the 'maximum number of threads' in the
> > > range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
> > > based on plaform and the number of EU based on the number of slices and
> > > subslices. This is a fixed number per platform/gt, so appropriately
> > > limit the number of threads we spawn to match the device.
> > > 
> > > v2: Oversaturate the system with tasks to force execution on every HW
> > > thread; if the thread idles it is returned to the pool and may be reused
> > > again before an unused thread.
> > > 
> > > v3: Fix more state commands, which was causing Baytrail to barf.
> > 
> > CI is still not happy with byt right? or is that false positive?
> 
> After v3, ivb still failed.
>  
> > > v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge
> 
> Right now with the multiple pipecontrols around the PIPELINE_SELECT *and*
> STATE_BASE, CI has been happy for multiple runs. I was able to reproduce
> the same selftests failures and confirm that we do not see any of those
> failures in a thousand iterations. High level of confidence, but since
> we are dealing with empirical results with cross-referencing to mesa who
> also have seen similar undocumented failures, there's still an element
> of doubt as to whether it is truly watertight.
> 
> The CI results for this series passed on the all important ivb,byt,hsw.

great!

> 
> > > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
> > > Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts")
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > > Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> > > Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
> > > Cc: Jon Bloomfield <jon.bloomfield@intel.com>
> > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > Cc: Randy Wright <rwright@hpe.com>
> > > Cc: stable@vger.kernel.org # v5.7+
> > > ---
> > >  drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++---------
> > >  1 file changed, 94 insertions(+), 63 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > > index d93d85cd3027..f32a8e8040b2 100644
> > > --- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > > +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c
> > > @@ -7,8 +7,6 @@
> > >  #include "i915_drv.h"
> > >  #include "intel_gpu_commands.h"
> > >  
> > > -#define MAX_URB_ENTRIES 64
> > > -#define STATE_SIZE (4 * 1024)
> > >  #define GT3_INLINE_DATA_DELAYS 0x1E00
> > >  #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
> > >  
> > > @@ -34,38 +32,59 @@ struct batch_chunk {
> > >  };
> > >  
> > >  struct batch_vals {
> > > -     u32 max_primitives;
> > > -     u32 max_urb_entries;
> > > -     u32 cmd_size;
> > > -     u32 state_size;
> > > +     u32 max_threads;
> > >       u32 state_start;
> > > -     u32 batch_size;
> > > +     u32 surface_start;
> > >       u32 surface_height;
> > >       u32 surface_width;
> > > -     u32 scratch_size;
> > > -     u32 max_size;
> > > +     u32 size;
> > >  };
> > >  
> > > +static inline int num_primitives(const struct batch_vals *bv)
> > > +{
> > > +     /*
> > > +      * We need to saturate the GPU with work in order to dispatch
> > > +      * a shader on every HW thread, and clear the thread-local registers.
> > > +      * In short, we have to dispatch work faster than the shaders can
> > > +      * run in order to fill occupy each HW thread.
> > > +      */
> > > +     return bv->max_threads;
> > > +}
> > > +
> > >  static void
> > >  batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
> > >  {
> > >       if (IS_HASWELL(i915)) {
> > > -             bv->max_primitives = 280;
> > > -             bv->max_urb_entries = MAX_URB_ENTRIES;
> > > +             switch (INTEL_INFO(i915)->gt) {
> > > +             default:
> > > +             case 1:
> > > +                     bv->max_threads = 70;
> > > +                     break;
> > > +             case 2:
> > > +                     bv->max_threads = 140;
> > > +                     break;
> > > +             case 3:
> > > +                     bv->max_threads = 280;
> > > +                     break;
> > > +             }
> > >               bv->surface_height = 16 * 16;
> > >               bv->surface_width = 32 * 2 * 16;
> > >       } else {
> > > -             bv->max_primitives = 128;
> > > -             bv->max_urb_entries = MAX_URB_ENTRIES / 2;
> > > +             switch (INTEL_INFO(i915)->gt) {
> > > +             default:
> > > +             case 1: /* including vlv */
> > > +                     bv->max_threads = 36;
> > > +                     break;
> > > +             case 2:
> > > +                     bv->max_threads = 128;
> > > +                     break;
> > > +             }
> > >               bv->surface_height = 16 * 8;
> > >               bv->surface_width = 32 * 16;
> > 
> > all the values above match the spec.
> > 
> > >       }
> > > -     bv->cmd_size = bv->max_primitives * 4096;
> > > -     bv->state_size = STATE_SIZE;
> > > -     bv->state_start = bv->cmd_size;
> > > -     bv->batch_size = bv->cmd_size + bv->state_size;
> > > -     bv->scratch_size = bv->surface_height * bv->surface_width;
> > > -     bv->max_size = bv->batch_size + bv->scratch_size;
> > > +     bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
> > > +     bv->surface_start = bv->state_start + SZ_4K;
> > > +     bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
> > 
> > I liked this batch values simplification...
> > 
> > >  }
> > >  
> > >  static void batch_init(struct batch_chunk *bc,
> > > @@ -155,7 +174,8 @@ static u32
> > >  gen7_fill_binding_table(struct batch_chunk *state,
> > >                       const struct batch_vals *bv)
> > >  {
> > > -     u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv);
> > > +     u32 surface_start =
> > > +             gen7_fill_surface_state(state, bv->surface_start, bv);
> > >       u32 *cs = batch_alloc_items(state, 32, 8);
> > >       u32 offset = batch_offset(state, cs);
> > >  
> > > @@ -214,9 +234,9 @@ static void
> > >  gen7_emit_state_base_address(struct batch_chunk *batch,
> > >                            u32 surface_state_base)
> > >  {
> > > -     u32 *cs = batch_alloc_items(batch, 0, 12);
> > > +     u32 *cs = batch_alloc_items(batch, 0, 10);
> > >  
> > > -     *cs++ = STATE_BASE_ADDRESS | (12 - 2);
> > > +     *cs++ = STATE_BASE_ADDRESS | (10 - 2);
> > >       /* general */
> > >       *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
> > >       /* surface */
> > > @@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
> > >       *cs++ = BASE_ADDRESS_MODIFY;
> > >       *cs++ = 0;
> > >       *cs++ = BASE_ADDRESS_MODIFY;
> > > -     *cs++ = 0;
> > > -     *cs++ = 0;
> > 
> > why don't we need this anymore?
> 
> It was incorrect, gen7 is just (10-2). The last two were extraneous
> padding.
> 
> > >       batch_advance(batch, cs);
> > >  }
> > >  
> > > @@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
> > >                   u32 urb_size, u32 curbe_size,
> > >                   u32 mode)
> > >  {
> > > -     u32 urb_entries = bv->max_urb_entries;
> > > -     u32 threads = bv->max_primitives - 1;
> > > +     u32 threads = bv->max_threads - 1;
> > >       u32 *cs = batch_alloc_items(batch, 32, 8);
> > >  
> > >       *cs++ = MEDIA_VFE_STATE | (8 - 2);
> > > @@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
> > >       *cs++ = 0;
> > >  
> > >       /* number of threads & urb entries for GPGPU vs Media Mode */
> > > -     *cs++ = threads << 16 | urb_entries << 8 | mode << 2;
> > > +     *cs++ = threads << 16 | 1 << 8 | mode << 2;
> > >  
> > >       *cs++ = 0;
> > >  
> > > @@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
> > >  {
> > >       unsigned int x_offset = (media_object_index % 16) * 64;
> > >       unsigned int y_offset = (media_object_index / 16) * 16;
> > > -     unsigned int inline_data_size;
> > > -     unsigned int media_batch_size;
> > > -     unsigned int i;
> > > +     unsigned int pkt = 6 + 3;
> > >       u32 *cs;
> > >  
> > > -     inline_data_size = 112 * 8;
> > > -     media_batch_size = inline_data_size + 6;
> > > +     cs = batch_alloc_items(batch, 8, pkt);
> > >  
> > > -     cs = batch_alloc_items(batch, 8, media_batch_size);
> > > -
> > > -     *cs++ = MEDIA_OBJECT | (media_batch_size - 2);
> > > +     *cs++ = MEDIA_OBJECT | (pkt - 2);
> > >  
> > >       /* interface descriptor offset */
> > >       *cs++ = 0;
> > > @@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
> > >       *cs++ = 0;
> > >  
> > >       /* inline */
> > > -     *cs++ = (y_offset << 16) | (x_offset);
> > > +     *cs++ = y_offset << 16 | x_offset;
> > >       *cs++ = 0;
> > >       *cs++ = GT3_INLINE_DATA_DELAYS;
> > > -     for (i = 3; i < inline_data_size; i++)
> > > -             *cs++ = 0;
> > 
> > why?
> 
> We don't use the extra urb data, and worse the extra inline data slows
> down the CP to be slower than the thread dispatch. That was causing the 
> issue that the same HW thread was servicing multiple MEDIA_OBJECTS, and
> we did not then clear all the thread-local registers across the EU (as
> some threads never executed our shader). And that was the cause of the
> validation failures in v1.
> 
> [The first clue was that if we submitted a few more objects than
> threads with v1, it takes twice as long, and passes the validation test.
> Now, touch wood, it appears that we are able to saturate the HW threads
> with an equal number of objects, so every HW thread does exactly one
> iteration of the shader.]
> 
> > >       batch_advance(batch, cs);
> > >  }
> > >  
> > >  static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
> > >  {
> > > -     u32 *cs = batch_alloc_items(batch, 0, 5);
> > > +     u32 *cs = batch_alloc_items(batch, 0, 4);
> > >  
> > > -     *cs++ = GFX_OP_PIPE_CONTROL(5);
> > > -     *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
> > > -             PIPE_CONTROL_GLOBAL_GTT_IVB;
> > > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > > +     *cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> > > +             PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > > +             PIPE_CONTROL_DC_FLUSH_ENABLE |
> > > +             PIPE_CONTROL_CS_STALL;
> > >       *cs++ = 0;
> > >       *cs++ = 0;
> > > +
> > > +     batch_advance(batch, cs);
> > > +}
> > > +
> > > +static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
> > > +{
> > > +     u32 *cs = batch_alloc_items(batch, 0, 8);
> > > +
> > > +     /* ivb: Stall before STATE_CACHE_INVALIDATE */
> > > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > > +     *cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
> > > +             PIPE_CONTROL_CS_STALL;
> > >       *cs++ = 0;
> > > +     *cs++ = 0;
> > > +
> > > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > > +     *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
> > > +     *cs++ = 0;
> > > +     *cs++ = 0;
> > > +
> > >       batch_advance(batch, cs);
> > >  }
> > >  
> > > @@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
> > >                      const struct batch_vals *bv)
> > >  {
> > >       struct drm_i915_private *i915 = vma->vm->i915;
> > > -     unsigned int desc_count = 64;
> > > -     const u32 urb_size = 112;
> > > +     const unsigned int desc_count = 1;
> > > +     const unsigned int urb_size = 1;
> > >       struct batch_chunk cmds, state;
> > > -     u32 interface_descriptor;
> > > +     u32 descriptors;
> > >       unsigned int i;
> > >  
> > > -     batch_init(&cmds, vma, start, 0, bv->cmd_size);
> > > -     batch_init(&state, vma, start, bv->state_start, bv->state_size);
> > > +     batch_init(&cmds, vma, start, 0, bv->state_start);
> > > +     batch_init(&state, vma, start, bv->state_start, SZ_4K);
> > >  
> > > -     interface_descriptor =
> > > -             gen7_fill_interface_descriptor(&state, bv,
> > > -                                            IS_HASWELL(i915) ?
> > > -                                            &cb_kernel_hsw :
> > > -                                            &cb_kernel_ivb,
> > > -                                            desc_count);
> > > -     gen7_emit_pipeline_flush(&cmds);
> > > +     descriptors = gen7_fill_interface_descriptor(&state, bv,
> > > +                                                  IS_HASWELL(i915) ?
> > > +                                                  &cb_kernel_hsw :
> > > +                                                  &cb_kernel_ivb,
> > > +                                                  desc_count);
> > > +
> > > +     gen7_emit_pipeline_invalidate(&cmds);
> > >       batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
> > >       batch_add(&cmds, MI_NOOP);
> > > -     gen7_emit_state_base_address(&cmds, interface_descriptor);
> > > +     gen7_emit_pipeline_invalidate(&cmds);
> > > +
> > >       gen7_emit_pipeline_flush(&cmds);
> > > +     gen7_emit_state_base_address(&cmds, descriptors);
> > > +     gen7_emit_pipeline_invalidate(&cmds);
> > 
> > why do we need double invalidate?
> 
> Empirical results. We need the flush before STATE_BASE otherwise there
> were lost writes; mesa has had a similar experience with needing a
> magical flush before. The invalidate afterwards is similarly required by
> the HW.
> 
> The invalidate before the PIPELINE_SELECT is mandatory in bspec for MEDIA,
> and vouched for by our CI results. The one after the PIPELINE_SELECT does
> not appear in the docs, yet is preferred by CI.
> 
> It's this combination of flush/invalidate that finally worked on all
> three gen7 platforms, but there's almost definitely a more optimal set of
> pipecontrols.

okay. Let's move forward with this then. It is better than reverting the whole
mitigation fix, and we get something that unblocks users.


Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

> -Chris

Thanks for all the clarifications,
Rodrigo.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations
  2021-01-11 20:58     ` Abodunrin, Akeem G
  (?)
@ 2021-01-11 21:10     ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2021-01-11 21:10 UTC (permalink / raw)
  To: Abodunrin, Akeem G, intel-gfx; +Cc: stable

Quoting Abodunrin, Akeem G (2021-01-11 20:58:42)
> 
> 
> > -----Original Message-----
> > From: Intel-gfx <intel-gfx-bounces@lists.freedesktop.org> On Behalf Of Chris
> > Wilson
> > Sent: Sunday, January 10, 2021 7:04 AM
> > To: intel-gfx@lists.freedesktop.org
> > Cc: stable@vger.kernel.org; Chris Wilson <chris@chris-wilson.co.uk>
> > Subject: [Intel-gfx] [PATCH 03/11] drm/i915: Allow the sysadmin to override
> > security mitigations
> > 
> > The clear-residuals mitigation is a relatively heavy hammer and under some
> > circumstances the user may wish to forgo the context isolation in order to
> > meet some performance requirement. Introduce a generic module parameter
> > to allow selectively enabling/disabling different mitigations.

> Although this seems like an ideal solution - giving users the option to choose *potential* performance over security or vice versa - I would have expected this patch to add a DRM warning to inform users of the consequences of their action whenever the module parameter is used to disable any kind of mitigation. Well, that is my own perspective, not that of a legal expert.

It's marked as unsafe; setting this parameter will issue a notice and
taint the kernel. That should be enough to warn of the consequences of
their actions, without going into the gruesome details.

I very briefly considered a few pr_warn_once() for each disabled
mitigation, but I am not sure what we should say to the user.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2021-01-11 21:11 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-10 15:03 [PATCH 01/11] drm/i915/gt: Limit VFE threads based on GT Chris Wilson
2021-01-10 15:03 ` [Intel-gfx] " Chris Wilson
2021-01-10 15:03 ` [Intel-gfx] [PATCH 02/11] drm/i915/gt: Restore clear-residual mitigations for Ivybridge, Baytrail Chris Wilson
2021-01-11 17:35   ` Rodrigo Vivi
2021-01-10 15:03 ` [PATCH 03/11] drm/i915: Allow the sysadmin to override security mitigations Chris Wilson
2021-01-10 15:03   ` [Intel-gfx] " Chris Wilson
2021-01-11 17:31   ` Bloomfield, Jon
2021-01-11 17:31     ` [Intel-gfx] " Bloomfield, Jon
2021-01-11 17:48   ` Rodrigo Vivi
2021-01-11 17:48     ` Rodrigo Vivi
2021-01-11 20:58   ` Abodunrin, Akeem G
2021-01-11 20:58     ` Abodunrin, Akeem G
2021-01-11 21:10     ` Chris Wilson
2021-01-10 15:03 ` [Intel-gfx] [PATCH 04/11] drm/i915/gt: Rearrange vlv workarounds Chris Wilson
2021-01-10 15:03 ` [Intel-gfx] [PATCH 05/11] drm/i915/gt: Rearrange ivb workarounds Chris Wilson
2021-01-10 15:03 ` [Intel-gfx] [PATCH 06/11] drm/i915/gt: Replace open-coded intel_engine_stop_cs() Chris Wilson
2021-01-10 15:04 ` [Intel-gfx] [PATCH 07/11] drm/i915/gt: Reapply ppgtt enabling after engine resets Chris Wilson
2021-01-10 15:04 ` [Intel-gfx] [PATCH 08/11] drm/i915/gt: Lift stop_ring() to reset_prepare Chris Wilson
2021-01-10 15:04 ` [Intel-gfx] [PATCH 09/11] drm/i915/gt: Pull ring submission resume under its caller forcewake Chris Wilson
2021-01-10 15:04 ` [Intel-gfx] [PATCH 10/11] drm/i915/selftests: Prepare the selftests for engine resets with ring submission Chris Wilson
2021-01-10 15:04 ` [Intel-gfx] [PATCH 11/11] drm/i915: Mark per-engine-reset as supported on gen7 Chris Wilson
2021-01-10 15:35 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/11] drm/i915/gt: Limit VFE threads based on GT Patchwork
2021-01-10 15:35 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
2021-01-10 16:05 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
2021-01-11 17:35 ` [Intel-gfx] [PATCH 01/11] " Rodrigo Vivi
2021-01-11 17:35   ` Rodrigo Vivi
2021-01-11 20:51   ` Chris Wilson
2021-01-11 20:51     ` Chris Wilson
2021-01-11 21:04     ` Rodrigo Vivi
2021-01-11 21:04       ` Rodrigo Vivi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.