All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang
@ 2016-09-30  7:50 Chris Wilson
  2016-09-30  7:50 ` [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU Chris Wilson
                   ` (4 more replies)
  0 siblings, 5 replies; 10+ messages in thread
From: Chris Wilson @ 2016-09-30  7:50 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

On Braswell, at least, we observe that the context image is written in
multiple phases. The first phase is to clear the register state, and
subsequently rewrite it. A GPU reset at the right moment can interrupt
the context update leaving it corrupt, and our update of the RING_HEAD
is not sufficient to restart the engine afterwards. To recover, we need
to reset the registers back to their original values. The context state
is lost. What we need is a better mechanism to serialise the reset with
pending flushes from the GPU.

Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 95 +++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 2d8eb2eb2b72..d6e762718ff4 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -226,10 +226,16 @@ enum {
 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
 
+#define WA_TAIL_DWORDS 2
+
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 					    struct intel_engine_cs *engine);
 static int intel_lr_context_pin(struct i915_gem_context *ctx,
 				struct intel_engine_cs *engine);
+static void execlists_init_reg_state(u32 *reg_state,
+				     struct i915_gem_context *ctx,
+				     struct intel_engine_cs *engine,
+				     struct intel_ring *ring);
 
 /**
  * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
@@ -707,7 +713,6 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
 {
 	struct intel_context *ce = &ctx->engine[engine->id];
 	void *vaddr;
-	u32 *lrc_reg_state;
 	int ret;
 
 	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
@@ -726,17 +731,16 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
 		goto unpin_vma;
 	}
 
-	lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
-
 	ret = intel_ring_pin(ce->ring);
 	if (ret)
 		goto unpin_map;
 
 	intel_lr_context_descriptor_update(ctx, engine);
 
-	lrc_reg_state[CTX_RING_BUFFER_START+1] =
+	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
+	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
 		i915_ggtt_offset(ce->ring->vma);
-	ce->lrc_reg_state = lrc_reg_state;
+
 	ce->state->obj->dirty = true;
 
 	/* Invalidate GuC TLB. */
@@ -1284,8 +1288,14 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	struct execlist_port *port = engine->execlist_port;
 	struct intel_context *ce = &request->ctx->engine[engine->id];
 
+	execlists_init_reg_state(ce->lrc_reg_state,
+				 request->ctx, engine, ce->ring);
+
 	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
+	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
+		i915_ggtt_offset(ce->ring->vma);
 	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
+
 	request->ring->head = request->postfix;
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
@@ -1305,6 +1315,9 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	GEM_BUG_ON(request->ctx != port[0].request->ctx);
 	port[0].count = 0;
 	port[1].count = 0;
+
+	/* Reset WaIdleLiteRestore:bdw,skl as well */
+	request->tail = request->wa_tail - WA_TAIL_DWORDS * sizeof(u32);
 }
 
 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
@@ -1542,7 +1555,6 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
  * used as a workaround for not being allowed to do lite
  * restore with HEAD==TAIL (WaIdleLiteRestore).
  */
-#define WA_TAIL_DWORDS 2
 
 static int gen8_emit_request(struct drm_i915_gem_request *request)
 {
@@ -1889,38 +1901,13 @@ static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
 	return indirect_ctx_offset;
 }
 
-static int
-populate_lr_context(struct i915_gem_context *ctx,
-		    struct drm_i915_gem_object *ctx_obj,
-		    struct intel_engine_cs *engine,
-		    struct intel_ring *ring)
+static void execlists_init_reg_state(u32 *reg_state,
+				     struct i915_gem_context *ctx,
+				     struct intel_engine_cs *engine,
+				     struct intel_ring *ring)
 {
-	struct drm_i915_private *dev_priv = ctx->i915;
-	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
-	void *vaddr;
-	u32 *reg_state;
-	int ret;
-
-	if (!ppgtt)
-		ppgtt = dev_priv->mm.aliasing_ppgtt;
-
-	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
-		return ret;
-	}
-
-	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
-	if (IS_ERR(vaddr)) {
-		ret = PTR_ERR(vaddr);
-		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
-		return ret;
-	}
-	ctx_obj->dirty = true;
-
-	/* The second page of the context object contains some fields which must
-	 * be set up prior to the first execution. */
-	reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
 
 	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
 	 * commands followed by (reg, value) pairs. The values we are setting here are
@@ -1934,7 +1921,7 @@ populate_lr_context(struct i915_gem_context *ctx,
 		       _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
 					  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
 					  (HAS_RESOURCE_STREAMER(dev_priv) ?
-					    CTX_CTRL_RS_CTX_ENABLE : 0)));
+					   CTX_CTRL_RS_CTX_ENABLE : 0)));
 	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(engine->mmio_base),
 		       0);
 	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(engine->mmio_base),
@@ -1946,7 +1933,7 @@ populate_lr_context(struct i915_gem_context *ctx,
 		       RING_START(engine->mmio_base), 0);
 	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL,
 		       RING_CTL(engine->mmio_base),
-		       ((ring->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
+		       (ring->size - PAGE_SIZE) | RING_VALID);
 	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U,
 		       RING_BBADDR_UDW(engine->mmio_base), 0);
 	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L,
@@ -2024,6 +2011,36 @@ populate_lr_context(struct i915_gem_context *ctx,
 		ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
 			       make_rpcs(dev_priv));
 	}
+}
+
+static int
+populate_lr_context(struct i915_gem_context *ctx,
+		    struct drm_i915_gem_object *ctx_obj,
+		    struct intel_engine_cs *engine,
+		    struct intel_ring *ring)
+{
+	void *vaddr;
+	int ret;
+
+	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
+		return ret;
+	}
+
+	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
+	if (IS_ERR(vaddr)) {
+		ret = PTR_ERR(vaddr);
+		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
+		return ret;
+	}
+	ctx_obj->dirty = true;
+
+	/* The second page of the context object contains some fields which must
+	 * be set up prior to the first execution. */
+
+	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
+				 ctx, engine, ring);
 
 	i915_gem_object_unpin_map(ctx_obj);
 
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU
  2016-09-30  7:50 [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Chris Wilson
@ 2016-09-30  7:50 ` Chris Wilson
  2016-09-30  8:26   ` Mika Kuoppala
  2016-09-30  7:50 ` [PATCH 3/3] drm/i915: Show RING registers through debugfs Chris Wilson
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2016-09-30  7:50 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

When we reset the GPU, we want to reinitialise it from a known state. In
order to do so, we have to flush any pending operations from the irq
handler and its tasklets. Add a missing synchronize_irq().

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1418c1c522cb..d037da8ea449 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2581,8 +2581,6 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	struct i915_gem_context *incomplete_ctx;
 	bool ring_hung;
 
-	/* Ensure irq handler finishes, and not run again. */
-	tasklet_kill(&engine->irq_tasklet);
 	if (engine->irq_seqno_barrier)
 		engine->irq_seqno_barrier(engine);
 
@@ -2624,6 +2622,11 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
 
 	i915_gem_retire_requests(dev_priv);
 
+	/* Ensure irq handler finishes, and not run again. */
+	synchronize_irq(dev_priv->drm.irq);
+	for_each_engine(engine, dev_priv)
+		tasklet_kill(&engine->irq_tasklet);
+
 	for_each_engine(engine, dev_priv)
 		i915_gem_reset_engine(engine);
 
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/3] drm/i915: Show RING registers through debugfs
  2016-09-30  7:50 [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Chris Wilson
  2016-09-30  7:50 ` [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU Chris Wilson
@ 2016-09-30  7:50 ` Chris Wilson
  2016-09-30  8:20 ` ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang Patchwork
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2016-09-30  7:50 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Knowing where the RINGs are pointing is extremely useful in diagnosing
if the engines are executing the ringbuffers you expect - and igt may be
suppressing the usual method of looking in the GPU error state.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 178 ++++++++++++++++++++----------------
 1 file changed, 99 insertions(+), 79 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 4fb9d829884e..81e4f12450d9 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2036,84 +2036,6 @@ static int i915_dump_lrc(struct seq_file *m, void *unused)
 	return 0;
 }
 
-static int i915_execlists(struct seq_file *m, void *data)
-{
-	struct drm_i915_private *dev_priv = node_to_i915(m->private);
-	struct drm_device *dev = &dev_priv->drm;
-	struct intel_engine_cs *engine;
-	u32 status_pointer;
-	u8 read_pointer;
-	u8 write_pointer;
-	u32 status;
-	u32 ctx_id;
-	struct list_head *cursor;
-	int i, ret;
-
-	if (!i915.enable_execlists) {
-		seq_puts(m, "Logical Ring Contexts are disabled\n");
-		return 0;
-	}
-
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
-
-	intel_runtime_pm_get(dev_priv);
-
-	for_each_engine(engine, dev_priv) {
-		struct drm_i915_gem_request *head_req = NULL;
-		int count = 0;
-
-		seq_printf(m, "%s\n", engine->name);
-
-		status = I915_READ(RING_EXECLIST_STATUS_LO(engine));
-		ctx_id = I915_READ(RING_EXECLIST_STATUS_HI(engine));
-		seq_printf(m, "\tExeclist status: 0x%08X, context: %u\n",
-			   status, ctx_id);
-
-		status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
-		seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
-
-		read_pointer = GEN8_CSB_READ_PTR(status_pointer);
-		write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
-		if (read_pointer > write_pointer)
-			write_pointer += GEN8_CSB_ENTRIES;
-		seq_printf(m, "\tRead pointer: 0x%08X, write pointer 0x%08X\n",
-			   read_pointer, write_pointer);
-
-		for (i = 0; i < GEN8_CSB_ENTRIES; i++) {
-			status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, i));
-			ctx_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, i));
-
-			seq_printf(m, "\tStatus buffer %d: 0x%08X, context: %u\n",
-				   i, status, ctx_id);
-		}
-
-		spin_lock_bh(&engine->execlist_lock);
-		list_for_each(cursor, &engine->execlist_queue)
-			count++;
-		head_req = list_first_entry_or_null(&engine->execlist_queue,
-						    struct drm_i915_gem_request,
-						    execlist_link);
-		spin_unlock_bh(&engine->execlist_lock);
-
-		seq_printf(m, "\t%d requests in queue\n", count);
-		if (head_req) {
-			seq_printf(m, "\tHead request context: %u\n",
-				   head_req->ctx->hw_id);
-			seq_printf(m, "\tHead request tail: %u\n",
-				   head_req->tail);
-		}
-
-		seq_putc(m, '\n');
-	}
-
-	intel_runtime_pm_put(dev_priv);
-	mutex_unlock(&dev->struct_mutex);
-
-	return 0;
-}
-
 static const char *swizzle_string(unsigned swizzle)
 {
 	switch (swizzle) {
@@ -3135,6 +3057,104 @@ static int i915_display_info(struct seq_file *m, void *unused)
 	return 0;
 }
 
+static int i915_engine_info(struct seq_file *m, void *unused)
+{
+	struct drm_i915_private *dev_priv = node_to_i915(m->private);
+	struct intel_engine_cs *engine;
+	int ret;
+
+	ret = mutex_lock_interruptible(&dev_priv->drm.struct_mutex);
+	if (ret)
+		return ret;
+
+	for_each_engine(engine, dev_priv) {
+		struct intel_breadcrumbs *b = &engine->breadcrumbs;
+		struct drm_i915_gem_request *rq;
+		struct rb_node *rb;
+		int count;
+
+		seq_printf(m, "%s\n", engine->name);
+		seq_printf(m, "\tcurrent seqno %d, last %d\n",
+			   intel_engine_get_seqno(engine),
+			   engine->last_submitted_seqno);
+
+		count = 0;
+		list_for_each_entry(rq, &engine->request_list, link)
+			count++;
+		seq_printf(m, "\t%d submitted requests\n", count);
+
+		if (count > 0 ) {
+			rq = list_first_entry(&engine->request_list,
+					      struct drm_i915_gem_request, link);
+			seq_printf(m, "\t\tfirst  %d\n", rq->fence.seqno);
+
+			rq = list_last_entry(&engine->request_list,
+					     struct drm_i915_gem_request, link);
+			seq_printf(m, "\t\tlast   %d\n", rq->fence.seqno);
+		}
+
+		rq = i915_gem_find_active_request(engine);
+		if (rq)
+			seq_printf(m, "\t\tactive %d\n", rq->fence.seqno);
+
+		seq_printf(m, "\tRING_START: 0x%08x [0x%08x]\n",
+			   I915_READ(RING_START(engine->mmio_base)),
+			   rq ? i915_ggtt_offset(rq->ring->vma) : 0);
+		seq_printf(m, "\tRING_HEAD: 0x%04x [0x%04x]\n",
+			   I915_READ(RING_HEAD(engine->mmio_base)) & HEAD_ADDR,
+			   rq ? rq->ring->head : 0);
+		seq_printf(m, "\tRING_TAIL: 0x%04x [0x%04x]\n",
+			   I915_READ(RING_TAIL(engine->mmio_base)) & TAIL_ADDR,
+			   rq ? rq->ring->tail : 0);
+
+		if (i915.enable_execlists) {
+			u32 ptr, read, write;
+
+			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
+				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
+				   I915_READ(RING_EXECLIST_STATUS_HI(engine)));
+
+			ptr = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
+			read = GEN8_CSB_READ_PTR(ptr);
+			write = GEN8_CSB_WRITE_PTR(ptr);
+			seq_printf(m, "\tExeclist CSB read %d, write %d\n",
+				   read, write);
+
+			if (read > write)
+				write += GEN8_CSB_ENTRIES;
+			while (read < write) {
+				unsigned int idx = ++read % GEN8_CSB_ENTRIES;
+
+				seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
+					   idx,
+					   I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
+					   I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)));
+			}
+		} else if (INTEL_GEN(dev_priv) > 6) {
+			seq_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
+				   I915_READ(RING_PP_DIR_BASE(engine)));
+			seq_printf(m, "\tPP_DIR_BASE_READ: 0x%08x\n",
+				   I915_READ(RING_PP_DIR_BASE_READ(engine)));
+			seq_printf(m, "\tPP_DIR_DCLV: 0x%08x\n",
+				   I915_READ(RING_PP_DIR_DCLV(engine)));
+		}
+
+		spin_lock(&b->lock);
+		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
+			struct intel_wait *w = container_of(rb, typeof(*w), node);
+
+			seq_printf(m, "\t%s [%d] waiting for %d\n",
+				   w->tsk->comm, w->tsk->pid, w->seqno);
+		}
+		spin_unlock(&b->lock);
+
+		seq_printf(m, "\n");
+	}
+
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+	return 0;
+}
+
 static int i915_semaphore_status(struct seq_file *m, void *unused)
 {
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
@@ -4441,7 +4461,6 @@ static const struct drm_info_list i915_debugfs_list[] = {
 	{"i915_gem_framebuffer", i915_gem_framebuffer_info, 0},
 	{"i915_context_status", i915_context_status, 0},
 	{"i915_dump_lrc", i915_dump_lrc, 0},
-	{"i915_execlists", i915_execlists, 0},
 	{"i915_forcewake_domains", i915_forcewake_domains, 0},
 	{"i915_swizzle_info", i915_swizzle_info, 0},
 	{"i915_ppgtt_info", i915_ppgtt_info, 0},
@@ -4453,6 +4472,7 @@ static const struct drm_info_list i915_debugfs_list[] = {
 	{"i915_power_domain_info", i915_power_domain_info, 0},
 	{"i915_dmc_info", i915_dmc_info, 0},
 	{"i915_display_info", i915_display_info, 0},
+	{"i915_engine_info", i915_engine_info, 0},
 	{"i915_semaphore_status", i915_semaphore_status, 0},
 	{"i915_shared_dplls_info", i915_shared_dplls_info, 0},
 	{"i915_dp_mst_info", i915_dp_mst_info, 0},
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang
  2016-09-30  7:50 [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Chris Wilson
  2016-09-30  7:50 ` [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU Chris Wilson
  2016-09-30  7:50 ` [PATCH 3/3] drm/i915: Show RING registers through debugfs Chris Wilson
@ 2016-09-30  8:20 ` Patchwork
  2016-09-30 13:18 ` ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang (rev2) Patchwork
  2016-10-03 12:25 ` [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Mika Kuoppala
  4 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2016-09-30  8:20 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang
URL   : https://patchwork.freedesktop.org/series/13117/
State : warning

== Summary ==

Series 13117v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/13117/revisions/1/mbox/

Test kms_pipe_crc_basic:
        Subgroup hang-read-crc-pipe-b:
                pass       -> DMESG-WARN (fi-ilk-650)
        Subgroup suspend-read-crc-pipe-a:
                dmesg-warn -> PASS       (fi-byt-j1900)

fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
fi-bsw-n3050     total:244  pass:202  dwarn:0   dfail:0   fail:0   skip:42 
fi-bxt-t5700     total:244  pass:214  dwarn:0   dfail:0   fail:0   skip:30 
fi-byt-j1900     total:244  pass:211  dwarn:1   dfail:0   fail:1   skip:31 
fi-byt-n2820     total:244  pass:208  dwarn:0   dfail:0   fail:1   skip:35 
fi-hsw-4770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-hsw-4770r     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-ilk-650       total:244  pass:181  dwarn:1   dfail:0   fail:2   skip:60 
fi-ivb-3520m     total:244  pass:219  dwarn:0   dfail:0   fail:0   skip:25 
fi-ivb-3770      total:244  pass:207  dwarn:0   dfail:0   fail:0   skip:37 
fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
fi-skl-6700hq    total:244  pass:221  dwarn:1   dfail:0   fail:0   skip:22 
fi-skl-6700k     total:244  pass:219  dwarn:1   dfail:0   fail:0   skip:24 
fi-skl-6770hq    total:244  pass:228  dwarn:1   dfail:0   fail:1   skip:14 
fi-snb-2520m     total:244  pass:208  dwarn:0   dfail:0   fail:0   skip:36 
fi-snb-2600      total:244  pass:207  dwarn:0   dfail:0   fail:0   skip:37 

Results at /archive/results/CI_IGT_test/Patchwork_2599/

2fbc239494fa3883e164cc89d35f54d22f0c9e1f drm-intel-nightly: 2016y-09m-29d-13h-31m-39s UTC integration manifest
409c05a drm/i915: Show RING registers through debugfs
998d90f drm/i915: Call synchronize_irq() after resetting the GPU
016f45d drm/i915/execlists: Reinitialise context image after GPU hang

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU
  2016-09-30  7:50 ` [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU Chris Wilson
@ 2016-09-30  8:26   ` Mika Kuoppala
  2016-09-30  8:38     ` Chris Wilson
  2016-09-30 11:25     ` [PATCH] drm/i915: Disable irqs across GPU reset Chris Wilson
  0 siblings, 2 replies; 10+ messages in thread
From: Mika Kuoppala @ 2016-09-30  8:26 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> When we reset the GPU, we want to reinitialise it from a known state. In
> order to do so, we have to flush any pending operations from the irq
> handler and its tasklets. Add a missing synchronize_irq().
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1418c1c522cb..d037da8ea449 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2581,8 +2581,6 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
>  	struct i915_gem_context *incomplete_ctx;
>  	bool ring_hung;
>  
> -	/* Ensure irq handler finishes, and not run again. */
> -	tasklet_kill(&engine->irq_tasklet);
>  	if (engine->irq_seqno_barrier)
>  		engine->irq_seqno_barrier(engine);
>  
> @@ -2624,6 +2622,11 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
>  
>  	i915_gem_retire_requests(dev_priv);
>  
> +	/* Ensure irq handler finishes, and not run again. */
> +	synchronize_irq(dev_priv->drm.irq);
> +	for_each_engine(engine, dev_priv)
> +		tasklet_kill(&engine->irq_tasklet);
> +

But you still might get one tasklet run if there was an interrupt pending,
post reset?

How about this kind of ordering:

engine->disable_irq();
synchronize_irq();
tasklet_kill(engine);

intel_gpu_reset();
i915_gem_reset();

Perhaps I am paranoid but I want to guarantee the frozen state
before we hit the reset button.

-Mika


>  	for_each_engine(engine, dev_priv)
>  		i915_gem_reset_engine(engine);
>  
> -- 
> 2.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU
  2016-09-30  8:26   ` Mika Kuoppala
@ 2016-09-30  8:38     ` Chris Wilson
  2016-09-30 11:25     ` [PATCH] drm/i915: Disable irqs across GPU reset Chris Wilson
  1 sibling, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2016-09-30  8:38 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Fri, Sep 30, 2016 at 11:26:55AM +0300, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > When we reset the GPU, we want to reinitialise it from a known state. In
> > order to do so, we have to flush any pending operations from the irq
> > handler and its tasklets. Add a missing synchronize_irq().
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c | 7 +++++--
> >  1 file changed, 5 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 1418c1c522cb..d037da8ea449 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -2581,8 +2581,6 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
> >  	struct i915_gem_context *incomplete_ctx;
> >  	bool ring_hung;
> >  
> > -	/* Ensure irq handler finishes, and not run again. */
> > -	tasklet_kill(&engine->irq_tasklet);
> >  	if (engine->irq_seqno_barrier)
> >  		engine->irq_seqno_barrier(engine);
> >  
> > @@ -2624,6 +2622,11 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
> >  
> >  	i915_gem_retire_requests(dev_priv);
> >  
> > +	/* Ensure irq handler finishes, and not run again. */
> > +	synchronize_irq(dev_priv->drm.irq);
> > +	for_each_engine(engine, dev_priv)
> > +		tasklet_kill(&engine->irq_tasklet);
> > +
> 
> But you still might get one tasklet run if there was interrupt pending,
> post reset?
> 
> How about this kind of ordering:
> 
> engine->disable_irq();
> synchronize_irq();
> tasklet_kill(engine);
> 
> intel_gpu_reset();
> i915_gem_reset();
> 
> Perhaps I am paranoid but I want to guarantee the frozen state
> before we hit the reset button.

Yeah, we may as well bite the bullet and uninstall the irq around the
reset. We don't yet have a callback to simply mask the engines/execlists.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] drm/i915: Disable irqs across GPU reset
  2016-09-30  8:26   ` Mika Kuoppala
  2016-09-30  8:38     ` Chris Wilson
@ 2016-09-30 11:25     ` Chris Wilson
  2016-10-03 12:42       ` Mika Kuoppala
  1 sibling, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2016-09-30 11:25 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Whilst we reset the GPU, we want to prevent execlists from submitting
new work (which it does via an interrupt handler). To achieve this we
disable the irq (and drain the irq tasklet) around the reset. When we
enable it again afterwards, the interrupt queue should be empty and we can
reinitialise from a known state without fear of the tasklet running
concurrently.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c | 19 +++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem.c |  2 --
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 31b2b6300d8d..7ce498898217 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1728,6 +1728,21 @@ int i915_resume_switcheroo(struct drm_device *dev)
 	return i915_drm_resume(dev);
 }
 
+static void disable_engines_irq(struct drm_i915_private *dev_priv)
+{
+	struct intel_engine_cs *engine;
+
+	/* Ensure irq handler finishes, and not run again. */
+	disable_irq(dev_priv->drm.irq);
+	for_each_engine(engine, dev_priv)
+		tasklet_kill(&engine->irq_tasklet);
+}
+
+static void enable_engines_irq(struct drm_i915_private *dev_priv)
+{
+	enable_irq(dev_priv->drm.irq);
+}
+
 /**
  * i915_reset - reset chip after a hang
  * @dev: drm device to reset
@@ -1761,7 +1776,11 @@ void i915_reset(struct drm_i915_private *dev_priv)
 	error->reset_count++;
 
 	pr_notice("drm/i915: Resetting chip after gpu hang\n");
+
+	disable_engines_irq(dev_priv);
 	ret = intel_gpu_reset(dev_priv, ALL_ENGINES);
+	enable_engines_irq(dev_priv);
+
 	if (ret) {
 		if (ret != -ENODEV)
 			DRM_ERROR("Failed to reset chip: %i\n", ret);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1418c1c522cb..0cae8acdf906 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2581,8 +2581,6 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	struct i915_gem_context *incomplete_ctx;
 	bool ring_hung;
 
-	/* Ensure irq handler finishes, and not run again. */
-	tasklet_kill(&engine->irq_tasklet);
 	if (engine->irq_seqno_barrier)
 		engine->irq_seqno_barrier(engine);
 
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang (rev2)
  2016-09-30  7:50 [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Chris Wilson
                   ` (2 preceding siblings ...)
  2016-09-30  8:20 ` ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang Patchwork
@ 2016-09-30 13:18 ` Patchwork
  2016-10-03 12:25 ` [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Mika Kuoppala
  4 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2016-09-30 13:18 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang (rev2)
URL   : https://patchwork.freedesktop.org/series/13117/
State : warning

== Summary ==

Series 13117v2 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/13117/revisions/2/mbox/

Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-a:
                pass       -> DMESG-WARN (fi-byt-j1900)
        Subgroup suspend-read-crc-pipe-b:
                pass       -> DMESG-WARN (fi-byt-j1900)

fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
fi-bsw-n3050     total:244  pass:202  dwarn:0   dfail:0   fail:0   skip:42 
fi-bxt-t5700     total:244  pass:214  dwarn:0   dfail:0   fail:0   skip:30 
fi-byt-j1900     total:244  pass:210  dwarn:2   dfail:0   fail:1   skip:31 
fi-byt-n2820     total:244  pass:208  dwarn:0   dfail:0   fail:1   skip:35 
fi-hsw-4770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-hsw-4770r     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-ilk-650       total:244  pass:182  dwarn:0   dfail:0   fail:2   skip:60 
fi-ivb-3520m     total:244  pass:219  dwarn:0   dfail:0   fail:0   skip:25 
fi-ivb-3770      total:244  pass:207  dwarn:0   dfail:0   fail:0   skip:37 
fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
fi-skl-6700hq    total:244  pass:221  dwarn:1   dfail:0   fail:0   skip:22 
fi-skl-6700k     total:244  pass:219  dwarn:1   dfail:0   fail:0   skip:24 
fi-skl-6770hq    total:244  pass:228  dwarn:1   dfail:0   fail:1   skip:14 
fi-snb-2520m     total:244  pass:208  dwarn:0   dfail:0   fail:0   skip:36 
fi-snb-2600      total:244  pass:207  dwarn:0   dfail:0   fail:0   skip:37 

Results at /archive/results/CI_IGT_test/Patchwork_2602/

d0534bd0217c83c083ba146b9c987e19b219e0e4 drm-intel-nightly: 2016y-09m-30d-10h-31m-00s UTC integration manifest
7ad3392 drm/i915: Show RING registers through debugfs
50d76f3 drm/i915: Disable irqs across GPU reset
8dbb562 drm/i915/execlists: Reinitialise context image after GPU hang

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang
  2016-09-30  7:50 [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Chris Wilson
                   ` (3 preceding siblings ...)
  2016-09-30 13:18 ` ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang (rev2) Patchwork
@ 2016-10-03 12:25 ` Mika Kuoppala
  4 siblings, 0 replies; 10+ messages in thread
From: Mika Kuoppala @ 2016-10-03 12:25 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Braswell, at least, we observe that the context image is written in
> multiple phases. The first phase is to clear the register state, and
> subsequently rewrite it. A GPU reset at the right moment can interrupt
> the context update leaving it corrupt, and our update of the RING_HEAD
> is not sufficient to restart the engine afterwards. To recover, we need
> to reset the registers back to their original values. The context state
> is lost. What we need is a better mechanism to serialise the reset with
> pending flushes from the GPU.
>
> Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 95 +++++++++++++++++++++++-----------------
>  1 file changed, 56 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 2d8eb2eb2b72..d6e762718ff4 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -226,10 +226,16 @@ enum {
>  /* Typical size of the average request (2 pipecontrols and a MI_BB) */
>  #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
>  
> +#define WA_TAIL_DWORDS 2
> +
>  static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
>  					    struct intel_engine_cs *engine);
>  static int intel_lr_context_pin(struct i915_gem_context *ctx,
>  				struct intel_engine_cs *engine);
> +static void execlists_init_reg_state(u32 *reg_state,
> +				     struct i915_gem_context *ctx,
> +				     struct intel_engine_cs *engine,
> +				     struct intel_ring *ring);
>  
>  /**
>   * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
> @@ -707,7 +713,6 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
>  {
>  	struct intel_context *ce = &ctx->engine[engine->id];
>  	void *vaddr;
> -	u32 *lrc_reg_state;
>  	int ret;
>  
>  	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> @@ -726,17 +731,16 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
>  		goto unpin_vma;
>  	}
>  
> -	lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> -
>  	ret = intel_ring_pin(ce->ring);
>  	if (ret)
>  		goto unpin_map;
>  
>  	intel_lr_context_descriptor_update(ctx, engine);
>  
> -	lrc_reg_state[CTX_RING_BUFFER_START+1] =
> +	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> +	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
>  		i915_ggtt_offset(ce->ring->vma);
> -	ce->lrc_reg_state = lrc_reg_state;
> +
>  	ce->state->obj->dirty = true;
>  
>  	/* Invalidate GuC TLB. */
> @@ -1284,8 +1288,14 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	struct execlist_port *port = engine->execlist_port;
>  	struct intel_context *ce = &request->ctx->engine[engine->id];
>  
> +	execlists_init_reg_state(ce->lrc_reg_state,
> +				 request->ctx, engine, ce->ring);
> +
>  	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> +	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> +		i915_ggtt_offset(ce->ring->vma);
>  	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
> +
>  	request->ring->head = request->postfix;
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
> @@ -1305,6 +1315,9 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	GEM_BUG_ON(request->ctx != port[0].request->ctx);
>  	port[0].count = 0;
>  	port[1].count = 0;
> +
> +	/* Reset WaIdleLiteRestore:bdw,skl as well */
> +	request->tail = request->wa_tail - WA_TAIL_DWORDS * sizeof(u32);
>  }
>  
>  static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
> @@ -1542,7 +1555,6 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
>   * used as a workaround for not being allowed to do lite
>   * restore with HEAD==TAIL (WaIdleLiteRestore).
>   */
> -#define WA_TAIL_DWORDS 2
>  
>  static int gen8_emit_request(struct drm_i915_gem_request *request)
>  {
> @@ -1889,38 +1901,13 @@ static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
>  	return indirect_ctx_offset;
>  }
>  
> -static int
> -populate_lr_context(struct i915_gem_context *ctx,
> -		    struct drm_i915_gem_object *ctx_obj,
> -		    struct intel_engine_cs *engine,
> -		    struct intel_ring *ring)
> +static void execlists_init_reg_state(u32 *reg_state,
> +				     struct i915_gem_context *ctx,
> +				     struct intel_engine_cs *engine,
> +				     struct intel_ring *ring)
>  {
> -	struct drm_i915_private *dev_priv = ctx->i915;
> -	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
> -	void *vaddr;
> -	u32 *reg_state;
> -	int ret;
> -
> -	if (!ppgtt)
> -		ppgtt = dev_priv->mm.aliasing_ppgtt;
> -
> -	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
> -		return ret;
> -	}
> -
> -	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
> -	if (IS_ERR(vaddr)) {
> -		ret = PTR_ERR(vaddr);
> -		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
> -		return ret;
> -	}
> -	ctx_obj->dirty = true;
> -
> -	/* The second page of the context object contains some fields which must
> -	 * be set up prior to the first execution. */
> -	reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
>  
>  	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
>  	 * commands followed by (reg, value) pairs. The values we are setting here are
> @@ -1934,7 +1921,7 @@ populate_lr_context(struct i915_gem_context *ctx,
>  		       _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
>  					  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
>  					  (HAS_RESOURCE_STREAMER(dev_priv) ?
> -					    CTX_CTRL_RS_CTX_ENABLE : 0)));
> +					   CTX_CTRL_RS_CTX_ENABLE : 0)));
>  	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(engine->mmio_base),
>  		       0);
>  	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(engine->mmio_base),
> @@ -1946,7 +1933,7 @@ populate_lr_context(struct i915_gem_context *ctx,
>  		       RING_START(engine->mmio_base), 0);
>  	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL,
>  		       RING_CTL(engine->mmio_base),
> -		       ((ring->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
> +		       (ring->size - PAGE_SIZE) | RING_VALID);

Patch looks good.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

Not exactly problems with this patch, but as we are in
the territory:

I still would like the ring->size setting to be accompanied
with comment about it matching page shift. I have fallen
for it twice now so I suspect the next reader will too.

And for that matter, removal of misleading comment

/* It is written to the context image in execlists_update_context() */

in execlists_init_reg_state()

Thanks,
-Mika


>  	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U,
>  		       RING_BBADDR_UDW(engine->mmio_base), 0);
>  	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L,
> @@ -2024,6 +2011,36 @@ populate_lr_context(struct i915_gem_context *ctx,
>  		ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
>  			       make_rpcs(dev_priv));
>  	}
> +}
> +
> +static int
> +populate_lr_context(struct i915_gem_context *ctx,
> +		    struct drm_i915_gem_object *ctx_obj,
> +		    struct intel_engine_cs *engine,
> +		    struct intel_ring *ring)
> +{
> +	void *vaddr;
> +	int ret;
> +
> +	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
> +		return ret;
> +	}
> +
> +	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
> +	if (IS_ERR(vaddr)) {
> +		ret = PTR_ERR(vaddr);
> +		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
> +		return ret;
> +	}
> +	ctx_obj->dirty = true;
> +
> +	/* The second page of the context object contains some fields which must
> +	 * be set up prior to the first execution. */
> +
> +	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
> +				 ctx, engine, ring);
>  
>  	i915_gem_object_unpin_map(ctx_obj);
>  
> -- 
> 2.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: Disable irqs across GPU reset
  2016-09-30 11:25     ` [PATCH] drm/i915: Disable irqs across GPU reset Chris Wilson
@ 2016-10-03 12:42       ` Mika Kuoppala
  0 siblings, 0 replies; 10+ messages in thread
From: Mika Kuoppala @ 2016-10-03 12:42 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Whilst we reset the GPU, we want to prevent execlists from submitting
> new work (which it does via an interrupt handler). To achieve this we
> disable the irq (and drain the irq tasklet) around the reset. When we
> enable it again afterwards, the interrupt queue should be empty and we can
> reinitialise from a known state without fear of the tasklet running
> concurrently.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> ---
>  drivers/gpu/drm/i915/i915_drv.c | 19 +++++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem.c |  2 --
>  2 files changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 31b2b6300d8d..7ce498898217 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1728,6 +1728,21 @@ int i915_resume_switcheroo(struct drm_device *dev)
>  	return i915_drm_resume(dev);
>  }
>  
> +static void disable_engines_irq(struct drm_i915_private *dev_priv)
> +{
> +	struct intel_engine_cs *engine;
> +
> +	/* Ensure irq handler finishes, and not run again. */
> +	disable_irq(dev_priv->drm.irq);
> +	for_each_engine(engine, dev_priv)
> +		tasklet_kill(&engine->irq_tasklet);
> +}
> +
> +static void enable_engines_irq(struct drm_i915_private *dev_priv)
> +{
> +	enable_irq(dev_priv->drm.irq);
> +}
> +
>  /**
>   * i915_reset - reset chip after a hang
>   * @dev: drm device to reset
> @@ -1761,7 +1776,11 @@ void i915_reset(struct drm_i915_private *dev_priv)
>  	error->reset_count++;
>  
>  	pr_notice("drm/i915: Resetting chip after gpu hang\n");
> +
> +	disable_engines_irq(dev_priv);
>  	ret = intel_gpu_reset(dev_priv, ALL_ENGINES);
> +	enable_engines_irq(dev_priv);
> +
>  	if (ret) {
>  		if (ret != -ENODEV)
>  			DRM_ERROR("Failed to reset chip: %i\n", ret);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1418c1c522cb..0cae8acdf906 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2581,8 +2581,6 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
>  	struct i915_gem_context *incomplete_ctx;
>  	bool ring_hung;
>  
> -	/* Ensure irq handler finishes, and not run again. */
> -	tasklet_kill(&engine->irq_tasklet);
>  	if (engine->irq_seqno_barrier)
>  		engine->irq_seqno_barrier(engine);
>  
> -- 
> 2.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2016-10-03 12:42 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-30  7:50 [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Chris Wilson
2016-09-30  7:50 ` [PATCH 2/3] drm/i915: Call synchronize_irq() after resetting the GPU Chris Wilson
2016-09-30  8:26   ` Mika Kuoppala
2016-09-30  8:38     ` Chris Wilson
2016-09-30 11:25     ` [PATCH] drm/i915: Disable irqs across GPU reset Chris Wilson
2016-10-03 12:42       ` Mika Kuoppala
2016-09-30  7:50 ` [PATCH 3/3] drm/i915: Show RING registers through debugfs Chris Wilson
2016-09-30  8:20 ` ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang Patchwork
2016-09-30 13:18 ` ✗ Fi.CI.BAT: warning for series starting with [1/3] drm/i915/execlists: Reinitialise context image after GPU hang (rev2) Patchwork
2016-10-03 12:25 ` [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang Mika Kuoppala

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.