* [PATCH 1/6] drm/i915: Split up hangcheck phases
@ 2016-11-15 14:36 Mika Kuoppala
  2016-11-15 14:36 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
                   ` (6 more replies)
  0 siblings, 7 replies; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 14:36 UTC (permalink / raw)
  To: intel-gfx

In order to simplify hangcheck state keeping, split the
per-engine hangcheck loop into three phases: state load,
action, state save.

Add a few more hangcheck actions to distinguish between seqno,
head and subunit movement. This helps to gather all the
hangcheck actions under a single switch umbrella.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gpu_error.c   |   8 +-
 drivers/gpu/drm/i915/intel_hangcheck.c  | 241 ++++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +-
 3 files changed, 146 insertions(+), 107 deletions(-)
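
As a rough, standalone illustration (not part of the patch) of the
score bookkeeping this split merely reorganises: with the existing
values (+1 while busy on the same request, +5 per kick, +20 when hung,
hang declared at a score of 31) and a ~1500 ms hangcheck period, the
number of samples needed to reach the threshold differs widely per
action. A minimal userspace model, under those assumptions:

#include <stdio.h>

#define HANGCHECK_SCORE_RING_HUNG 31	/* threshold from the current code */

static int periods_to_hang(int increment)
{
	int score = 0, periods = 0;

	while (score < HANGCHECK_SCORE_RING_HUNG) {
		score += increment;
		periods++;
	}

	return periods;
}

int main(void)
{
	/* +1 per sample: HANGCHECK_ACTIVE_HEAD / _SUBUNITS */
	printf("busy: %d samples (~%d s)\n",
	       periods_to_hang(1), periods_to_hang(1) * 3 / 2);
	/* +5 per sample: HANGCHECK_KICK */
	printf("kick: %d samples (~%d s)\n",
	       periods_to_hang(5), periods_to_hang(5) * 3 / 2);
	/* +20 per sample: HANGCHECK_HUNG */
	printf("hung: %d samples (~%d s)\n",
	       periods_to_hang(20), periods_to_hang(20) * 3 / 2);

	return 0;
}

The asymmetry visible here is what the next patch in the series
addresses by switching to a time-based measurement.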

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d620bd..f02f581 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
 		return "idle";
 	case HANGCHECK_WAIT:
 		return "wait";
-	case HANGCHECK_ACTIVE:
-		return "active";
+	case HANGCHECK_ACTIVE_SEQNO:
+		return "active seqno";
+	case HANGCHECK_ACTIVE_HEAD:
+		return "active head";
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		return "active subunits";
 	case HANGCHECK_KICK:
 		return "kick";
 	case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 53df5b1..3d2e81c 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
 
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_HEAD;
 	}
 
 	if (!subunits_stuck(engine))
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_SUBUNITS;
 
 	return HANGCHECK_HUNG;
 }
@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 
+static void hangcheck_load_sample(struct intel_engine_cs *engine,
+				  struct intel_engine_hangcheck *hc)
+{
+	/* We don't strictly need an irq-barrier here, as we are not
+	 * serving an interrupt request, be paranoid in case the
+	 * barrier has side-effects (such as preventing a broken
+	 * cacheline snoop) and so be sure that we can see the seqno
+	 * advance. If the seqno should stick, due to a stale
+	 * cacheline, we would erroneously declare the GPU hung.
+	 */
+	if (engine->irq_seqno_barrier)
+		engine->irq_seqno_barrier(engine);
+
+	hc->acthd = intel_engine_get_active_head(engine);
+	hc->seqno = intel_engine_get_seqno(engine);
+	hc->score = engine->hangcheck.score;
+}
+
+static void hangcheck_store_sample(struct intel_engine_cs *engine,
+				   const struct intel_engine_hangcheck *hc)
+{
+	engine->hangcheck.acthd = hc->acthd;
+	engine->hangcheck.seqno = hc->seqno;
+	engine->hangcheck.score = hc->score;
+	engine->hangcheck.action = hc->action;
+}
+
+static enum intel_engine_hangcheck_action
+hangcheck_get_action(struct intel_engine_cs *engine,
+		     const struct intel_engine_hangcheck *hc)
+{
+	if (engine->hangcheck.seqno != hc->seqno)
+		return HANGCHECK_ACTIVE_SEQNO;
+
+	if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
+		return HANGCHECK_IDLE;
+
+	return engine_stuck(engine, hc->acthd);
+}
+
+static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
+					struct intel_engine_hangcheck *hc)
+{
+	hc->action = hangcheck_get_action(engine, hc);
+
+	switch (hc->action) {
+	case HANGCHECK_IDLE:
+	case HANGCHECK_WAIT:
+		break;
+
+	case HANGCHECK_ACTIVE_HEAD:
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		/* We always increment the hangcheck score
+		 * if the engine is busy and still processing
+		 * the same request, so that no single request
+		 * can run indefinitely (such as a chain of
+		 * batches). The only time we do not increment
+		 * the hangcheck score on this ring, if this
+		 * engine is in a legitimate wait for another
+		 * engine. In that case the waiting engine is a
+		 * victim and we want to be sure we catch the
+		 * right culprit. Then every time we do kick
+		 * the ring, add a small increment to the
+		 * score so that we can catch a batch that is
+		 * being repeatedly kicked and so responsible
+		 * for stalling the machine.
+		 */
+		hc->score += 1;
+		break;
+
+	case HANGCHECK_KICK:
+		hc->score += 5;
+		break;
+
+	case HANGCHECK_HUNG:
+		hc->score += 20;
+		break;
+
+	case HANGCHECK_ACTIVE_SEQNO:
+		/* Gradually reduce the count so that we catch DoS
+		 * attempts across multiple batches.
+		 */
+		if (hc->score > 0)
+			hc->score -= 15;
+		if (hc->score < 0)
+			hc->score = 0;
+
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
+
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
+		break;
+
+	default:
+		MISSING_CASE(hc->action);
+	}
+}
+
+static void hangcheck_declare_hang(struct drm_i915_private *i915,
+				   unsigned int hung,
+				   unsigned int stuck)
+{
+	struct intel_engine_cs *engine;
+	char msg[80];
+	unsigned int tmp;
+	int len;
+
+	/* If some rings hung but others were still busy, only
+	 * blame the hanging rings in the synopsis.
+	 */
+	if (stuck != hung)
+		hung &= ~stuck;
+	len = scnprintf(msg, sizeof(msg),
+			"%s on ", stuck == hung ? "No progress" : "Hang");
+	for_each_engine_masked(engine, i915, hung, tmp)
+		len += scnprintf(msg + len, sizeof(msg) - len,
+				 "%s, ", engine->name);
+	msg[len-2] = '\0';
+
+	return i915_handle_error(i915, hung, msg);
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	enum intel_engine_id id;
 	unsigned int hung = 0, stuck = 0;
 	int busy_count = 0;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define ACTIVE_DECAY 15
 
 	if (!i915.enable_hangcheck)
 		return;
@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 
 	for_each_engine(engine, dev_priv, id) {
-		bool busy = intel_engine_has_waiter(engine);
-		u64 acthd;
-		u32 seqno;
-		u32 submit;
+		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
+		const bool busy = intel_engine_has_waiter(engine);
 
 		semaphore_clear_deadlocks(dev_priv);
 
-		/* We don't strictly need an irq-barrier here, as we are not
-		 * serving an interrupt request, be paranoid in case the
-		 * barrier has side-effects (such as preventing a broken
-		 * cacheline snoop) and so be sure that we can see the seqno
-		 * advance. If the seqno should stick, due to a stale
-		 * cacheline, we would erroneously declare the GPU hung.
-		 */
-		if (engine->irq_seqno_barrier)
-			engine->irq_seqno_barrier(engine);
-
-		acthd = intel_engine_get_active_head(engine);
-		seqno = intel_engine_get_seqno(engine);
-		submit = intel_engine_last_submit(engine);
-
-		if (engine->hangcheck.seqno == seqno) {
-			if (i915_seqno_passed(seqno, submit)) {
-				engine->hangcheck.action = HANGCHECK_IDLE;
-			} else {
-				/* We always increment the hangcheck score
-				 * if the engine is busy and still processing
-				 * the same request, so that no single request
-				 * can run indefinitely (such as a chain of
-				 * batches). The only time we do not increment
-				 * the hangcheck score on this ring, if this
-				 * engine is in a legitimate wait for another
-				 * engine. In that case the waiting engine is a
-				 * victim and we want to be sure we catch the
-				 * right culprit. Then every time we do kick
-				 * the ring, add a small increment to the
-				 * score so that we can catch a batch that is
-				 * being repeatedly kicked and so responsible
-				 * for stalling the machine.
-				 */
-				engine->hangcheck.action =
-					engine_stuck(engine, acthd);
-
-				switch (engine->hangcheck.action) {
-				case HANGCHECK_IDLE:
-				case HANGCHECK_WAIT:
-					break;
-				case HANGCHECK_ACTIVE:
-					engine->hangcheck.score += BUSY;
-					break;
-				case HANGCHECK_KICK:
-					engine->hangcheck.score += KICK;
-					break;
-				case HANGCHECK_HUNG:
-					engine->hangcheck.score += HUNG;
-					break;
-				}
-			}
-
-			if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-				hung |= intel_engine_flag(engine);
-				if (engine->hangcheck.action != HANGCHECK_HUNG)
-					stuck |= intel_engine_flag(engine);
-			}
-		} else {
-			engine->hangcheck.action = HANGCHECK_ACTIVE;
-
-			/* Gradually reduce the count so that we catch DoS
-			 * attempts across multiple batches.
-			 */
-			if (engine->hangcheck.score > 0)
-				engine->hangcheck.score -= ACTIVE_DECAY;
-			if (engine->hangcheck.score < 0)
-				engine->hangcheck.score = 0;
-
-			/* Clear head and subunit states on seqno movement */
-			acthd = 0;
-
-			memset(&engine->hangcheck.instdone, 0,
-			       sizeof(engine->hangcheck.instdone));
+		hangcheck_load_sample(engine, hc);
+		hangcheck_accumulate_sample(engine, hc);
+		hangcheck_store_sample(engine, hc);
+
+		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+			hung |= intel_engine_flag(engine);
+			if (hc->action != HANGCHECK_HUNG)
+				stuck |= intel_engine_flag(engine);
 		}
 
-		engine->hangcheck.seqno = seqno;
-		engine->hangcheck.acthd = acthd;
 		busy_count += busy;
 	}
 
-	if (hung) {
-		char msg[80];
-		unsigned int tmp;
-		int len;
-
-		/* If some rings hung but others were still busy, only
-		 * blame the hanging rings in the synopsis.
-		 */
-		if (stuck != hung)
-			hung &= ~stuck;
-		len = scnprintf(msg, sizeof(msg),
-				"%s on ", stuck == hung ? "No progress" : "Hang");
-		for_each_engine_masked(engine, dev_priv, hung, tmp)
-			len += scnprintf(msg + len, sizeof(msg) - len,
-					 "%s, ", engine->name);
-		msg[len-2] = '\0';
-
-		return i915_handle_error(dev_priv, hung, msg);
-	}
+	if (hung)
+		hangcheck_declare_hang(dev_priv, hung, stuck);
 
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3466b4e..3152b2b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -67,7 +67,9 @@ struct intel_hw_status_page {
 enum intel_engine_hangcheck_action {
 	HANGCHECK_IDLE = 0,
 	HANGCHECK_WAIT,
-	HANGCHECK_ACTIVE,
+	HANGCHECK_ACTIVE_SEQNO,
+	HANGCHECK_ACTIVE_HEAD,
+	HANGCHECK_ACTIVE_SUBUNITS,
 	HANGCHECK_KICK,
 	HANGCHECK_HUNG,
 };
-- 
2.7.4

* [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period
  2016-11-15 14:36 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
@ 2016-11-15 14:36 ` Mika Kuoppala
  2016-11-15 16:38   ` Chris Wilson
  2016-11-16  8:07   ` Chris Wilson
  2016-11-15 14:36 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 14:36 UTC (permalink / raw)
  To: intel-gfx

Hangcheck state accumulation has gained more steps over the
years, such as head movement and, more recently, the subunit
inactivity check. As the subunit sampling is only done if the
previous state check showed inactivity, we have added more
stages (and time) to reach a hang verdict.

The asymmetric engine states led to a different actual weight
for 'one hangcheck unit', and some hangs demonstrated that,
due to the difference in stages, simpler engines were falsely
accused of a hang because their score accumulated past the
hang threshold much more quickly.

To completely decouple the hangcheck guilty score from the
hangcheck period, convert the hangcheck score into a rough
measurement of the period of inactivity. As these are tracked
in jiffies, they are also meaningful across reset boundaries.
This makes finding a guilty engine more accurate across
multi-engine activity scenarios, especially across asymmetric
engines.

We lose the ability to detect malicious cross-batch attempts
to hinder progress. The plan is to move this functionality
into context banning, which is a more natural fit, later in
the series.

v2: use time_before macros (Chris)
    reinstate the pardoning of moving engine after hc (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c      |  17 ++--
 drivers/gpu/drm/i915/i915_drv.h          |  20 +++--
 drivers/gpu/drm/i915/i915_gem.c          |  11 ++-
 drivers/gpu/drm/i915/i915_gem_context.c  |   2 +-
 drivers/gpu/drm/i915/i915_gpu_error.c    |  39 ++-------
 drivers/gpu/drm/i915/intel_breadcrumbs.c |   2 +-
 drivers/gpu/drm/i915/intel_hangcheck.c   | 134 +++++++++++++++++++++----------
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  26 +++++-
 8 files changed, 159 insertions(+), 92 deletions(-)
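
A rough, standalone model (seconds in place of jiffies; not the kernel
code) of the stall decision the new thresholds produce: an engine whose
head and subunits have also stopped progressing (HANGCHECK_HUNG) is
flagged after DRM_I915_HUNG_PERIOD_SEC (4 s) without seqno progress,
while an engine that still shows head or subunit movement needs
DRM_I915_STUCK_PERIOD_SEC (24 s):

#include <stdbool.h>
#include <stdio.h>

enum action { IDLE, WAIT, ACTIVE_SEQNO, ACTIVE_HEAD, ACTIVE_SUBUNITS, KICK, HUNG };

#define HUNG_PERIOD_SEC		4	/* no seqno nor head progress */
#define STUCK_PERIOD_SEC	24	/* no seqno progress */

/* Mirrors the hangcheck_engine_stall() logic; 'last_progress' stands in
 * for the engine's action_timestamp, refreshed on seqno movement. */
static bool engine_stall(enum action act, unsigned long now,
			 unsigned long last_progress)
{
	if (act == ACTIVE_SEQNO || act == IDLE)
		return false;
	if (now < last_progress + HUNG_PERIOD_SEC)
		return false;
	if (now < last_progress + STUCK_PERIOD_SEC && act != HUNG)
		return false;
	return true;
}

int main(void)
{
	/* Fully hung engine: declared after ~4 seconds. */
	printf("hung @5s:  %d\n", engine_stall(HUNG, 5, 0));		/* 1 */
	/* Head still moving: needs ~24 seconds without seqno progress. */
	printf("head @5s:  %d\n", engine_stall(ACTIVE_HEAD, 5, 0));	/* 0 */
	printf("head @25s: %d\n", engine_stall(ACTIVE_HEAD, 25, 0));	/* 1 */

	return 0;
}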

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1cc971c..cf0ca0f 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1351,10 +1351,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
 			   engine->hangcheck.seqno, seqno[id],
 			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
+		seq_printf(m, "\twaiters? %s, fake irq active? %s, guilty? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)));
+					  &dev_priv->gpu_error.missed_irq_rings)),
+			   yesno(engine->hangcheck.guilty));
+
 		spin_lock_irq(&b->lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
 			struct intel_wait *w = container_of(rb, typeof(*w), node);
@@ -1367,8 +1369,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
-		seq_printf(m, "\tscore = %d\n", engine->hangcheck.score);
-		seq_printf(m, "\taction = %d\n", engine->hangcheck.action);
+		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
+			   hangcheck_action_to_str(engine->hangcheck.action),
+			   engine->hangcheck.action,
+			   jiffies_to_msecs(jiffies -
+					    engine->hangcheck.action_timestamp));
 
 		if (engine->id == RCS) {
 			seq_puts(m, "\tinstdone read =\n");
@@ -3163,11 +3168,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 		u64 addr;
 
 		seq_printf(m, "%s\n", engine->name);
-		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [score %d]\n",
+		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
 			   intel_engine_get_seqno(engine),
 			   intel_engine_last_submit(engine),
 			   engine->hangcheck.seqno,
-			   engine->hangcheck.score);
+			   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
 
 		rcu_read_lock();
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 006914c..3caa55d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -782,7 +782,8 @@ struct drm_i915_error_state {
 		/* Software tracked state */
 		bool waiting;
 		int num_waiters;
-		int hangcheck_score;
+		unsigned long hangcheck_timestamp;
+		bool hangcheck_guilty;
 		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
@@ -1429,11 +1430,16 @@ struct i915_error_state_file_priv {
 #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
 
 struct i915_gpu_error {
-	/* For hangcheck timer */
-#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
-#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
-	/* Hang gpu twice in this window and your context gets banned */
-#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
+
+#define DRM_I915_STUCK_PERIOD_SEC 24 /* No observed seqno progress */
+#define DRM_I915_HUNG_PERIOD_SEC 4 /* No observed seqno nor head progress */
+
+/* Hang gpu twice in this window and your context gets banned */
+#define DRM_I915_CTX_BAN_PERIOD_SEC 12
+
+#define HANGCHECK_STUCK_JIFFIES (DRM_I915_STUCK_PERIOD_SEC * HZ)
+#define HANGCHECK_HUNG_JIFFIES (DRM_I915_HUNG_PERIOD_SEC * HZ)
+#define HANGCHECK_PERIOD_JIFFIES msecs_to_jiffies(1500)
 
 	struct delayed_work hangcheck_work;
 
@@ -2723,7 +2729,7 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
 	 * we will ignore a hung ring if a second ring is kept busy.
 	 */
 
-	delay = round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES);
+	delay = round_jiffies_up_relative(HANGCHECK_PERIOD_JIFFIES);
 	queue_delayed_work(system_long_wq,
 			   &dev_priv->gpu_error.hangcheck_work, delay);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3fb5e66..708e289 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2703,9 +2703,16 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	if (!request)
 		return;
 
-	ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
-	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine))
+	ring_hung = engine->hangcheck.guilty;
+	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) {
+		if (ring_hung)
+			DRM_ERROR("%s pardoned due to progress after hangcheck %x vs %x\n",
+				  engine->name,
+				  engine->hangcheck.seqno,
+				  intel_engine_get_seqno(engine));
+
 		ring_hung = false;
+	}
 
 	i915_set_reset_status(request->ctx, ring_hung);
 	if (!ring_hung)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 1f94b8d..958a526 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev,
 	 * is no remap info, it will be a NOP. */
 	ctx->remap_slice = ALL_L3_SLICES(dev_priv);
 
-	ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD;
+	ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD_SEC;
 	ctx->ring_size = 4 * PAGE_SIZE;
 	ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) <<
 			     GEN8_CTX_ADDRESSING_MODE_SHIFT;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index f02f581..8d0f2bc 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -316,28 +316,6 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
-{
-	switch (a) {
-	case HANGCHECK_IDLE:
-		return "idle";
-	case HANGCHECK_WAIT:
-		return "wait";
-	case HANGCHECK_ACTIVE_SEQNO:
-		return "active seqno";
-	case HANGCHECK_ACTIVE_HEAD:
-		return "active head";
-	case HANGCHECK_ACTIVE_SUBUNITS:
-		return "active subunits";
-	case HANGCHECK_KICK:
-		return "kick";
-	case HANGCHECK_HUNG:
-		return "hung";
-	}
-
-	return "unknown";
-}
-
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
 				 struct drm_i915_error_engine *ee)
 {
@@ -445,9 +423,10 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck: %s [%d]\n",
+	err_printf(m, "  hangcheck: %s %s [%lu]\n",
+		   yesno(ee->hangcheck_guilty),
 		   hangcheck_action_to_str(ee->hangcheck_action),
-		   ee->hangcheck_score);
+		   ee->hangcheck_timestamp);
 	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
 	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
 }
@@ -537,7 +516,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	struct pci_dev *pdev = dev_priv->drm.pdev;
 	struct drm_i915_error_state *error = error_priv->error;
 	struct drm_i915_error_object *obj;
-	int max_hangcheck_score;
 	int i, j;
 
 	if (!error) {
@@ -554,13 +532,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Uptime: %ld s %ld us\n",
 		   error->uptime.tv_sec, error->uptime.tv_usec);
 	err_print_capabilities(m, &error->device_info);
-	max_hangcheck_score = 0;
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score > max_hangcheck_score)
-			max_hangcheck_score = error->engine[i].hangcheck_score;
-	}
+
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score == max_hangcheck_score &&
+		if (error->engine[i].hangcheck_guilty &&
 		    error->engine[i].pid != -1) {
 			err_printf(m, "Active process (on ring %s): %s [%d]\n",
 				   engine_str(i),
@@ -1164,8 +1138,9 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
 		ee->hws = I915_READ(mmio);
 	}
 
-	ee->hangcheck_score = engine->hangcheck.score;
+	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->hangcheck_action = engine->hangcheck.action;
+	ee->hangcheck_guilty = engine->hangcheck.guilty;
 
 	if (USES_PPGTT(dev_priv)) {
 		int i;
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index c9c46a5..ebb3b8e 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -57,7 +57,7 @@ static void intel_breadcrumbs_hangcheck(unsigned long data)
 
 static unsigned long wait_timeout(void)
 {
-	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
+	return round_jiffies_up(jiffies + HANGCHECK_PERIOD_JIFFIES);
 }
 
 static void intel_breadcrumbs_fake_irq(unsigned long data)
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 3d2e81c..7bc8eaa 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -306,7 +306,6 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
 
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_seqno(engine);
-	hc->score = engine->hangcheck.score;
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
@@ -314,7 +313,6 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
 {
 	engine->hangcheck.acthd = hc->acthd;
 	engine->hangcheck.seqno = hc->seqno;
-	engine->hangcheck.score = hc->score;
 	engine->hangcheck.action = hc->action;
 }
 
@@ -336,58 +334,109 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 {
 	hc->action = hangcheck_get_action(engine, hc);
 
+	/* We always increment the progress
+	 * if the engine is busy and still processing
+	 * the same request, so that no single request
+	 * can run indefinitely (such as a chain of
+	 * batches). The only time we do not increment
+	 * the hangcheck score on this ring, if this
+	 * engine is in a legitimate wait for another
+	 * engine. In that case the waiting engine is a
+	 * victim and we want to be sure we catch the
+	 * right culprit. Then every time we do kick
+	 * the ring, make it as a progress as the seqno
+	 * advancement might ensure and if not, it
+	 * will catch the hanging engine.
+	 */
+
 	switch (hc->action) {
 	case HANGCHECK_IDLE:
-	case HANGCHECK_WAIT:
+	case HANGCHECK_ACTIVE_SEQNO:
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
+
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
+
+		engine->hangcheck.action_timestamp = jiffies;
 		break;
 
 	case HANGCHECK_ACTIVE_HEAD:
 	case HANGCHECK_ACTIVE_SUBUNITS:
-		/* We always increment the hangcheck score
-		 * if the engine is busy and still processing
-		 * the same request, so that no single request
-		 * can run indefinitely (such as a chain of
-		 * batches). The only time we do not increment
-		 * the hangcheck score on this ring, if this
-		 * engine is in a legitimate wait for another
-		 * engine. In that case the waiting engine is a
-		 * victim and we want to be sure we catch the
-		 * right culprit. Then every time we do kick
-		 * the ring, add a small increment to the
-		 * score so that we can catch a batch that is
-		 * being repeatedly kicked and so responsible
-		 * for stalling the machine.
-		 */
-		hc->score += 1;
-		break;
-
 	case HANGCHECK_KICK:
-		hc->score += 5;
-		break;
 
+	case HANGCHECK_WAIT:
 	case HANGCHECK_HUNG:
-		hc->score += 20;
 		break;
 
-	case HANGCHECK_ACTIVE_SEQNO:
-		/* Gradually reduce the count so that we catch DoS
-		 * attempts across multiple batches.
-		 */
-		if (hc->score > 0)
-			hc->score -= 15;
-		if (hc->score < 0)
-			hc->score = 0;
+	default:
+		MISSING_CASE(hc->action);
+	}
+}
 
-		/* Clear head and subunit states on seqno movement */
-		hc->acthd = 0;
+static bool
+hangcheck_engine_stall(struct intel_engine_cs *engine,
+		       struct intel_engine_hangcheck *hc)
+{
+	const unsigned long last_action = engine->hangcheck.action_timestamp;
 
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
-		break;
+	if (hc->action == HANGCHECK_ACTIVE_SEQNO ||
+	    hc->action == HANGCHECK_IDLE)
+		return false;
+
+	if (time_before(jiffies, last_action + HANGCHECK_HUNG_JIFFIES))
+		return false;
+
+	if (time_before(jiffies, last_action + HANGCHECK_STUCK_JIFFIES))
+		if (hc->action != HANGCHECK_HUNG)
+			return false;
+
+	return true;
+}
+
+static struct intel_engine_cs *find_lra_engine(struct drm_i915_private *i915,
+					       const unsigned int mask)
+{
+	struct intel_engine_cs *engine = NULL, *c;
+	enum intel_engine_id id;
+
+	for_each_engine_masked(c, i915, mask, id) {
+		if (engine == NULL) {
+			engine = c;
+			continue;
+		}
+
+		if (time_before(c->hangcheck.action_timestamp,
+				engine->hangcheck.action_timestamp))
+			engine = c;
+		else if (c->hangcheck.action_timestamp ==
+			 engine->hangcheck.action_timestamp &&
+			 c->hangcheck.seqno < engine->hangcheck.seqno)
+			engine = c;
 
-	default:
-		MISSING_CASE(hc->action);
 	}
+
+	return engine;
+}
+
+static struct intel_engine_cs *find_guilty_engine(struct drm_i915_private *i915,
+						  const unsigned int hung_mask,
+						  const unsigned int stuck_mask)
+{
+	struct intel_engine_cs *engine;
+
+	engine = find_lra_engine(i915, hung_mask);
+	if (engine)
+		return engine;
+
+	engine = find_lra_engine(i915, stuck_mask);
+	if (engine)
+		return engine;
+
+	DRM_DEBUG_DRIVER("No engine found for hang (0x%x,0x%x)\n",
+			 hung_mask, stuck_mask);
+	/* Should not get here. But as a safety valve, blame someone */
+	return find_lra_engine(i915, ~0);
 }
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
@@ -454,7 +503,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		hangcheck_accumulate_sample(engine, hc);
 		hangcheck_store_sample(engine, hc);
 
-		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+		if (hangcheck_engine_stall(engine, hc)) {
 			hung |= intel_engine_flag(engine);
 			if (hc->action != HANGCHECK_HUNG)
 				stuck |= intel_engine_flag(engine);
@@ -463,8 +512,11 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		busy_count += busy;
 	}
 
-	if (hung)
+	if (hung) {
+		engine = find_guilty_engine(dev_priv, hung, stuck);
+		engine->hangcheck.guilty = true;
 		hangcheck_declare_hang(dev_priv, hung, stuck);
+	}
 
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3152b2b..92852e5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -74,7 +74,28 @@ enum intel_engine_hangcheck_action {
 	HANGCHECK_HUNG,
 };
 
-#define HANGCHECK_SCORE_RING_HUNG 31
+static inline const char *
+hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
+{
+	switch (a) {
+	case HANGCHECK_IDLE:
+		return "idle";
+	case HANGCHECK_WAIT:
+		return "wait";
+	case HANGCHECK_ACTIVE_SEQNO:
+		return "active seqno";
+	case HANGCHECK_ACTIVE_HEAD:
+		return "active head";
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		return "active subunits";
+	case HANGCHECK_KICK:
+		return "kick";
+	case HANGCHECK_HUNG:
+		return "hung";
+	}
+
+	return "unknown";
+}
 
 #define I915_MAX_SLICES	3
 #define I915_MAX_SUBSLICES 3
@@ -106,10 +127,11 @@ struct intel_instdone {
 struct intel_engine_hangcheck {
 	u64 acthd;
 	u32 seqno;
-	int score;
 	enum intel_engine_hangcheck_action action;
+	unsigned long action_timestamp;
 	int deadlock;
 	struct intel_instdone instdone;
+	bool guilty;
 };
 
 struct intel_ring {
-- 
2.7.4

* [PATCH 3/6] drm/i915: Use request retirement as context progress
  2016-11-15 14:36 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
  2016-11-15 14:36 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
@ 2016-11-15 14:36 ` Mika Kuoppala
  2016-11-15 16:34   ` Chris Wilson
  2016-11-15 14:36 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 14:36 UTC (permalink / raw)
  To: intel-gfx

As the hangcheck score was removed, the active decay of the
score was removed as well. This took away hangcheck's ability
to detect whether a GPU client was accidentally or maliciously
causing intermittent hangs. Reinstate the scoring as a
per-context property, so that if one context starts to act
unfavourably, it gets banned.

v2: ban_period_secs as a gate to score check (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  3 +++
 drivers/gpu/drm/i915/i915_gem.c         | 44 ++++++++++++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_request.c |  4 +++
 3 files changed, 37 insertions(+), 14 deletions(-)
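
A rough, standalone sketch (not the kernel code) of the arithmetic
introduced here: a guilty hang adds 10 to the context's ban score,
every retired request takes 1 off, and a score of 40 or more bans the
context. The 12 s repeat-hang window still present in this patch is
left out of the model for brevity:

#include <stdbool.h>
#include <stdio.h>

struct ctx {
	int ban_score;
	bool banned;
};

static void mark_guilty(struct ctx *c)
{
	c->ban_score += 10;
	if (c->ban_score >= 40)
		c->banned = true;
}

static void retire_request(struct ctx *c)
{
	/* Retirement decays the ban score as a sign of ctx progress */
	if (c->ban_score > 0)
		c->ban_score--;
}

int main(void)
{
	struct ctx a = { 0 }, b = { 0 };
	int i, j;

	/* Context A hangs four times without making progress: banned. */
	for (i = 0; i < 4; i++)
		mark_guilty(&a);

	/* Context B hangs occasionally but retires plenty of work. */
	for (i = 0; i < 4; i++) {
		mark_guilty(&b);
		for (j = 0; j < 10; j++)
			retire_request(&b);
	}

	printf("A: score %d, banned %d\n", a.ban_score, a.banned); /* 40, 1 */
	printf("B: score %d, banned %d\n", b.ban_score, b.banned); /* 0, 0 */

	return 0;
}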

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3caa55d..6bc9d0b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -901,6 +901,9 @@ struct i915_ctx_hang_stats {
 
 	/* This context is banned to submit more work */
 	bool banned;
+
+	/* Accumulated score of hangs caused by this context */
+	int ban_score;
 };
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 708e289..42a0f96 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2620,33 +2620,45 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 
 static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 {
+	const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
 	unsigned long elapsed;
 
-	if (ctx->hang_stats.banned)
+	if (hs->banned)
 		return true;
 
-	elapsed = get_seconds() - ctx->hang_stats.guilty_ts;
-	if (ctx->hang_stats.ban_period_seconds &&
-	    elapsed <= ctx->hang_stats.ban_period_seconds) {
+	if (!hs->ban_period_seconds)
+		return false;
+
+	elapsed = get_seconds() - hs->guilty_ts;
+	if (elapsed <= hs->ban_period_seconds) {
 		DRM_DEBUG("context hanging too fast, banning!\n");
 		return true;
 	}
 
+	if (hs->ban_score >= 40) {
+		DRM_DEBUG("context hanging too often, banning!\n");
+		return true;
+	}
+
 	return false;
 }
 
-static void i915_set_reset_status(struct i915_gem_context *ctx,
-				  const bool guilty)
+static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 {
 	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
 
-	if (guilty) {
-		hs->banned = i915_context_is_banned(ctx);
-		hs->batch_active++;
-		hs->guilty_ts = get_seconds();
-	} else {
-		hs->batch_pending++;
-	}
+	hs->ban_score += 10;
+
+	hs->banned = i915_context_is_banned(ctx);
+	hs->batch_active++;
+	hs->guilty_ts = get_seconds();
+}
+
+static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
+{
+	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
+
+	hs->batch_pending++;
 }
 
 struct drm_i915_gem_request *
@@ -2714,7 +2726,11 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 		ring_hung = false;
 	}
 
-	i915_set_reset_status(request->ctx, ring_hung);
+	if (ring_hung)
+		i915_gem_context_mark_guilty(request->ctx);
+	else
+		i915_gem_context_mark_innocent(request->ctx);
+
 	if (!ring_hung)
 		return;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index b9b5253..095c809 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -204,6 +204,10 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 
 	trace_i915_gem_request_retire(request);
 
+	/* Retirement decays the ban score as it is a sign of ctx progress */
+	if (request->ctx->hang_stats.ban_score > 0)
+		request->ctx->hang_stats.ban_score--;
+
 	spin_lock_irq(&request->engine->timeline->lock);
 	list_del_init(&request->link);
 	spin_unlock_irq(&request->engine->timeline->lock);
-- 
2.7.4

* [PATCH 4/6] drm/i915: Add bannable context parameter
  2016-11-15 14:36 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
  2016-11-15 14:36 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
  2016-11-15 14:36 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala
@ 2016-11-15 14:36 ` Mika Kuoppala
  2016-11-15 16:31   ` Chris Wilson
  2016-11-15 14:36 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 14:36 UTC (permalink / raw)
  To: intel-gfx

Now that the driver has per-context scoring of 'hanging
badness', and subsequent hangs within short windows are
allowed as long as progress is made in between, it no longer
makes sense to expose a ban timing window as a context
parameter.

Let the scoring be the sole input to the ban policy, and
replace the ban period context parameter with a boolean to
get/set the context's bannable property.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         | 14 +++-----------
 drivers/gpu/drm/i915/i915_gem.c         | 10 +---------
 drivers/gpu/drm/i915/i915_gem_context.c | 23 ++++++++++++++---------
 drivers/gpu/drm/i915/i915_gpu_error.c   |  5 +++--
 include/uapi/drm/i915_drm.h             |  1 +
 5 files changed, 22 insertions(+), 31 deletions(-)
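
A hypothetical userspace sketch of driving the new parameter through
the existing context setparam ioctl. Assumptions: a DRM fd and a
context id are already at hand and the updated i915_drm.h is in the
include path; with this version of the patch the ioctl fails with
EPERM unless the caller has CAP_SYS_ADMIN:

#include <stdbool.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>	/* header path may vary per distro/SDK */

static int context_set_bannable(int drm_fd, uint32_t ctx_id, bool bannable)
{
	struct drm_i915_gem_context_param p = {
		.ctx_id = ctx_id,
		.size = 0,	/* must be 0 or the kernel returns -EINVAL */
		.param = I915_CONTEXT_PARAM_BANNABLE,
		.value = bannable,
	};

	return ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
}

Reading the value back would go through
DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM with the same param value.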

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6bc9d0b..5af1c38 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -832,6 +832,7 @@ struct drm_i915_error_state {
 			long jiffies;
 			pid_t pid;
 			u32 context;
+			int ban_score;
 			u32 seqno;
 			u32 head;
 			u32 tail;
@@ -891,16 +892,10 @@ struct i915_ctx_hang_stats {
 	/* This context had batch active when hang was declared */
 	unsigned batch_active;
 
-	/* Time when this context was last blamed for a GPU reset */
-	unsigned long guilty_ts;
-
-	/* If the contexts causes a second GPU hang within this time,
-	 * it is permanently banned from submitting any more work.
-	 */
-	unsigned long ban_period_seconds;
+	bool bannable:1;
 
 	/* This context is banned to submit more work */
-	bool banned;
+	bool banned:1;
 
 	/* Accumulated score of hangs caused by this context */
 	int ban_score;
@@ -1437,9 +1432,6 @@ struct i915_gpu_error {
 #define DRM_I915_STUCK_PERIOD_SEC 24 /* No observed seqno progress */
 #define DRM_I915_HUNG_PERIOD_SEC 4 /* No observed seqno nor head progress */
 
-/* Hang gpu twice in this window and your context gets banned */
-#define DRM_I915_CTX_BAN_PERIOD_SEC 12
-
 #define HANGCHECK_STUCK_JIFFIES (DRM_I915_STUCK_PERIOD_SEC * HZ)
 #define HANGCHECK_HUNG_JIFFIES (DRM_I915_HUNG_PERIOD_SEC * HZ)
 #define HANGCHECK_PERIOD_JIFFIES msecs_to_jiffies(1500)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 42a0f96..40a9e10 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2621,20 +2621,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 {
 	const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
-	unsigned long elapsed;
 
 	if (hs->banned)
 		return true;
 
-	if (!hs->ban_period_seconds)
+	if (!hs->bannable)
 		return false;
 
-	elapsed = get_seconds() - hs->guilty_ts;
-	if (elapsed <= hs->ban_period_seconds) {
-		DRM_DEBUG("context hanging too fast, banning!\n");
-		return true;
-	}
-
 	if (hs->ban_score >= 40) {
 		DRM_DEBUG("context hanging too often, banning!\n");
 		return true;
@@ -2651,7 +2644,6 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 
 	hs->banned = i915_context_is_banned(ctx);
 	hs->batch_active++;
-	hs->guilty_ts = get_seconds();
 }
 
 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 958a526..9abaae4 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev,
 	 * is no remap info, it will be a NOP. */
 	ctx->remap_slice = ALL_L3_SLICES(dev_priv);
 
-	ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD_SEC;
+	ctx->hang_stats.bannable = true;
 	ctx->ring_size = 4 * PAGE_SIZE;
 	ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) <<
 			     GEN8_CTX_ADDRESSING_MODE_SHIFT;
@@ -1085,7 +1085,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	args->size = 0;
 	switch (args->param) {
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
-		args->value = ctx->hang_stats.ban_period_seconds;
+		ret = -EINVAL;
 		break;
 	case I915_CONTEXT_PARAM_NO_ZEROMAP:
 		args->value = ctx->flags & CONTEXT_NO_ZEROMAP;
@@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
 		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
 		break;
+	case I915_CONTEXT_PARAM_BANNABLE:
+		args->value = ctx->hang_stats.bannable;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1130,13 +1133,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 
 	switch (args->param) {
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
-		if (args->size)
-			ret = -EINVAL;
-		else if (args->value < ctx->hang_stats.ban_period_seconds &&
-			 !capable(CAP_SYS_ADMIN))
-			ret = -EPERM;
-		else
-			ctx->hang_stats.ban_period_seconds = args->value;
+		ret = -EINVAL;
 		break;
 	case I915_CONTEXT_PARAM_NO_ZEROMAP:
 		if (args->size) {
@@ -1156,6 +1153,14 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
 		}
 		break;
+	case I915_CONTEXT_PARAM_BANNABLE:
+		if (args->size)
+			ret = -EINVAL;
+		else if (!capable(CAP_SYS_ADMIN))
+			ret = -EPERM;
+		else
+			ctx->hang_stats.bannable = args->value;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 8d0f2bc..5d2e233 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -352,8 +352,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
 	if (!erq->seqno)
 		return;
 
-	err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
-		   prefix, erq->pid,
+	err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
+		   prefix, erq->pid, erq->ban_score,
 		   erq->context, erq->seqno,
 		   jiffies_to_msecs(jiffies - erq->jiffies),
 		   erq->head, erq->tail);
@@ -1168,6 +1168,7 @@ static void record_request(struct drm_i915_gem_request *request,
 			   struct drm_i915_error_request *erq)
 {
 	erq->context = request->ctx->hw_id;
+	erq->ban_score = request->ctx->hang_stats.ban_score;
 	erq->seqno = request->global_seqno;
 	erq->jiffies = request->emitted_jiffies;
 	erq->head = request->head;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1c12a35..12003f0 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1224,6 +1224,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
+#define I915_CONTEXT_PARAM_BANNABLE	0x5
 	__u64 value;
 };
 
-- 
2.7.4

* [PATCH 5/6] drm/i915: Add per client max context ban limit
  2016-11-15 14:36 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
                   ` (2 preceding siblings ...)
  2016-11-15 14:36 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala
@ 2016-11-15 14:36 ` Mika Kuoppala
  2016-11-15 16:27   ` Chris Wilson
  2016-11-15 14:36 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 14:36 UTC (permalink / raw)
  To: intel-gfx

If we have a bad client submitting unfavourably across
different contexts, creating new ones as it goes, the
per-context scoring of badness doesn't remove the root cause,
the offending client. To counter this, keep track of context
bans per client, and deny access if a client is responsible
for more than 3 context bans in its lifetime.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  4 ++++
 drivers/gpu/drm/i915/i915_gem.c            | 28 +++++++++++++++++++++-------
 drivers/gpu/drm/i915/i915_gem_context.c    | 16 ++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 ++++++--
 drivers/gpu/drm/i915/i915_gpu_error.c      | 10 ++++++----
 5 files changed, 53 insertions(+), 13 deletions(-)
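
A rough, standalone model (not the kernel code) of the client-level
gate: once more than I915_MAX_CLIENT_CONTEXT_BANS (3) of a client's
contexts have been banned, further context creation is refused with
-EIO:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define I915_MAX_CLIENT_CONTEXT_BANS 3

struct client {
	int context_bans;
};

static bool client_is_banned(const struct client *c)
{
	return c->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
}

static int create_context(struct client *c)
{
	if (client_is_banned(c))
		return -EIO;

	return 0; /* a new context id would be handed out here */
}

int main(void)
{
	struct client c = { .context_bans = 0 };
	int i;

	for (i = 0; i < 5; i++) {
		printf("bans=%d -> create_context()=%d\n",
		       c.context_bans, create_context(&c));
		c.context_bans++; /* pretend each new context ends up banned */
	}

	return 0;
}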

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5af1c38..92c5284 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -411,6 +411,9 @@ struct drm_i915_file_private {
 	} rps;
 
 	unsigned int bsd_engine;
+
+#define I915_MAX_CLIENT_CONTEXT_BANS 3
+	int context_bans;
 };
 
 /* Used by dp and fdi links */
@@ -854,6 +857,7 @@ struct drm_i915_error_state {
 
 		pid_t pid;
 		char comm[TASK_COMM_LEN];
+		int context_bans;
 	} engine[I915_NUM_ENGINES];
 
 	struct drm_i915_error_buffer {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 40a9e10..f56230f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2636,19 +2636,33 @@ static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 	return false;
 }
 
-static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
+static void i915_gem_request_mark_guilty(struct drm_i915_gem_request *request)
 {
-	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
+	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
 
 	hs->ban_score += 10;
 
-	hs->banned = i915_context_is_banned(ctx);
+	hs->banned = i915_context_is_banned(request->ctx);
 	hs->batch_active++;
+
+	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
+			 request->ctx->name, hs->ban_score, yesno(hs->banned));
+
+	if (!request->file_priv)
+		return;
+
+	if (hs->banned) {
+		request->file_priv->context_bans++;
+
+		DRM_DEBUG_DRIVER("client %s has has %d context banned\n",
+				 request->ctx->name,
+				 request->file_priv->context_bans);
+	}
 }
 
-static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
+static void i915_gem_request_mark_innocent(struct drm_i915_gem_request *request)
 {
-	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
+	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
 
 	hs->batch_pending++;
 }
@@ -2719,9 +2733,9 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	}
 
 	if (ring_hung)
-		i915_gem_context_mark_guilty(request->ctx);
+		i915_gem_request_mark_guilty(request);
 	else
-		i915_gem_context_mark_innocent(request->ctx);
+		i915_gem_request_mark_innocent(request);
 
 	if (!ring_hung)
 		return;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 9abaae4..4ddd044 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -347,6 +347,14 @@ __create_hw_context(struct drm_device *dev,
 	return ERR_PTR(ret);
 }
 
+static bool client_is_banned(struct drm_i915_file_private *file_priv)
+{
+	if (file_priv->context_bans <= I915_MAX_CLIENT_CONTEXT_BANS)
+		return false;
+
+	return true;
+}
+
 /**
  * The default context needs to exist per ring that uses contexts. It stores the
  * context state of the GPU for applications that don't utilize HW contexts, as
@@ -360,6 +368,14 @@ i915_gem_create_context(struct drm_device *dev,
 
 	lockdep_assert_held(&dev->struct_mutex);
 
+	if (file_priv && client_is_banned(file_priv)) {
+		DRM_DEBUG("client %s[%d] banned from creating ctx\n",
+			  current->comm,
+			  pid_nr(get_task_pid(current, PIDTYPE_PID)));
+
+		return ERR_PTR(-EIO);
+	}
+
 	ctx = __create_hw_context(dev, file_priv);
 	if (IS_ERR(ctx))
 		return ctx;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e804cb2..21beaf0 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1231,16 +1231,20 @@ static struct i915_gem_context *
 i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
 			  struct intel_engine_cs *engine, const u32 ctx_id)
 {
+	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct i915_gem_context *ctx;
 	struct i915_ctx_hang_stats *hs;
 
-	ctx = i915_gem_context_lookup(file->driver_priv, ctx_id);
+	ctx = i915_gem_context_lookup(file_priv, ctx_id);
 	if (IS_ERR(ctx))
 		return ctx;
 
 	hs = &ctx->hang_stats;
 	if (hs->banned) {
-		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
+		DRM_DEBUG("Client %s banned from submitting (%d:%d)\n",
+			  ctx->name,
+			  file_priv->context_bans,
+			  hs->ban_score);
 		return ERR_PTR(-EIO);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d2e233..b8dfa0c 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -536,10 +536,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		if (error->engine[i].hangcheck_guilty &&
 		    error->engine[i].pid != -1) {
-			err_printf(m, "Active process (on ring %s): %s [%d]\n",
+			err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
 				   engine_str(i),
 				   error->engine[i].comm,
-				   error->engine[i].pid);
+				   error->engine[i].pid,
+				   error->engine[i].context_bans);
 		}
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -630,9 +631,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		if (obj) {
 			err_puts(m, dev_priv->engine[i]->name);
 			if (ee->pid != -1)
-				err_printf(m, " (submitted by %s [%d])",
+				err_printf(m, " (submitted by %s [%d], bans %d)",
 					   ee->comm,
-					   ee->pid);
+					   ee->pid,
+					   ee->context_bans);
 			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
 				   upper_32_bits(obj->gtt_offset),
 				   lower_32_bits(obj->gtt_offset));
-- 
2.7.4

* [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct
  2016-11-15 14:36 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
                   ` (3 preceding siblings ...)
  2016-11-15 14:36 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
@ 2016-11-15 14:36 ` Mika Kuoppala
  2016-11-15 15:45 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork
  2016-11-15 16:39 ` [PATCH 1/6] " Chris Wilson
  6 siblings, 0 replies; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 14:36 UTC (permalink / raw)
  To: intel-gfx

The bannable property, banned status, and the guilty and active
counts are properties of i915_gem_context. Make them so.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            | 27 ++++++++-------------------
 drivers/gpu/drm/i915/i915_gem.c            | 25 ++++++++++++-------------
 drivers/gpu/drm/i915/i915_gem_context.c    | 12 +++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  6 ++----
 drivers/gpu/drm/i915/i915_gem_request.c    |  4 ++--
 drivers/gpu/drm/i915/i915_gpu_error.c      |  2 +-
 6 files changed, 30 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 92c5284..fc9d348 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -889,23 +889,6 @@ enum i915_cache_level {
 	I915_CACHE_WT, /* hsw:gt3e WriteThrough for scanouts */
 };
 
-struct i915_ctx_hang_stats {
-	/* This context had batch pending when hang was declared */
-	unsigned batch_pending;
-
-	/* This context had batch active when hang was declared */
-	unsigned batch_active;
-
-	bool bannable:1;
-
-	/* This context is banned to submit more work */
-	bool banned:1;
-
-	/* Accumulated score of hangs caused by this context */
-	int ban_score;
-};
-
-/* This must match up with the value previously used for execbuf2.rsvd1. */
 #define DEFAULT_CONTEXT_HANDLE 0
 
 /**
@@ -935,8 +918,6 @@ struct i915_gem_context {
 	struct pid *pid;
 	const char *name;
 
-	struct i915_ctx_hang_stats hang_stats;
-
 	unsigned long flags;
 #define CONTEXT_NO_ZEROMAP		BIT(0)
 #define CONTEXT_NO_ERROR_CAPTURE	BIT(1)
@@ -965,6 +946,14 @@ struct i915_gem_context {
 
 	u8 remap_slice;
 	bool closed:1;
+	bool bannable:1;
+	bool banned:1;
+
+	unsigned int guilty_count; /* guilty of a hang */
+	unsigned int active_count; /* active during hang */
+
+	/* Accumulated score of hangs caused by this context */
+	int ban_score;
 };
 
 enum fb_op_origin {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f56230f..f78c407 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2620,15 +2620,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 
 static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 {
-	const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
-
-	if (hs->banned)
+	if (ctx->banned)
 		return true;
 
-	if (!hs->bannable)
+	if (!ctx->bannable)
 		return false;
 
-	if (hs->ban_score >= 40) {
+	if (ctx->ban_score >= 40) {
 		DRM_DEBUG("context hanging too often, banning!\n");
 		return true;
 	}
@@ -2638,20 +2636,21 @@ static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 
 static void i915_gem_request_mark_guilty(struct drm_i915_gem_request *request)
 {
-	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
+	struct i915_gem_context *ctx = request->ctx;
 
-	hs->ban_score += 10;
+	ctx->ban_score += 10;
 
-	hs->banned = i915_context_is_banned(request->ctx);
-	hs->batch_active++;
+	ctx->banned = i915_context_is_banned(request->ctx);
+	ctx->guilty_count++;
 
 	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
-			 request->ctx->name, hs->ban_score, yesno(hs->banned));
+			 request->ctx->name, ctx->ban_score,
+			 yesno(ctx->banned));
 
 	if (!request->file_priv)
 		return;
 
-	if (hs->banned) {
+	if (ctx->banned) {
 		request->file_priv->context_bans++;
 
 		DRM_DEBUG_DRIVER("client %s has has %d context banned\n",
@@ -2662,9 +2661,9 @@ static void i915_gem_request_mark_guilty(struct drm_i915_gem_request *request)
 
 static void i915_gem_request_mark_innocent(struct drm_i915_gem_request *request)
 {
-	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
+	struct i915_gem_context *ctx = request->ctx;
 
-	hs->batch_pending++;
+	ctx->active_count++;
 }
 
 struct drm_i915_gem_request *
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 4ddd044..a854eb0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev,
 	 * is no remap info, it will be a NOP. */
 	ctx->remap_slice = ALL_L3_SLICES(dev_priv);
 
-	ctx->hang_stats.bannable = true;
+	ctx->bannable = true;
 	ctx->ring_size = 4 * PAGE_SIZE;
 	ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) <<
 			     GEN8_CTX_ADDRESSING_MODE_SHIFT;
@@ -1118,7 +1118,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
 		break;
 	case I915_CONTEXT_PARAM_BANNABLE:
-		args->value = ctx->hang_stats.bannable;
+		args->value = ctx->bannable;
 		break;
 	default:
 		ret = -EINVAL;
@@ -1175,7 +1175,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 		else if (!capable(CAP_SYS_ADMIN))
 			ret = -EPERM;
 		else
-			ctx->hang_stats.bannable = args->value;
+			ctx->bannable = args->value;
 		break;
 	default:
 		ret = -EINVAL;
@@ -1191,7 +1191,6 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct drm_i915_reset_stats *args = data;
-	struct i915_ctx_hang_stats *hs;
 	struct i915_gem_context *ctx;
 	int ret;
 
@@ -1210,15 +1209,14 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
 		mutex_unlock(&dev->struct_mutex);
 		return PTR_ERR(ctx);
 	}
-	hs = &ctx->hang_stats;
 
 	if (capable(CAP_SYS_ADMIN))
 		args->reset_count = i915_reset_count(&dev_priv->gpu_error);
 	else
 		args->reset_count = 0;
 
-	args->batch_active = hs->batch_active;
-	args->batch_pending = hs->batch_pending;
+	args->batch_active = ctx->guilty_count;
+	args->batch_pending = ctx->active_count;
 
 	mutex_unlock(&dev->struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 21beaf0..1a2ee19 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1233,18 +1233,16 @@ i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
 {
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct i915_gem_context *ctx;
-	struct i915_ctx_hang_stats *hs;
 
 	ctx = i915_gem_context_lookup(file_priv, ctx_id);
 	if (IS_ERR(ctx))
 		return ctx;
 
-	hs = &ctx->hang_stats;
-	if (hs->banned) {
+	if (ctx->banned) {
 		DRM_DEBUG("Client %s banned from submitting (%d:%d)\n",
 			  ctx->name,
 			  file_priv->context_bans,
-			  hs->ban_score);
+			  ctx->ban_score);
 		return ERR_PTR(-EIO);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 095c809..3c66798 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -205,8 +205,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	trace_i915_gem_request_retire(request);
 
 	/* Retirement decays the ban score as it is a sign of ctx progress */
-	if (request->ctx->hang_stats.ban_score > 0)
-		request->ctx->hang_stats.ban_score--;
+	if (request->ctx->ban_score > 0)
+		request->ctx->ban_score--;
 
 	spin_lock_irq(&request->engine->timeline->lock);
 	list_del_init(&request->link);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index b8dfa0c..046b6b1 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1170,7 +1170,7 @@ static void record_request(struct drm_i915_gem_request *request,
 			   struct drm_i915_error_request *erq)
 {
 	erq->context = request->ctx->hw_id;
-	erq->ban_score = request->ctx->hang_stats.ban_score;
+	erq->ban_score = request->ctx->ban_score;
 	erq->seqno = request->global_seqno;
 	erq->jiffies = request->emitted_jiffies;
 	erq->head = request->head;
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases
  2016-11-15 14:36 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
                   ` (4 preceding siblings ...)
  2016-11-15 14:36 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala
@ 2016-11-15 15:45 ` Patchwork
  2016-11-15 16:47   ` Chris Wilson
  2016-11-15 16:39 ` [PATCH 1/6] " Chris Wilson
  6 siblings, 1 reply; 19+ messages in thread
From: Patchwork @ 2016-11-15 15:45 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] drm/i915: Split up hangcheck phases
URL   : https://patchwork.freedesktop.org/series/15351/
State : success

== Summary ==

Series 15351v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/15351/revisions/1/mbox/


fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
fi-bsw-n3050     total:244  pass:204  dwarn:0   dfail:0   fail:0   skip:40 
fi-bxt-t5700     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53 
fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-kbl-7200u     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
fi-skl-6700hq    total:244  pass:223  dwarn:0   dfail:0   fail:0   skip:21 
fi-skl-6700k     total:244  pass:222  dwarn:1   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33 

04145fe15cf8c81c221e62fc9d65d93053f9bd1a drm-intel-nightly: 2016y-11m-15d-14h-47m-07s UTC integration manifest
b216162 drm/i915: Wipe hang stats as an embedded struct
5d08c98 drm/i915: Add per client max context ban limit
6550ffa drm/i915: Add bannable context parameter
3f59a7a drm/i915: Use request retirement as context progress
a997e41 drm/i915: Decouple hang detection from hangcheck period
75c63fa drm/i915: Split up hangcheck phases

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3001/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/6] drm/i915: Add per client max context ban limit
  2016-11-15 14:36 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
@ 2016-11-15 16:27   ` Chris Wilson
  2016-11-15 17:12     ` Mika Kuoppala
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2016-11-15 16:27 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Tue, Nov 15, 2016 at 04:36:35PM +0200, Mika Kuoppala wrote:
> If we have a bad client submitting unfavourably across different
> contexts, creating new ones, the per context scoring of badness
> doesn't remove the root cause, the offending client.
> To counter, keep track of per client context bans. Deny access if
> client is responsible for more than 3 context bans in
> its lifetime.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h            |  4 ++++
>  drivers/gpu/drm/i915/i915_gem.c            | 28 +++++++++++++++++++++-------
>  drivers/gpu/drm/i915/i915_gem_context.c    | 16 ++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 ++++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c      | 10 ++++++----
>  5 files changed, 53 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5af1c38..92c5284 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -411,6 +411,9 @@ struct drm_i915_file_private {
>  	} rps;
>  
>  	unsigned int bsd_engine;
> +
> +#define I915_MAX_CLIENT_CONTEXT_BANS 3
> +	int context_bans;

3 feels a little too small. But possibly ok since this is at the context
level. Still webgl...

>  };
>  
>  /* Used by dp and fdi links */
> @@ -854,6 +857,7 @@ struct drm_i915_error_state {
>  
>  		pid_t pid;
>  		char comm[TASK_COMM_LEN];
> +		int context_bans;
>  	} engine[I915_NUM_ENGINES];
>  
>  	struct drm_i915_error_buffer {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 40a9e10..f56230f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2636,19 +2636,33 @@ static bool i915_context_is_banned(const struct i915_gem_context *ctx)
>  	return false;
>  }
>  
> -static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
> +static void i915_gem_request_mark_guilty(struct drm_i915_gem_request *request)
>  {
> -	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
> +	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
>  
>  	hs->ban_score += 10;
>  
> -	hs->banned = i915_context_is_banned(ctx);
> +	hs->banned = i915_context_is_banned(request->ctx);
>  	hs->batch_active++;
> +
> +	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
> +			 request->ctx->name, hs->ban_score, yesno(hs->banned));
> +
> +	if (!request->file_priv)
> +		return;
> +
> +	if (hs->banned) {
> +		request->file_priv->context_bans++;
> +
> +		DRM_DEBUG_DRIVER("client %s has has %d context banned\n",
> +				 request->ctx->name,
> +				 request->file_priv->context_bans);
> +	}
>  }
>  
> -static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
> +static void i915_gem_request_mark_innocent(struct drm_i915_gem_request *request)
>  {
> -	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
> +	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
>  
>  	hs->batch_pending++;
>  }
> @@ -2719,9 +2733,9 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
>  	}
>  
>  	if (ring_hung)
> -		i915_gem_context_mark_guilty(request->ctx);
> +		i915_gem_request_mark_guilty(request);
>  	else
> -		i915_gem_context_mark_innocent(request->ctx);
> +		i915_gem_request_mark_innocent(request);

Why? Just use the file_priv on the ctx.

> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 9abaae4..4ddd044 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -347,6 +347,14 @@ __create_hw_context(struct drm_device *dev,
>  	return ERR_PTR(ret);
>  }
>  
> +static bool client_is_banned(struct drm_i915_file_private *file_priv)
> +{
> +	if (file_priv->context_bans <= I915_MAX_CLIENT_CONTEXT_BANS)
> +		return false;

	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
> +
> +	return true;
> +}
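
In other words, with that change the helper collapses to something like
(a sketch of the same check, nothing new added):

static bool client_is_banned(struct drm_i915_file_private *file_priv)
{
	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
}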
> +
>  /**
>   * The default context needs to exist per ring that uses contexts. It stores the
>   * context state of the GPU for applications that don't utilize HW contexts, as
> @@ -360,6 +368,14 @@ i915_gem_create_context(struct drm_device *dev,
>  
>  	lockdep_assert_held(&dev->struct_mutex);
>  
> +	if (file_priv && client_is_banned(file_priv)) {

So this belongs to context_create_ioctl

> +		DRM_DEBUG("client %s[%d] banned from creating ctx\n",
> +			  current->comm,
> +			  pid_nr(get_task_pid(current, PIDTYPE_PID)));
> +
> +		return ERR_PTR(-EIO);
> +	}
> +
>  	ctx = __create_hw_context(dev, file_priv);
>  	if (IS_ERR(ctx))
>  		return ctx;
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index e804cb2..21beaf0 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1231,16 +1231,20 @@ static struct i915_gem_context *
>  i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
>  			  struct intel_engine_cs *engine, const u32 ctx_id)
>  {
> +	struct drm_i915_file_private *file_priv = file->driver_priv;
>  	struct i915_gem_context *ctx;
>  	struct i915_ctx_hang_stats *hs;
>  
> -	ctx = i915_gem_context_lookup(file->driver_priv, ctx_id);
> +	ctx = i915_gem_context_lookup(file_priv, ctx_id);
>  	if (IS_ERR(ctx))
>  		return ctx;
>  
>  	hs = &ctx->hang_stats;
>  	if (hs->banned) {
> -		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
> +		DRM_DEBUG("Client %s banned from submitting (%d:%d)\n",
> +			  ctx->name,
> +			  file_priv->context_bans,
> +			  hs->ban_score);

Context bans prevent creating new contexts. What is their relevance here?

(We should start using pr_debug for these user aids, at least something
more suitable than DRM_DEBUG).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] drm/i915: Add bannable context parameter
  2016-11-15 14:36 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala
@ 2016-11-15 16:31   ` Chris Wilson
  0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2016-11-15 16:31 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Tue, Nov 15, 2016 at 04:36:34PM +0200, Mika Kuoppala wrote:
> Now that the driver has per-context scoring of 'hanging badness',
> and subsequent hangs during short windows are allowed as long as
> progress is made in between, it no longer makes sense to expose a
> ban timing window as a context parameter.
> 
> Let the scoring be the sole indicator for the ban policy, and replace
> the ban period context parameter with a boolean to get/set the
> context's bannable property.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>

> @@ -1156,6 +1153,14 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
>  		}
>  		break;
> +	case I915_CONTEXT_PARAM_BANNABLE:
> +		if (args->size)
> +			ret = -EINVAL;
> +		else if (!capable(CAP_SYS_ADMIN))

You can allow non-root to opt into being banned.

else if (!capable(CAP_SYS_ADMIN) && !args->value)
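
For clarity, the whole case with that tweak applied might read along these
lines (a sketch; it assumes the field is still ctx->hang_stats.bannable at
this point in the series, before the later rename to ctx->bannable):

	case I915_CONTEXT_PARAM_BANNABLE:
		if (args->size)
			ret = -EINVAL;
		else if (!capable(CAP_SYS_ADMIN) && !args->value)
			/* non-root may opt in to being banned, but not out */
			ret = -EPERM;
		else
			ctx->hang_stats.bannable = args->value;
		break;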

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/6] drm/i915: Use request retirement as context progress
  2016-11-15 14:36 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala
@ 2016-11-15 16:34   ` Chris Wilson
  0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2016-11-15 16:34 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Tue, Nov 15, 2016 at 04:36:33PM +0200, Mika Kuoppala wrote:
> As the hangcheck score was removed, the active decay of the score
> was removed with it. This took away hangcheck's ability to detect
> whether a gpu client was accidentally or maliciously causing
> intermittent hangs. Reinstate the scoring as a per-context property,
> so that if one context starts to act unfavourably, it gets banned.
> 
> v2: ban_period_secs as a gate to score check (Chris)
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>

> -	elapsed = get_seconds() - ctx->hang_stats.guilty_ts;
> -	if (ctx->hang_stats.ban_period_seconds &&
> -	    elapsed <= ctx->hang_stats.ban_period_seconds) {
> +	if (!hs->ban_period_seconds)
> +		return false;
> +
> +	elapsed = get_seconds() - hs->guilty_ts;
> +	if (elapsed <= hs->ban_period_seconds) {
>  		DRM_DEBUG("context hanging too fast, banning!\n");
>  		return true;
>  	}
>  
> +	if (hs->ban_score >= 40) {
> +		DRM_DEBUG("context hanging too often, banning!\n");
> +		return true;
> +	}
> +
>  	return false;
>  }

> +	hs->ban_score += 10;

This pair should be tunables (i.e. a macro somewhere sensible).
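
Something along these lines, presumably (the names are illustrative, only
the values 10 and 40 come from the patch):

#define CONTEXT_SCORE_GUILTY		10	/* added on each guilty hang */
#define CONTEXT_SCORE_BAN_THRESHOLD	40	/* context banned at this score */

with the literal 10 and 40 above then replaced by the macros.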

> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index b9b5253..095c809 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -204,6 +204,10 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
>  
>  	trace_i915_gem_request_retire(request);
>  
> +	/* Retirement decays the ban score as it is a sign of ctx progress */
> +	if (request->ctx->hang_stats.ban_score > 0)
> +		request->ctx->hang_stats.ban_score--;

Please put this along with the other request->ctx updates (i.e. after
request->previous_context and before the context_put).

Otherwise lgtm.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period
  2016-11-15 14:36 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
@ 2016-11-15 16:38   ` Chris Wilson
  2016-11-15 17:23     ` Mika Kuoppala
  2016-11-16  8:07   ` Chris Wilson
  1 sibling, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2016-11-15 16:38 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Tue, Nov 15, 2016 at 04:36:32PM +0200, Mika Kuoppala wrote:
> +static bool
> +hangcheck_engine_stall(struct intel_engine_cs *engine,
> +		       struct intel_engine_hangcheck *hc)
> +{
> +	const unsigned long last_action = engine->hangcheck.action_timestamp;
>  
> -		memset(&engine->hangcheck.instdone, 0,
> -		       sizeof(engine->hangcheck.instdone));
> -		break;
> +	if (hc->action == HANGCHECK_ACTIVE_SEQNO ||
> +	    hc->action == HANGCHECK_IDLE)
> +		return false;
> +
> +	if (time_before(jiffies, last_action + HANGCHECK_HUNG_JIFFIES))
> +		return false;
> +
> +	if (time_before(jiffies, last_action + HANGCHECK_STUCK_JIFFIES))
> +		if (hc->action != HANGCHECK_HUNG)
> +			return false;

This deserves a lot more explanation. Why two values? What is their
significance?

Do we want to be using jiffies? (Values are in seconds, so jiffy
resolution makes sense, but add that as a comment somewhere.)

> +	return true;
> +}
> +
> +static struct intel_engine_cs *find_lra_engine(struct drm_i915_private *i915,
> +					       const unsigned int mask)

What is lra?

> +{
> +	struct intel_engine_cs *engine = NULL, *c;
> +	enum intel_engine_id id;
> +
> +	for_each_engine_masked(c, i915, mask, id) {
> +		if (engine == NULL) {
> +			engine = c;
> +			continue;
> +		}
> +
> +		if (time_before(c->hangcheck.action_timestamp,
> +				engine->hangcheck.action_timestamp))
> +			engine = c;
> +		else if (c->hangcheck.action_timestamp ==
> +			 engine->hangcheck.action_timestamp &&
> +			 c->hangcheck.seqno < engine->hangcheck.seqno)
> +			engine = c;

Why is the last engine significant?
Why is just one engine guilty?


Too many whys in this one.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/6] drm/i915: Split up hangcheck phases
  2016-11-15 14:36 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
                   ` (5 preceding siblings ...)
  2016-11-15 15:45 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork
@ 2016-11-15 16:39 ` Chris Wilson
  6 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2016-11-15 16:39 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Tue, Nov 15, 2016 at 04:36:31PM +0200, Mika Kuoppala wrote:
> In order to simplify hangcheck state keeping, split hangcheck
> per engine loop in three phases: state load, action, state save.
> 
> Add few more hangcheck actions to separate between seqno, head
> and subunit movements. This helps to gather all the hangcheck
> actions under a single switch umbrella.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases
  2016-11-15 15:45 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork
@ 2016-11-15 16:47   ` Chris Wilson
  2016-11-15 17:27     ` Mika Kuoppala
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2016-11-15 16:47 UTC (permalink / raw)
  To: intel-gfx

On Tue, Nov 15, 2016 at 03:45:41PM -0000, Patchwork wrote:
> == Series Details ==
> 
> Series: series starting with [1/6] drm/i915: Split up hangcheck phases
> URL   : https://patchwork.freedesktop.org/series/15351/
> State : success
> 
> == Summary ==
> 
> Series 15351v1 Series without cover letter
> https://patchwork.freedesktop.org/api/1.0/series/15351/revisions/1/mbox/
> 
> 
> fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
> fi-bsw-n3050     total:244  pass:204  dwarn:0   dfail:0   fail:0   skip:40 
> fi-bxt-t5700     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
> fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
> fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
> fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
> fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
> fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53 
> fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
> fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
> fi-kbl-7200u     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
> fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
> fi-skl-6700hq    total:244  pass:223  dwarn:0   dfail:0   fail:0   skip:21 
> fi-skl-6700k     total:244  pass:222  dwarn:1   dfail:0   fail:0   skip:21 
> fi-skl-6770hq    total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
> fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
> fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33 

Worth asking: what's the impact on the hangcheck tests runtime within BAT?

Presumably still the same timeouts...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/6] drm/i915: Add per client max context ban limit
  2016-11-15 16:27   ` Chris Wilson
@ 2016-11-15 17:12     ` Mika Kuoppala
  0 siblings, 0 replies; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 17:12 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Tue, Nov 15, 2016 at 04:36:35PM +0200, Mika Kuoppala wrote:
>> If we have a bad client submitting unfavourably across different
>> contexts, creating new ones, the per context scoring of badness
>> doesn't remove the root cause, the offending client.
>> To counter, keep track of per client context bans. Deny access if
>> client is responsible for more than 3 context bans in
>> its lifetime.
>> 
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h            |  4 ++++
>>  drivers/gpu/drm/i915/i915_gem.c            | 28 +++++++++++++++++++++-------
>>  drivers/gpu/drm/i915/i915_gem_context.c    | 16 ++++++++++++++++
>>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 ++++++--
>>  drivers/gpu/drm/i915/i915_gpu_error.c      | 10 ++++++----
>>  5 files changed, 53 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 5af1c38..92c5284 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -411,6 +411,9 @@ struct drm_i915_file_private {
>>  	} rps;
>>  
>>  	unsigned int bsd_engine;
>> +
>> +#define I915_MAX_CLIENT_CONTEXT_BANS 3
>> +	int context_bans;
>
> 3 feels a little too small. But possibly ok since this is at the context
> level. Still webgl...
>

3 bans is a significant amount of hangs: it takes 12 consecutive hangs,
without a single successful request in between, to reach this.
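
Roughly, with the values posted in this series, and ignoring the separate
"hanging too fast" time-window check (a sketch of the arithmetic only):

/*
 * +10 ban_score per guilty hang, context banned at a score of 40
 *   -> 4 consecutive guilty hangs ban one context
 * 3 context bans allowed per client
 *   -> 4 * 3 = 12 consecutive guilty hangs, with no request retired in
 *      between to decay the score, before the client hits the limit and
 *      further context creation is denied.
 */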

>>  };
>>  
>>  /* Used by dp and fdi links */
>> @@ -854,6 +857,7 @@ struct drm_i915_error_state {
>>  
>>  		pid_t pid;
>>  		char comm[TASK_COMM_LEN];
>> +		int context_bans;
>>  	} engine[I915_NUM_ENGINES];
>>  
>>  	struct drm_i915_error_buffer {
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 40a9e10..f56230f 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2636,19 +2636,33 @@ static bool i915_context_is_banned(const struct i915_gem_context *ctx)
>>  	return false;
>>  }
>>  
>> -static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>> +static void i915_gem_request_mark_guilty(struct drm_i915_gem_request *request)
>>  {
>> -	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
>> +	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
>>  
>>  	hs->ban_score += 10;
>>  
>> -	hs->banned = i915_context_is_banned(ctx);
>> +	hs->banned = i915_context_is_banned(request->ctx);
>>  	hs->batch_active++;
>> +
>> +	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
>> +			 request->ctx->name, hs->ban_score, yesno(hs->banned));
>> +
>> +	if (!request->file_priv)
>> +		return;
>> +
>> +	if (hs->banned) {
>> +		request->file_priv->context_bans++;
>> +
>> +		DRM_DEBUG_DRIVER("client %s has has %d context banned\n",
>> +				 request->ctx->name,
>> +				 request->file_priv->context_bans);
>> +	}
>>  }
>>  
>> -static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
>> +static void i915_gem_request_mark_innocent(struct drm_i915_gem_request *request)
>>  {
>> -	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
>> +	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
>>  
>>  	hs->batch_pending++;
>>  }
>> @@ -2719,9 +2733,9 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
>>  	}
>>  
>>  	if (ring_hung)
>> -		i915_gem_context_mark_guilty(request->ctx);
>> +		i915_gem_request_mark_guilty(request);
>>  	else
>> -		i915_gem_context_mark_innocent(request->ctx);
>> +		i915_gem_request_mark_innocent(request);
>
> Why? Just use the file_priv on the ctx.
>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index 9abaae4..4ddd044 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -347,6 +347,14 @@ __create_hw_context(struct drm_device *dev,
>>  	return ERR_PTR(ret);
>>  }
>>  
>> +static bool client_is_banned(struct drm_i915_file_private *file_priv)
>> +{
>> +	if (file_priv->context_bans <= I915_MAX_CLIENT_CONTEXT_BANS)
>> +		return false;
>
> 	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
>> +
>> +	return true;
>> +}
>> +
>>  /**
>>   * The default context needs to exist per ring that uses contexts. It stores the
>>   * context state of the GPU for applications that don't utilize HW contexts, as
>> @@ -360,6 +368,14 @@ i915_gem_create_context(struct drm_device *dev,
>>  
>>  	lockdep_assert_held(&dev->struct_mutex);
>>  
>> +	if (file_priv && client_is_banned(file_priv)) {
>
> So this belongs to context_create_ioctl
>

Ah yes.

>> +		DRM_DEBUG("client %s[%d] banned from creating ctx\n",
>> +			  current->comm,
>> +			  pid_nr(get_task_pid(current, PIDTYPE_PID)));
>> +
>> +		return ERR_PTR(-EIO);
>> +	}
>> +
>>  	ctx = __create_hw_context(dev, file_priv);
>>  	if (IS_ERR(ctx))
>>  		return ctx;
>> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> index e804cb2..21beaf0 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> @@ -1231,16 +1231,20 @@ static struct i915_gem_context *
>>  i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
>>  			  struct intel_engine_cs *engine, const u32 ctx_id)
>>  {
>> +	struct drm_i915_file_private *file_priv = file->driver_priv;
>>  	struct i915_gem_context *ctx;
>>  	struct i915_ctx_hang_stats *hs;
>>  
>> -	ctx = i915_gem_context_lookup(file->driver_priv, ctx_id);
>> +	ctx = i915_gem_context_lookup(file_priv, ctx_id);
>>  	if (IS_ERR(ctx))
>>  		return ctx;
>>  
>>  	hs = &ctx->hang_stats;
>>  	if (hs->banned) {
>> -		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
>> +		DRM_DEBUG("Client %s banned from submitting (%d:%d)\n",
>> +			  ctx->name,
>> +			  file_priv->context_bans,
>> +			  hs->ban_score);
>
> Context bans prevent creating new contexts. What is their relevance here?
>

No relevance, I just upgraded the message to use ctx->name. It should not be in this patch.

-Mika

> (We should start using pr_debug for these user aids, at least something
> more suitable than DRM_DEBUG).
> -Chris
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period
  2016-11-15 16:38   ` Chris Wilson
@ 2016-11-15 17:23     ` Mika Kuoppala
  2016-11-15 21:32       ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 17:23 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Tue, Nov 15, 2016 at 04:36:32PM +0200, Mika Kuoppala wrote:
>> +static bool
>> +hangcheck_engine_stall(struct intel_engine_cs *engine,
>> +		       struct intel_engine_hangcheck *hc)
>> +{
>> +	const unsigned long last_action = engine->hangcheck.action_timestamp;
>>  
>> -		memset(&engine->hangcheck.instdone, 0,
>> -		       sizeof(engine->hangcheck.instdone));
>> -		break;
>> +	if (hc->action == HANGCHECK_ACTIVE_SEQNO ||
>> +	    hc->action == HANGCHECK_IDLE)
>> +		return false;
>> +
>> +	if (time_before(jiffies, last_action + HANGCHECK_HUNG_JIFFIES))
>> +		return false;
>> +
>> +	if (time_before(jiffies, last_action + HANGCHECK_STUCK_JIFFIES))
>> +		if (hc->action != HANGCHECK_HUNG)
>> +			return false;
>
> This deserves a lot more explanation. Why two values? What is their
> significance?
>

#define DRM_I915_STUCK_PERIOD_SEC 24 /* No observed seqno progress */
#define DRM_I915_HUNG_PERIOD_SEC 4 /* No observed seqno nor head progress */

We allow a bigger timeout if the head is moving inside a request, thus
the two values. I tried to mimic the behaviour of the old code.

> Do we want to be using jiffies? (Values are in seconds, so jiffy
> resolution makes sense, but add that as a comment somewhere.)

Are the defines in seconds, and their jiffy counterparts in
i915_drv.h, not enough?
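
The seconds values are the two defines above; the jiffy counterparts would
presumably be simple conversions along these lines (a sketch, not quoted
from the patch):

#define HANGCHECK_HUNG_JIFFIES	(DRM_I915_HUNG_PERIOD_SEC * HZ)
#define HANGCHECK_STUCK_JIFFIES	(DRM_I915_STUCK_PERIOD_SEC * HZ)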

>
>> +	return true;
>> +}
>> +
>> +static struct intel_engine_cs *find_lra_engine(struct drm_i915_private *i915,
>> +					       const unsigned int mask)
>
> What is lra?
>

Least recently active. I will opencode the name.

>> +{
>> +	struct intel_engine_cs *engine = NULL, *c;
>> +	enum intel_engine_id id;
>> +
>> +	for_each_engine_masked(c, i915, mask, id) {
>> +		if (engine == NULL) {
>> +			engine = c;
>> +			continue;
>> +		}
>> +
>> +		if (time_before(c->hangcheck.action_timestamp,
>> +				engine->hangcheck.action_timestamp))
>> +			engine = c;
>> +		else if (c->hangcheck.action_timestamp ==
>> +			 engine->hangcheck.action_timestamp &&
>> +			 c->hangcheck.seqno < engine->hangcheck.seqno)
>> +			engine = c;
>
> Why is the last engine significant?

We blame the engine which had work and was inactive for the
longest amount of time. 

> Why is just one engine guilty?

Just one? As the timestamps remain valid across resets, the next
advancement after this hang will show progress
on this engine, and the next engine will get caught.

The hangcheck action is a filter to weed out those engines
that can't possibly be the culprit for the hang.

So we aim to pinpoint only one engine with accuracy and
reset just that one, to lean towards per-engine resets.
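
A sketch of how the helper might then read with the name spelled out,
keeping the selection logic quoted above (the condensed condition and the
final return are assumptions, not the posted code):

static struct intel_engine_cs *
find_least_recently_active_engine(struct drm_i915_private *i915,
				  const unsigned int mask)
{
	struct intel_engine_cs *engine = NULL, *c;
	enum intel_engine_id id;

	for_each_engine_masked(c, i915, mask, id) {
		/* Prefer the engine whose last hangcheck action is oldest;
		 * break ties with the lowest observed seqno.
		 */
		if (!engine ||
		    time_before(c->hangcheck.action_timestamp,
				engine->hangcheck.action_timestamp) ||
		    (c->hangcheck.action_timestamp ==
		     engine->hangcheck.action_timestamp &&
		     c->hangcheck.seqno < engine->hangcheck.seqno))
			engine = c;
	}

	return engine;
}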

-Mika

>
>
> Too many whys in this one.
> -Chris
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases
  2016-11-15 16:47   ` Chris Wilson
@ 2016-11-15 17:27     ` Mika Kuoppala
  0 siblings, 0 replies; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-15 17:27 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Tue, Nov 15, 2016 at 03:45:41PM -0000, Patchwork wrote:
>> == Series Details ==
>> 
>> Series: series starting with [1/6] drm/i915: Split up hangcheck phases
>> URL   : https://patchwork.freedesktop.org/series/15351/
>> State : success
>> 
>> == Summary ==
>> 
>> Series 15351v1 Series without cover letter
>> https://patchwork.freedesktop.org/api/1.0/series/15351/revisions/1/mbox/
>> 
>> 
>> fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
>> fi-bsw-n3050     total:244  pass:204  dwarn:0   dfail:0   fail:0   skip:40 
>> fi-bxt-t5700     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
>> fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
>> fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
>> fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
>> fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
>> fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53 
>> fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
>> fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
>> fi-kbl-7200u     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
>> fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
>> fi-skl-6700hq    total:244  pass:223  dwarn:0   dfail:0   fail:0   skip:21 
>> fi-skl-6700k     total:244  pass:222  dwarn:1   dfail:0   fail:0   skip:21 
>> fi-skl-6770hq    total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
>> fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
>> fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33 
>
> Worth asking: what's the impact on the hangcheck tests runtime within BAT?
>

Will measure.

I suspect that we will be a tad faster due to the brutal
nature of igt hangs now being detected deterministically, within
hangcheck resolution accuracy...

> Presumably still the same timeouts...

and that the fastest detection for the simplest engine was 4.5 secs
with the old code. Now it is 4 secs.

I just wanted second resolution for the defines.

-Mika

> -Chris
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period
  2016-11-15 17:23     ` Mika Kuoppala
@ 2016-11-15 21:32       ` Chris Wilson
  0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2016-11-15 21:32 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Tue, Nov 15, 2016 at 07:23:26PM +0200, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > On Tue, Nov 15, 2016 at 04:36:32PM +0200, Mika Kuoppala wrote:
> >> +static bool
> >> +hangcheck_engine_stall(struct intel_engine_cs *engine,
> >> +		       struct intel_engine_hangcheck *hc)
> >> +{
> >> +	const unsigned long last_action = engine->hangcheck.action_timestamp;
> >>  
> >> -		memset(&engine->hangcheck.instdone, 0,
> >> -		       sizeof(engine->hangcheck.instdone));
> >> -		break;
> >> +	if (hc->action == HANGCHECK_ACTIVE_SEQNO ||
> >> +	    hc->action == HANGCHECK_IDLE)
> >> +		return false;
> >> +
> >> +	if (time_before(jiffies, last_action + HANGCHECK_HUNG_JIFFIES))
> >> +		return false;
> >> +
> >> +	if (time_before(jiffies, last_action + HANGCHECK_STUCK_JIFFIES))
> >> +		if (hc->action != HANGCHECK_HUNG)
> >> +			return false;
> >
> > This deserves a lot more explanation. Why two values? What is their
> > significance?
> >
> 
> #define DRM_I915_STUCK_PERIOD_SEC 24 /* No observed seqno progress */
> #define DRM_I915_HUNG_PERIOD_SEC 4 /* No observed seqno nor head progress */

They are very far away and so not helping to explain this code at all.

How about calling them NO_SEQNO_PROGRESS, NO_HEAD_PROGRESS? And 24s
without any seqno progress is far too long. We set off alarms elsewhere
for much shorter periods of unresponsiveness.

> We allow a bigger timeout if the head is moving inside a request, thus
> the two values. I tried to mimic the behaviour of the old code.
> 
> > Do we want to be using jiffies? (Values are in seconds, so jiffy
> > resolution makes sense, but add that as a comment somewhere.)
> 
> Are the defines in seconds, and their jiffy counterparts in
> i915_drv.h, not enough?

I was just thinking how much more flexible ktime is in terms of its
convenience API.

> 
> >
> >> +	return true;
> >> +}
> >> +
> >> +static struct intel_engine_cs *find_lra_engine(struct drm_i915_private *i915,
> >> +					       const unsigned int mask)
> >
> > What is lra?
> >
> 
> Least recently active. I will opencode the name.
> 
> >> +{
> >> +	struct intel_engine_cs *engine = NULL, *c;
> >> +	enum intel_engine_id id;
> >> +
> >> +	for_each_engine_masked(c, i915, mask, id) {
> >> +		if (engine == NULL) {
> >> +			engine = c;
> >> +			continue;
> >> +		}
> >> +
> >> +		if (time_before(c->hangcheck.action_timestamp,
> >> +				engine->hangcheck.action_timestamp))
> >> +			engine = c;
> >> +		else if (c->hangcheck.action_timestamp ==
> >> +			 engine->hangcheck.action_timestamp &&
> >> +			 c->hangcheck.seqno < engine->hangcheck.seqno)
> >> +			engine = c;
> >
> > Why is the last engine significant?
> 
> We blame the engine which had work and was inactive for the
> longest amount of time. 

This feels like a step backwards.
 
> > Why is just one engine guilty?
> 
> Just one? As the timestamps remain valid across resets, the next
> advancement after this hang will show progress
> on this engine, and the next engine will get caught.
> 
> The hangcheck action is a filter to weed out those engines
> that can't possibly be the culprit for the hang.

If it hasn't advanced for umpteen seconds, it is not exactly innocent
either (consider that we take care not to blame those waiting on the guilty).

I thought that, moving forwards, we wanted to treat the engines/resets with
greater independence, whereas this adds global state.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period
  2016-11-15 14:36 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
  2016-11-15 16:38   ` Chris Wilson
@ 2016-11-16  8:07   ` Chris Wilson
  1 sibling, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2016-11-16  8:07 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Tue, Nov 15, 2016 at 04:36:32PM +0200, Mika Kuoppala wrote:
> +static bool
> +hangcheck_engine_stall(struct intel_engine_cs *engine,
> +		       struct intel_engine_hangcheck *hc)
> +{
> +	const unsigned long last_action = engine->hangcheck.action_timestamp;
>  
> -		memset(&engine->hangcheck.instdone, 0,
> -		       sizeof(engine->hangcheck.instdone));
> -		break;
> +	if (hc->action == HANGCHECK_ACTIVE_SEQNO ||
> +	    hc->action == HANGCHECK_IDLE)
> +		return false;
> +
> +	if (time_before(jiffies, last_action + HANGCHECK_HUNG_JIFFIES))
> +		return false;
> +
> +	if (time_before(jiffies, last_action + HANGCHECK_STUCK_JIFFIES))
> +		if (hc->action != HANGCHECK_HUNG)
> +			return false;

Coming back to this, it is very confusing as written. As you
write it, the primary decision is based on time: "if the time since the last
action is greater than the threshold, declare it stuck or hung". That doesn't
convey that you only want to take a certain action if the head hasn't
moved for N seconds, or if the seqno hasn't changed for M seconds.

I kind of expect something more like:

unsigned long timeout;

switch (hc->action) {
case ACTIVE:
case IDLE:
	return false;

case NO_HEAD_MOVEMENT:
	timeout = 5s;
	break;

case NO_SEQNO_ADVANCE:
	timeout = 20s
	break;
}

return time_after(jiffies, last_action + timeout);
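
In full, and mapped onto the names already used in this patch, that might
look something like the following (a sketch only; the thresholds stay
whatever the series settles on):

static bool hangcheck_engine_stall(struct intel_engine_cs *engine,
				   struct intel_engine_hangcheck *hc)
{
	unsigned long timeout;

	switch (hc->action) {
	case HANGCHECK_ACTIVE_SEQNO:
	case HANGCHECK_IDLE:
		/* seqno still advancing, or nothing outstanding */
		return false;

	case HANGCHECK_HUNG:
		/* neither seqno nor head has moved: short fuse */
		timeout = HANGCHECK_HUNG_JIFFIES;
		break;

	default:
		/* seqno stuck, but head/subunits still active: longer fuse */
		timeout = HANGCHECK_STUCK_JIFFIES;
		break;
	}

	return time_after(jiffies,
			  engine->hangcheck.action_timestamp + timeout);
}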
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 1/6] drm/i915: Split up hangcheck phases
@ 2016-11-16 15:20 Mika Kuoppala
  0 siblings, 0 replies; 19+ messages in thread
From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw)
  To: intel-gfx

In order to simplify hangcheck state keeping, split hangcheck
per engine loop in three phases: state load, action, state save.

Add few more hangcheck actions to separate between seqno, head
and subunit movements. This helps to gather all the hangcheck
actions under a single switch umbrella.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gpu_error.c   |   8 +-
 drivers/gpu/drm/i915/intel_hangcheck.c  | 241 ++++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +-
 3 files changed, 146 insertions(+), 107 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d620bd..f02f581 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
 		return "idle";
 	case HANGCHECK_WAIT:
 		return "wait";
-	case HANGCHECK_ACTIVE:
-		return "active";
+	case HANGCHECK_ACTIVE_SEQNO:
+		return "active seqno";
+	case HANGCHECK_ACTIVE_HEAD:
+		return "active head";
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		return "active subunits";
 	case HANGCHECK_KICK:
 		return "kick";
 	case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 53df5b1..3d2e81c 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
 
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_HEAD;
 	}
 
 	if (!subunits_stuck(engine))
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_SUBUNITS;
 
 	return HANGCHECK_HUNG;
 }
@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 
+static void hangcheck_load_sample(struct intel_engine_cs *engine,
+				  struct intel_engine_hangcheck *hc)
+{
+	/* We don't strictly need an irq-barrier here, as we are not
+	 * serving an interrupt request, be paranoid in case the
+	 * barrier has side-effects (such as preventing a broken
+	 * cacheline snoop) and so be sure that we can see the seqno
+	 * advance. If the seqno should stick, due to a stale
+	 * cacheline, we would erroneously declare the GPU hung.
+	 */
+	if (engine->irq_seqno_barrier)
+		engine->irq_seqno_barrier(engine);
+
+	hc->acthd = intel_engine_get_active_head(engine);
+	hc->seqno = intel_engine_get_seqno(engine);
+	hc->score = engine->hangcheck.score;
+}
+
+static void hangcheck_store_sample(struct intel_engine_cs *engine,
+				   const struct intel_engine_hangcheck *hc)
+{
+	engine->hangcheck.acthd = hc->acthd;
+	engine->hangcheck.seqno = hc->seqno;
+	engine->hangcheck.score = hc->score;
+	engine->hangcheck.action = hc->action;
+}
+
+static enum intel_engine_hangcheck_action
+hangcheck_get_action(struct intel_engine_cs *engine,
+		     const struct intel_engine_hangcheck *hc)
+{
+	if (engine->hangcheck.seqno != hc->seqno)
+		return HANGCHECK_ACTIVE_SEQNO;
+
+	if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
+		return HANGCHECK_IDLE;
+
+	return engine_stuck(engine, hc->acthd);
+}
+
+static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
+					struct intel_engine_hangcheck *hc)
+{
+	hc->action = hangcheck_get_action(engine, hc);
+
+	switch (hc->action) {
+	case HANGCHECK_IDLE:
+	case HANGCHECK_WAIT:
+		break;
+
+	case HANGCHECK_ACTIVE_HEAD:
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		/* We always increment the hangcheck score
+		 * if the engine is busy and still processing
+		 * the same request, so that no single request
+		 * can run indefinitely (such as a chain of
+		 * batches). The only time we do not increment
+		 * the hangcheck score on this ring, if this
+		 * engine is in a legitimate wait for another
+		 * engine. In that case the waiting engine is a
+		 * victim and we want to be sure we catch the
+		 * right culprit. Then every time we do kick
+		 * the ring, add a small increment to the
+		 * score so that we can catch a batch that is
+		 * being repeatedly kicked and so responsible
+		 * for stalling the machine.
+		 */
+		hc->score += 1;
+		break;
+
+	case HANGCHECK_KICK:
+		hc->score += 5;
+		break;
+
+	case HANGCHECK_HUNG:
+		hc->score += 20;
+		break;
+
+	case HANGCHECK_ACTIVE_SEQNO:
+		/* Gradually reduce the count so that we catch DoS
+		 * attempts across multiple batches.
+		 */
+		if (hc->score > 0)
+			hc->score -= 15;
+		if (hc->score < 0)
+			hc->score = 0;
+
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
+
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
+		break;
+
+	default:
+		MISSING_CASE(hc->action);
+	}
+}
+
+static void hangcheck_declare_hang(struct drm_i915_private *i915,
+				   unsigned int hung,
+				   unsigned int stuck)
+{
+	struct intel_engine_cs *engine;
+	char msg[80];
+	unsigned int tmp;
+	int len;
+
+	/* If some rings hung but others were still busy, only
+	 * blame the hanging rings in the synopsis.
+	 */
+	if (stuck != hung)
+		hung &= ~stuck;
+	len = scnprintf(msg, sizeof(msg),
+			"%s on ", stuck == hung ? "No progress" : "Hang");
+	for_each_engine_masked(engine, i915, hung, tmp)
+		len += scnprintf(msg + len, sizeof(msg) - len,
+				 "%s, ", engine->name);
+	msg[len-2] = '\0';
+
+	return i915_handle_error(i915, hung, msg);
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	enum intel_engine_id id;
 	unsigned int hung = 0, stuck = 0;
 	int busy_count = 0;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define ACTIVE_DECAY 15
 
 	if (!i915.enable_hangcheck)
 		return;
@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 
 	for_each_engine(engine, dev_priv, id) {
-		bool busy = intel_engine_has_waiter(engine);
-		u64 acthd;
-		u32 seqno;
-		u32 submit;
+		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
+		const bool busy = intel_engine_has_waiter(engine);
 
 		semaphore_clear_deadlocks(dev_priv);
 
-		/* We don't strictly need an irq-barrier here, as we are not
-		 * serving an interrupt request, be paranoid in case the
-		 * barrier has side-effects (such as preventing a broken
-		 * cacheline snoop) and so be sure that we can see the seqno
-		 * advance. If the seqno should stick, due to a stale
-		 * cacheline, we would erroneously declare the GPU hung.
-		 */
-		if (engine->irq_seqno_barrier)
-			engine->irq_seqno_barrier(engine);
-
-		acthd = intel_engine_get_active_head(engine);
-		seqno = intel_engine_get_seqno(engine);
-		submit = intel_engine_last_submit(engine);
-
-		if (engine->hangcheck.seqno == seqno) {
-			if (i915_seqno_passed(seqno, submit)) {
-				engine->hangcheck.action = HANGCHECK_IDLE;
-			} else {
-				/* We always increment the hangcheck score
-				 * if the engine is busy and still processing
-				 * the same request, so that no single request
-				 * can run indefinitely (such as a chain of
-				 * batches). The only time we do not increment
-				 * the hangcheck score on this ring, if this
-				 * engine is in a legitimate wait for another
-				 * engine. In that case the waiting engine is a
-				 * victim and we want to be sure we catch the
-				 * right culprit. Then every time we do kick
-				 * the ring, add a small increment to the
-				 * score so that we can catch a batch that is
-				 * being repeatedly kicked and so responsible
-				 * for stalling the machine.
-				 */
-				engine->hangcheck.action =
-					engine_stuck(engine, acthd);
-
-				switch (engine->hangcheck.action) {
-				case HANGCHECK_IDLE:
-				case HANGCHECK_WAIT:
-					break;
-				case HANGCHECK_ACTIVE:
-					engine->hangcheck.score += BUSY;
-					break;
-				case HANGCHECK_KICK:
-					engine->hangcheck.score += KICK;
-					break;
-				case HANGCHECK_HUNG:
-					engine->hangcheck.score += HUNG;
-					break;
-				}
-			}
-
-			if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-				hung |= intel_engine_flag(engine);
-				if (engine->hangcheck.action != HANGCHECK_HUNG)
-					stuck |= intel_engine_flag(engine);
-			}
-		} else {
-			engine->hangcheck.action = HANGCHECK_ACTIVE;
-
-			/* Gradually reduce the count so that we catch DoS
-			 * attempts across multiple batches.
-			 */
-			if (engine->hangcheck.score > 0)
-				engine->hangcheck.score -= ACTIVE_DECAY;
-			if (engine->hangcheck.score < 0)
-				engine->hangcheck.score = 0;
-
-			/* Clear head and subunit states on seqno movement */
-			acthd = 0;
-
-			memset(&engine->hangcheck.instdone, 0,
-			       sizeof(engine->hangcheck.instdone));
+		hangcheck_load_sample(engine, hc);
+		hangcheck_accumulate_sample(engine, hc);
+		hangcheck_store_sample(engine, hc);
+
+		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+			hung |= intel_engine_flag(engine);
+			if (hc->action != HANGCHECK_HUNG)
+				stuck |= intel_engine_flag(engine);
 		}
 
-		engine->hangcheck.seqno = seqno;
-		engine->hangcheck.acthd = acthd;
 		busy_count += busy;
 	}
 
-	if (hung) {
-		char msg[80];
-		unsigned int tmp;
-		int len;
-
-		/* If some rings hung but others were still busy, only
-		 * blame the hanging rings in the synopsis.
-		 */
-		if (stuck != hung)
-			hung &= ~stuck;
-		len = scnprintf(msg, sizeof(msg),
-				"%s on ", stuck == hung ? "No progress" : "Hang");
-		for_each_engine_masked(engine, dev_priv, hung, tmp)
-			len += scnprintf(msg + len, sizeof(msg) - len,
-					 "%s, ", engine->name);
-		msg[len-2] = '\0';
-
-		return i915_handle_error(dev_priv, hung, msg);
-	}
+	if (hung)
+		hangcheck_declare_hang(dev_priv, hung, stuck);
 
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3466b4e..3152b2b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -67,7 +67,9 @@ struct intel_hw_status_page {
 enum intel_engine_hangcheck_action {
 	HANGCHECK_IDLE = 0,
 	HANGCHECK_WAIT,
-	HANGCHECK_ACTIVE,
+	HANGCHECK_ACTIVE_SEQNO,
+	HANGCHECK_ACTIVE_HEAD,
+	HANGCHECK_ACTIVE_SUBUNITS,
 	HANGCHECK_KICK,
 	HANGCHECK_HUNG,
 };
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2016-11-16 15:20 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-15 14:36 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
2016-11-15 14:36 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
2016-11-15 16:38   ` Chris Wilson
2016-11-15 17:23     ` Mika Kuoppala
2016-11-15 21:32       ` Chris Wilson
2016-11-16  8:07   ` Chris Wilson
2016-11-15 14:36 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala
2016-11-15 16:34   ` Chris Wilson
2016-11-15 14:36 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala
2016-11-15 16:31   ` Chris Wilson
2016-11-15 14:36 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
2016-11-15 16:27   ` Chris Wilson
2016-11-15 17:12     ` Mika Kuoppala
2016-11-15 14:36 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala
2016-11-15 15:45 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork
2016-11-15 16:47   ` Chris Wilson
2016-11-15 17:27     ` Mika Kuoppala
2016-11-15 16:39 ` [PATCH 1/6] " Chris Wilson
2016-11-16 15:20 Mika Kuoppala
