All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] [PATCH 0/7] arb robustness enablers
@ 2013-02-04 14:04 Mika Kuoppala
  2013-02-04 14:04 ` [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check Mika Kuoppala
                   ` (7 more replies)
  0 siblings, 8 replies; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-04 14:04 UTC (permalink / raw)
  To: intel-gfx

Hi,

This patchset adds ioctl and related changes to allow
userspace to query about the context loss status of the
specified context. The aim is to provide enabler
for the ARB_Robustness GL extension.

I post this as RFC to get ack/nack on the interface.
The interface is in 0004-drm-i915-add-i915_get_reset_status_ioctl.patch

Also any feedback on the other parts of the patchset is most
welcomed.

The i-g-t testcase can be found here:
https://github.com/mkuoppal/intel-gpu-tools/commits/arb-robustness

Thanks,
--Mika

Mika Kuoppala (7):
  drm/i915: detect infinite loops in hang check
  drm/i915: add struct i915_reset_stats
  drm/i915: add reset_status for hw_contexts
  drm/i915: add i915_get_reset_status_ioctl
  drm/i915: add batch object and context to i915_add_request()
  drm/i915: reference count for i915_hw_contexts
  drm/i915: find guilty batch buffer on ring resets

 drivers/gpu/drm/i915/i915_dma.c            |    3 +-
 drivers/gpu/drm/i915/i915_drv.c            |   42 +++++++++
 drivers/gpu/drm/i915/i915_drv.h            |   41 +++++++--
 drivers/gpu/drm/i915/i915_gem.c            |  119 ++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_gem_context.c    |   66 +++++++++++++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   11 ++-
 drivers/gpu/drm/i915/i915_irq.c            |  127 ++++++++++++++--------------
 drivers/gpu/drm/i915/intel_overlay.c       |    5 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h    |    4 +
 include/uapi/drm/i915_drm.h                |   19 +++++
 11 files changed, 351 insertions(+), 88 deletions(-)

-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check
  2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
@ 2013-02-04 14:04 ` Mika Kuoppala
  2013-02-15  6:14   ` Ben Widawsky
  2013-02-04 14:04 ` [RFC] [PATCH 2/7] drm/i915: add struct i915_reset_stats Mika Kuoppala
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-04 14:04 UTC (permalink / raw)
  To: intel-gfx

If there was a batch chaining loop or infinite loop in the batchbuffer,
we didn't detect it as acthd and instdone kept changing in those cases
and hang was never declared.

To detect ring hangs, including infinite loops, keep track of ring
seqno progression.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    3 -
 drivers/gpu/drm/i915/i915_irq.c         |  127 +++++++++++++++----------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    4 +
 3 files changed, 67 insertions(+), 67 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 984523d..4daab74 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -791,9 +791,6 @@ struct i915_gpu_error {
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
 	struct timer_list hangcheck_timer;
-	int hangcheck_count;
-	uint32_t last_acthd[I915_NUM_RINGS];
-	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5648d84..d0a9e21 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -356,7 +356,7 @@ static void notify_ring(struct drm_device *dev,
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->gpu_error.hangcheck_count = 0;
+		ring->hangcheck_count = 0;
 		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
 			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 	}
@@ -1727,11 +1727,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
 			  struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, bool *err)
+static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
+				     u32 ring_seqno, bool *err)
 {
 	if (list_empty(&ring->request_list) ||
-	    i915_seqno_passed(ring->get_seqno(ring, false),
-			      ring_last_seqno(ring))) {
+	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
 		/* Issue a wake-up to catch stuck h/w. */
 		if (waitqueue_active(&ring->irq_queue)) {
 			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
@@ -1749,39 +1749,32 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 tmp = I915_READ_CTL(ring);
+
+	ring->hangcheck_waiting = false;
+
 	if (tmp & RING_WAIT) {
 		DRM_ERROR("Kicking stuck wait on %s\n",
 			  ring->name);
 		I915_WRITE_CTL(ring, tmp);
-		return true;
+		ring->hangcheck_waiting = true;
 	}
-	return false;
-}
-
-static bool i915_hangcheck_hung(struct drm_device *dev)
-{
-	drm_i915_private_t *dev_priv = dev->dev_private;
 
-	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
-		bool hung = true;
+	if ((INTEL_INFO(dev)->gen >= 6) && (tmp & RING_WAIT_SEMAPHORE))
+		ring->hangcheck_waiting = true;
 
-		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-		i915_handle_error(dev, true);
-
-		if (!IS_GEN2(dev)) {
-			struct intel_ring_buffer *ring;
-			int i;
-
-			/* Is the chip hanging on a WAIT_FOR_EVENT?
-			 * If so we can simply poke the RB_WAIT bit
-			 * and break the hang. This should work on
-			 * all but the second generation chipsets.
-			 */
-			for_each_ring(ring, dev_priv, i)
-				hung &= !kick_ring(ring);
-		}
+	return ring->hangcheck_waiting;
+}
 
-		return hung;
+static bool i915_hangcheck_ring_hung(struct drm_device *dev,
+				     struct intel_ring_buffer *ring)
+{
+	if (!IS_GEN2(dev)) {
+		/* Is the chip hanging on a WAIT_FOR_EVENT?
+		 * If so we can simply poke the RB_WAIT bit
+		 * and break the hang. This should work on
+		 * all but the second generation chipsets.
+		 */
+		return !kick_ring(ring);
 	}
 
 	return false;
@@ -1789,62 +1782,68 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
 
 /**
  * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We record current seqno for each count and
+ * in subsequent calls we check if requests have been processed by each ring.
+ * If there is no progress on specific ring, we declare it as hung.
  */
 void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
 	struct intel_ring_buffer *ring;
-	bool err = false, idle;
+	bool ring_hung;
 	int i;
+	int busy_count = 0;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	memset(acthd, 0, sizeof(acthd));
-	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-	    idle &= i915_hangcheck_ring_idle(ring, &err);
-	    acthd[i] = intel_ring_get_active_head(ring);
-	}
+		bool err = false, idle;
+		u32 seqno;
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
-	if (idle) {
-		if (err) {
-			if (i915_hangcheck_hung(dev))
-				return;
+		seqno = ring->get_seqno(ring, false);
+		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
+
+		if (idle) {
+			if (err)
+				ring->hangcheck_count++;
+			else
+				ring->hangcheck_count = 0;
+		} else {
+			busy_count++;
 
-			goto repeat;
+			if (ring->hangcheck_seqno == seqno) {
+				ring->hangcheck_count++;
+
+				/* If the ring is not waiting, raise the
+				 * hung score */
+				if (i915_hangcheck_ring_hung(dev, ring))
+					ring->hangcheck_count++;
+			} else {
+				ring->hangcheck_count = 0;
+			}
 		}
 
-		dev_priv->gpu_error.hangcheck_count = 0;
-		return;
+		ring->hangcheck_seqno = seqno;
 	}
 
-	i915_get_extra_instdone(dev, instdone);
-	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
-		   sizeof(acthd)) == 0 &&
-	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
-		   sizeof(instdone)) == 0) {
-		if (i915_hangcheck_hung(dev))
-			return;
-	} else {
-		dev_priv->gpu_error.hangcheck_count = 0;
-
-		memcpy(dev_priv->gpu_error.last_acthd, acthd,
-		       sizeof(acthd));
-		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
-		       sizeof(instdone));
+	ring_hung = false;
+	for_each_ring(ring, dev_priv, i) {
+		if (ring->hangcheck_count > 2) {
+			ring_hung = true;
+			DRM_ERROR("%s seems hung\n", ring->name);
+		}
 	}
 
-repeat:
+	if (ring_hung)
+		return i915_handle_error(dev, true);
+
 	/* Reset timer case chip hangs without another request being added */
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if  (busy_count)
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  round_jiffies_up(jiffies +
+					   DRM_I915_HANGCHECK_JIFFIES));
 }
 
 /* drm_dma.h hooks
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d66208c..7257252 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -137,6 +137,10 @@ struct  intel_ring_buffer {
 	struct i915_hw_context *default_context;
 	struct drm_i915_gem_object *last_context_obj;
 
+	int hangcheck_count;
+	u32 hangcheck_seqno;
+	bool hangcheck_waiting;
+
 	void *private;
 };
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC] [PATCH 2/7] drm/i915: add struct i915_reset_stats
  2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
  2013-02-04 14:04 ` [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check Mika Kuoppala
@ 2013-02-04 14:04 ` Mika Kuoppala
  2013-02-15  6:21   ` Ben Widawsky
  2013-02-04 14:04 ` [RFC] [PATCH 3/7] drm/i915: add reset_status for hw_contexts Mika Kuoppala
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-04 14:04 UTC (permalink / raw)
  To: intel-gfx

To count context losses, add struct i915_reset stats for
both i915_hw_context and drm_i915_file_private.
drm_i915_file_private is used when there is no context.

Use kzalloc when allocating drm_i915_file_private to
initialize stats

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c |    2 +-
 drivers/gpu/drm/i915/i915_drv.h |    8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index cf06103..82732ee 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1773,7 +1773,7 @@ int i915_driver_open(struct drm_device *dev, struct drm_file *file)
 	struct drm_i915_file_private *file_priv;
 
 	DRM_DEBUG_DRIVER("\n");
-	file_priv = kmalloc(sizeof(*file_priv), GFP_KERNEL);
+	file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
 	if (!file_priv)
 		return -ENOMEM;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4daab74..b87240f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -429,6 +429,11 @@ struct i915_hw_ppgtt {
 	void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
 };
 
+struct i915_reset_stats {
+	u32 total;
+	u32 innocent;
+	u32 guilty;
+};
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
 #define DEFAULT_CONTEXT_ID 0
@@ -438,6 +443,7 @@ struct i915_hw_context {
 	struct drm_i915_file_private *file_priv;
 	struct intel_ring_buffer *ring;
 	struct drm_i915_gem_object *obj;
+	struct i915_reset_stats reset_stats;
 };
 
 enum no_fbc_reason {
@@ -1251,6 +1257,8 @@ struct drm_i915_file_private {
 		struct list_head request_list;
 	} mm;
 	struct idr context_idr;
+
+	struct i915_reset_stats reset_stats;
 };
 
 #define INTEL_INFO(dev)	(((struct drm_i915_private *) (dev)->dev_private)->info)
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC] [PATCH 3/7] drm/i915: add reset_status for hw_contexts
  2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
  2013-02-04 14:04 ` [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check Mika Kuoppala
  2013-02-04 14:04 ` [RFC] [PATCH 2/7] drm/i915: add struct i915_reset_stats Mika Kuoppala
@ 2013-02-04 14:04 ` Mika Kuoppala
  2013-02-04 14:04 ` [RFC] [PATCH 4/7] drm/i915: add i915_get_reset_status_ioctl Mika Kuoppala
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-04 14:04 UTC (permalink / raw)
  To: intel-gfx

For arb-robustness, every context needs to have it's own
reset status. Default context will be handled in a identical
way as the no-context case in further down in the patch set.
For no-context case, the reset status will be stored in
the file_priv part.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
 drivers/gpu/drm/i915/i915_gem_context.c |   33 +++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b87240f..42fcfb6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1674,6 +1674,11 @@ void i915_gem_context_fini(struct drm_device *dev);
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
 int i915_switch_context(struct intel_ring_buffer *ring,
 			struct drm_file *file, int to_id);
+void i915_gem_context_free(struct kref *ctx_ref);
+int i915_gem_context_get_reset_stats(struct intel_ring_buffer *ring,
+				     struct drm_file *file,
+				     u32 id,
+				     struct i915_reset_stats **rs);
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file);
 int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index a3f06bc..8e0d50c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -299,6 +299,39 @@ static int context_idr_cleanup(int id, void *p, void *data)
 	return 0;
 }
 
+int i915_gem_context_get_reset_stats(struct intel_ring_buffer *ring,
+				     struct drm_file *file,
+				     u32 id,
+				     struct i915_reset_stats **rs)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct i915_hw_context *to;
+
+	if (dev_priv->hw_contexts_disabled)
+		return -ENOENT;
+
+	if (ring->id != RCS)
+		return -EINVAL;
+
+	/* No reset status for default context id, it is in file_priv */
+	if (id == DEFAULT_CONTEXT_ID)
+		return -EINVAL;
+
+	if (rs == NULL)
+		return -EINVAL;
+
+	if (file == NULL)
+		return -EINVAL;
+
+	to = i915_gem_context_get(file->driver_priv, id);
+	if (to == NULL)
+		return -ENOENT;
+
+	*rs = &to->reset_stats;
+
+	return 0;
+}
+
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file)
 {
 	struct drm_i915_file_private *file_priv = file->driver_priv;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC] [PATCH 4/7] drm/i915: add i915_get_reset_status_ioctl
  2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
                   ` (2 preceding siblings ...)
  2013-02-04 14:04 ` [RFC] [PATCH 3/7] drm/i915: add reset_status for hw_contexts Mika Kuoppala
@ 2013-02-04 14:04 ` Mika Kuoppala
  2013-02-05 10:47   ` [PATCH 4/7] drm/i915: add i915_get_reset_stats_ioctl Mika Kuoppala
  2013-02-04 14:04 ` [RFC] [PATCH 5/7] drm/i915: add batch object and context to i915_add_request() Mika Kuoppala
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-04 14:04 UTC (permalink / raw)
  To: intel-gfx

This ioctl returns reset status for specified context.

Returned status is context loss state with regards to
previous call to this ioctl. It can one off:

no_error : no context lost detected

guilty   : context was lost and this context was one
           submitting a batch which hung the gpu

innocent : context was lost but this context fell victim
           to some other context which submitted batch which
           hung the gpu

v2: get rid of state tracking completely and deliver only counts. Idea
    from Chris Wilson.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c |    1 +
 drivers/gpu/drm/i915/i915_drv.c |   42 +++++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h |    6 ++++++
 include/uapi/drm/i915_drm.h     |   19 ++++++++++++++++++
 4 files changed, 68 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 82732ee..00b6765 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1882,6 +1882,7 @@ struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_CREATE, i915_gem_context_create_ioctl, DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_DESTROY, i915_gem_context_destroy_ioctl, DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(I915_REG_READ, i915_reg_read_ioctl, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(I915_GET_RESET_STATS, i915_get_reset_stats_ioctl, DRM_UNLOCKED),
 };
 
 int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls);
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index d159d7a..67b023a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -823,6 +823,9 @@ int i915_reset(struct drm_device *dev)
 
 	i915_gem_reset(dev);
 
+	/* Count unsuccessful ones */
+	dev_priv->reset_count++;
+
 	ret = -ENODEV;
 	if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
 		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
@@ -1228,3 +1231,42 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 
 	return 0;
 }
+
+int i915_get_reset_stats_ioctl(struct drm_device *dev,
+			       void *data, struct drm_file *file)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_ring_buffer *ring;
+	struct drm_i915_file_private *file_priv = file->driver_priv;
+	struct drm_i915_reset_stats *args = data;
+	struct i915_reset_stats *rs = NULL;
+	int ret;
+
+	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	if (ret)
+		return ret;
+
+	if (args->ctx_id == 0) {
+		rs = &file_priv->reset_stats;
+		ret = 0;
+		goto out;
+	}
+
+	ring = &dev_priv->ring[RCS];
+
+	ret = i915_gem_context_get_reset_stats(ring,
+					       file,
+					       args->ctx_id,
+					       &rs);
+out:
+	if (rs && ret == 0) {
+		args->global_lost = dev_priv->reset_count;
+		args->total_lost = rs->total;
+		args->innocent_lost = rs->innocent;
+		args->guilty_lost = rs->guilty;
+	}
+
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret ? -EINVAL : 0;
+}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 42fcfb6..f43a482 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1040,6 +1040,10 @@ typedef struct drm_i915_private {
 	/* Old dri1 support infrastructure, beware the dragons ya fools entering
 	 * here! */
 	struct i915_dri1_state dri1;
+
+	/* get_reset_stats ioctl */
+	u32 reset_count;
+
 } drm_i915_private_t;
 
 /* Iterate over initialised rings */
@@ -1832,6 +1836,8 @@ extern int intel_enable_rc6(const struct drm_device *dev);
 extern bool i915_semaphore_is_enabled(struct drm_device *dev);
 int i915_reg_read_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file);
+int i915_get_reset_stats_ioctl(struct drm_device *dev, void *data,
+			       struct drm_file *file);
 
 /* overlay */
 #ifdef CONFIG_DEBUG_FS
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 07d5941..8f4f5e2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -198,6 +198,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_GEM_SET_CACHING	0x2f
 #define DRM_I915_GEM_GET_CACHING	0x30
 #define DRM_I915_REG_READ		0x31
+#define DRM_I915_GET_RESET_STATS	0x32
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -247,6 +248,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create)
 #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy)
 #define DRM_IOCTL_I915_REG_READ			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
+#define DRM_IOCTL_I915_GET_RESET_STATS		DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -980,4 +982,21 @@ struct drm_i915_reg_read {
 	__u64 offset;
 	__u64 val; /* Return value */
 };
+
+struct drm_i915_reset_stats {
+	__u32 ctx_id;
+	__u32 flags;
+
+	/* For all contexts */
+	__u32 global_lost;
+
+	/* For this context */
+	__u32 total_lost;
+	__u32 innocent_lost;
+	__u32 guilty_lost;
+
+	/* unknown_lost ==
+	 * total - (innocent + guilty) */
+};
+
 #endif /* _UAPI_I915_DRM_H_ */
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC] [PATCH 5/7] drm/i915: add batch object and context to i915_add_request()
  2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
                   ` (3 preceding siblings ...)
  2013-02-04 14:04 ` [RFC] [PATCH 4/7] drm/i915: add i915_get_reset_status_ioctl Mika Kuoppala
@ 2013-02-04 14:04 ` Mika Kuoppala
  2013-02-04 14:04 ` [RFC] [PATCH 6/7] drm/i915: reference count for i915_hw_contexts Mika Kuoppala
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-04 14:04 UTC (permalink / raw)
  To: intel-gfx

In order to track down a batch buffer and context which
caused the ring to hang, store reference to bo and context
into the request struct. Request can also cause gpu to hang
after the batch in the flush section in the ring. To detect this
add start of the flush portion offset into the request.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |   15 +++++++++++++--
 drivers/gpu/drm/i915/i915_gem.c            |   14 ++++++++++----
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |    9 ++++++---
 drivers/gpu/drm/i915/intel_overlay.c       |    5 +++--
 drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +-
 5 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f43a482..b0df678 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1241,9 +1241,18 @@ struct drm_i915_gem_request {
 	/** GEM sequence number associated with this request. */
 	uint32_t seqno;
 
-	/** Postion in the ringbuffer of the end of the request */
+	/** Position in the ringbuffer of the start of the request */
+	u32 head;
+
+	/** Position in the ringbuffer of the end of the request */
 	u32 tail;
 
+	/** Batch buffer related to this request if any */
+	struct drm_i915_gem_object *batch_obj;
+
+	/** Context related to this request */
+	struct i915_hw_context *ctx;
+
 	/** Time at which this request was emitted, in jiffies. */
 	unsigned long emitted_jiffies;
 
@@ -1635,7 +1644,9 @@ int __must_check i915_gpu_idle(struct drm_device *dev);
 int __must_check i915_gem_idle(struct drm_device *dev);
 int i915_add_request(struct intel_ring_buffer *ring,
 		     struct drm_file *file,
-		     u32 *seqno);
+		     u32 *seqno,
+		     struct drm_i915_gem_object *batch_obj,
+		     struct i915_hw_context *ctx);
 int __must_check i915_wait_seqno(struct intel_ring_buffer *ring,
 				 uint32_t seqno);
 int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d746177..9aad0aa 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -967,7 +967,7 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
 
 	ret = 0;
 	if (seqno == ring->outstanding_lazy_request)
-		ret = i915_add_request(ring, NULL, NULL);
+		ret = i915_add_request(ring, NULL, NULL, NULL, NULL);
 
 	return ret;
 }
@@ -1990,14 +1990,17 @@ i915_gem_get_seqno(struct drm_device *dev, u32 *seqno)
 int
 i915_add_request(struct intel_ring_buffer *ring,
 		 struct drm_file *file,
-		 u32 *out_seqno)
+		 u32 *out_seqno,
+		 struct drm_i915_gem_object *obj,
+		 struct i915_hw_context *ctx)
 {
 	drm_i915_private_t *dev_priv = ring->dev->dev_private;
 	struct drm_i915_gem_request *request;
-	u32 request_ring_position;
+	u32 request_ring_position, request_start;
 	int was_empty;
 	int ret;
 
+	request_start = intel_ring_get_tail(ring);
 	/*
 	 * Emit any outstanding flushes - execbuf can fail to emit the flush
 	 * after having emitted the batchbuffer command. Hence we need to fix
@@ -2029,7 +2032,10 @@ i915_add_request(struct intel_ring_buffer *ring,
 
 	request->seqno = intel_ring_get_seqno(ring);
 	request->ring = ring;
+	request->head = request_start;
 	request->tail = request_ring_position;
+	request->batch_obj = obj;
+	request->ctx = ctx;
 	request->emitted_jiffies = jiffies;
 	was_empty = list_empty(&ring->request_list);
 	list_add_tail(&request->list, &ring->request_list);
@@ -2255,7 +2261,7 @@ i915_gem_retire_work_handler(struct work_struct *work)
 	idle = true;
 	for_each_ring(ring, dev_priv, i) {
 		if (ring->gpu_caches_dirty)
-			i915_add_request(ring, NULL, NULL);
+			i915_add_request(ring, NULL, NULL, NULL, NULL);
 
 		idle &= list_empty(&ring->request_list);
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 2726910..312683c 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -768,13 +768,15 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects,
 static void
 i915_gem_execbuffer_retire_commands(struct drm_device *dev,
 				    struct drm_file *file,
-				    struct intel_ring_buffer *ring)
+				    struct intel_ring_buffer *ring,
+				    struct drm_i915_gem_object *obj,
+				    struct i915_hw_context *ctx)
 {
 	/* Unconditionally force add_request to emit a full flush. */
 	ring->gpu_caches_dirty = true;
 
 	/* Add a breadcrumb for the completion of the batch buffer */
-	(void)i915_add_request(ring, file, NULL);
+	(void)i915_add_request(ring, file, NULL, obj, ctx);
 }
 
 static int
@@ -812,6 +814,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct eb_objects *eb;
 	struct drm_i915_gem_object *batch_obj;
 	struct drm_clip_rect *cliprects = NULL;
+	struct i915_hw_context *ctx = NULL;
 	struct intel_ring_buffer *ring;
 	u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 exec_start, exec_len;
@@ -1047,7 +1050,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags);
 
 	i915_gem_execbuffer_move_to_active(&eb->objects, ring);
-	i915_gem_execbuffer_retire_commands(dev, file, ring);
+	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj, ctx);
 
 err:
 	eb_destroy(eb);
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index ba978bf..a6d10a7 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -217,7 +217,7 @@ static int intel_overlay_do_wait_request(struct intel_overlay *overlay,
 	int ret;
 
 	BUG_ON(overlay->last_flip_req);
-	ret = i915_add_request(ring, NULL, &overlay->last_flip_req);
+	ret = i915_add_request(ring, NULL, &overlay->last_flip_req, NULL, NULL);
 	if (ret)
 		return ret;
 
@@ -286,7 +286,8 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 	intel_ring_emit(ring, flip_addr);
 	intel_ring_advance(ring);
 
-	return i915_add_request(ring, NULL, &overlay->last_flip_req);
+	return i915_add_request(ring, NULL, &overlay->last_flip_req,
+				NULL, NULL);
 }
 
 static void intel_overlay_release_old_vid_tail(struct intel_overlay *overlay)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index dc6ae2f..dab6117 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1407,7 +1407,7 @@ int intel_ring_idle(struct intel_ring_buffer *ring)
 
 	/* We need to add any requests required to flush the objects and ring */
 	if (ring->outstanding_lazy_request) {
-		ret = i915_add_request(ring, NULL, NULL);
+		ret = i915_add_request(ring, NULL, NULL, NULL, NULL);
 		if (ret)
 			return ret;
 	}
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC] [PATCH 6/7] drm/i915: reference count for i915_hw_contexts
  2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
                   ` (4 preceding siblings ...)
  2013-02-04 14:04 ` [RFC] [PATCH 5/7] drm/i915: add batch object and context to i915_add_request() Mika Kuoppala
@ 2013-02-04 14:04 ` Mika Kuoppala
  2013-02-15  5:55   ` Ben Widawsky
  2013-02-04 14:04 ` [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets Mika Kuoppala
  2013-02-08 10:35 ` [RFC] [PATCH 0/7] arb robustness enablers Daniel Vetter
  7 siblings, 1 reply; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-04 14:04 UTC (permalink / raw)
  To: intel-gfx

When using i915_hw_context pointer in request struct,
we need to use reference counts as context might be
released before request processing takes place.

v2: track i915_hw_context pointers instead of using ctx_ids
    (from Chris Wilson)

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |    4 +++-
 drivers/gpu/drm/i915/i915_gem.c            |   14 ++++++++++-
 drivers/gpu/drm/i915/i915_gem_context.c    |   35 +++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |    2 +-
 4 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b0df678..e2f8ea6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -438,6 +438,7 @@ struct i915_reset_stats {
 /* This must match up with the value previously used for execbuf2.rsvd1. */
 #define DEFAULT_CONTEXT_ID 0
 struct i915_hw_context {
+	struct kref ref;
 	int id;
 	bool is_initialized;
 	struct drm_i915_file_private *file_priv;
@@ -1688,7 +1689,8 @@ void i915_gem_context_init(struct drm_device *dev);
 void i915_gem_context_fini(struct drm_device *dev);
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
 int i915_switch_context(struct intel_ring_buffer *ring,
-			struct drm_file *file, int to_id);
+			struct drm_file *file, int to_id,
+			struct i915_hw_context **ctx_out);
 void i915_gem_context_free(struct kref *ctx_ref);
 int i915_gem_context_get_reset_stats(struct intel_ring_buffer *ring,
 				     struct drm_file *file,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 9aad0aa..b304b06 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2036,6 +2036,10 @@ i915_add_request(struct intel_ring_buffer *ring,
 	request->tail = request_ring_position;
 	request->batch_obj = obj;
 	request->ctx = ctx;
+
+	if (request->ctx)
+		kref_get(&request->ctx->ref);
+
 	request->emitted_jiffies = jiffies;
 	was_empty = list_empty(&ring->request_list);
 	list_add_tail(&request->list, &ring->request_list);
@@ -2100,6 +2104,10 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 
 		list_del(&request->list);
 		i915_gem_request_remove_from_client(request);
+
+		if (request->ctx)
+			kref_put(&request->ctx->ref, i915_gem_context_free);
+
 		kfree(request);
 	}
 
@@ -2194,6 +2202,10 @@ i915_gem_retire_requests_ring(struct intel_ring_buffer *ring)
 
 		list_del(&request->list);
 		i915_gem_request_remove_from_client(request);
+
+		if (request->ctx)
+			kref_put(&request->ctx->ref, i915_gem_context_free);
+
 		kfree(request);
 	}
 
@@ -2516,7 +2528,7 @@ int i915_gpu_idle(struct drm_device *dev)
 
 	/* Flush everything onto the inactive list. */
 	for_each_ring(ring, dev_priv, i) {
-		ret = i915_switch_context(ring, NULL, DEFAULT_CONTEXT_ID);
+		ret = i915_switch_context(ring, NULL, DEFAULT_CONTEXT_ID, NULL);
 		if (ret)
 			return ret;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 8e0d50c..7e1fdb1 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -124,7 +124,14 @@ static int get_context_size(struct drm_device *dev)
 	return ret;
 }
 
-static void do_destroy(struct i915_hw_context *ctx)
+void i915_gem_context_free(struct kref *ctx_ref)
+{
+	struct i915_hw_context *ctx = container_of(ctx_ref,
+						   typeof(*ctx), ref);
+	kfree(ctx);
+}
+
+static void do_release(struct i915_hw_context *ctx)
 {
 	struct drm_device *dev = ctx->obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -135,6 +142,11 @@ static void do_destroy(struct i915_hw_context *ctx)
 		BUG_ON(ctx != dev_priv->ring[RCS].default_context);
 
 	drm_gem_object_unreference(&ctx->obj->base);
+}
+
+static void do_destroy(struct i915_hw_context *ctx)
+{
+	do_release(ctx);
 	kfree(ctx);
 }
 
@@ -150,6 +162,7 @@ create_hw_context(struct drm_device *dev,
 	if (ctx == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	kref_init(&ctx->ref);
 	ctx->obj = i915_gem_alloc_object(dev, dev_priv->hw_context_size);
 	if (ctx->obj == NULL) {
 		kfree(ctx);
@@ -294,8 +307,8 @@ static int context_idr_cleanup(int id, void *p, void *data)
 
 	BUG_ON(id == DEFAULT_CONTEXT_ID);
 
-	do_destroy(ctx);
-
+	do_release(ctx);
+	kref_put(&ctx->ref, i915_gem_context_free);
 	return 0;
 }
 
@@ -481,10 +494,12 @@ static int do_switch(struct i915_hw_context *to)
  */
 int i915_switch_context(struct intel_ring_buffer *ring,
 			struct drm_file *file,
-			int to_id)
+			int to_id,
+			struct i915_hw_context **ctx_out)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	struct i915_hw_context *to;
+	int ret;
 
 	if (dev_priv->hw_contexts_disabled)
 		return 0;
@@ -503,7 +518,14 @@ int i915_switch_context(struct intel_ring_buffer *ring,
 			return -ENOENT;
 	}
 
-	return do_switch(to);
+	ret = do_switch(to);
+	if (ret)
+		return ret;
+
+	if (ctx_out)
+		*ctx_out = to;
+
+	return 0;
 }
 
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
@@ -557,7 +579,8 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
 		return -ENOENT;
 	}
 
-	do_destroy(ctx);
+	do_release(ctx);
+	kref_put(&ctx->ref, i915_gem_context_free);
 
 	mutex_unlock(&dev->struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 312683c..3b76317 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -999,7 +999,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		goto err;
 
-	ret = i915_switch_context(ring, file, ctx_id);
+	ret = i915_switch_context(ring, file, ctx_id, &ctx);
 	if (ret)
 		goto err;
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets
  2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
                   ` (5 preceding siblings ...)
  2013-02-04 14:04 ` [RFC] [PATCH 6/7] drm/i915: reference count for i915_hw_contexts Mika Kuoppala
@ 2013-02-04 14:04 ` Mika Kuoppala
  2013-02-07 14:11   ` Ville Syrjälä
  2013-02-08 10:35 ` [RFC] [PATCH 0/7] arb robustness enablers Daniel Vetter
  7 siblings, 1 reply; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-04 14:04 UTC (permalink / raw)
  To: intel-gfx

After hang check timer has declared gpu to be hang,
rings are reset. In ring reset, when clearing
request list, do post mortem analysis to find out
the guilty batch buffer.

Select requests for further analysis by inspecting
the completed sequence number which has been updated
into the HWS page. If request was completed, it can't
be related to the hang.

For completed requests mark the batch as guilty
if the ring was not waiting and the ring head was
stuck inside the buffer object or in the flush region
right after the batch. For everything else, mark
them as innocents.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c |   91 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b304b06..db0f3e3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2092,9 +2092,97 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
+static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj)
+{
+	if (acthd >= obj->gtt_offset &&
+	    acthd < obj->gtt_offset + obj->base.size)
+		return true;
+
+	return false;
+}
+
+static bool i915_head_inside_request(u32 acthd, u32 rs, u32 re)
+{
+	if (rs < re) {
+		if (acthd >= rs && acthd < re)
+			return true;
+	} else if (rs > re) {
+		if (acthd >= rs || acthd < re)
+			return true;
+	}
+
+	return false;
+}
+
+static bool i915_request_guilty(struct drm_i915_gem_request *request,
+				const u32 acthd, bool *inside)
+{
+	if (request->batch_obj) {
+		if (i915_head_inside_object(acthd, request->batch_obj)) {
+			*inside = true;
+			return true;
+		}
+	}
+
+	if (i915_head_inside_request(acthd, request->head, request->tail)) {
+		*inside = false;
+		return true;
+	}
+
+	return false;
+}
+
+static void i915_set_reset_status(struct intel_ring_buffer *ring,
+				  struct drm_i915_gem_request *request,
+				  u32 acthd)
+{
+	bool inside;
+	struct i915_reset_stats *rs = NULL;
+	bool guilty;
+
+	/* Innocent until proven guilty */
+	guilty = false;
+
+	if (!ring->hangcheck_waiting &&
+	    i915_request_guilty(request, acthd, &inside)) {
+		DRM_ERROR("%s hung %s bo (0x%x ctx %d) at 0x%x\n",
+			  ring->name,
+			  inside ? "inside" : "flushing",
+			  request->batch_obj ?
+			  request->batch_obj->gtt_offset : 0,
+			  request->ctx ? request->ctx->id : 0,
+			  acthd);
+
+		guilty = true;
+	}
+
+	/* If contexts are disabled or this is the default context, use
+	 * file_priv->reset_stats
+	 */
+	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
+		rs = &request->ctx->reset_stats;
+	else if (request->file_priv)
+		rs = &request->file_priv->reset_stats;
+
+	if (rs) {
+		rs->total++;
+
+		if (guilty)
+			rs->guilty++;
+		else
+			rs->innocent++;
+	}
+}
+
 static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 				      struct intel_ring_buffer *ring)
 {
+	u32 completed_seqno;
+	u32 acthd;
+
+	acthd = intel_ring_get_active_head(ring);
+	completed_seqno = ring->get_seqno(ring, false);
+
 	while (!list_empty(&ring->request_list)) {
 		struct drm_i915_gem_request *request;
 
@@ -2102,6 +2190,9 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 					   struct drm_i915_gem_request,
 					   list);
 
+		if (request->seqno > completed_seqno)
+			i915_set_reset_status(ring, request, acthd);
+
 		list_del(&request->list);
 		i915_gem_request_remove_from_client(request);
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/7] drm/i915: add i915_get_reset_stats_ioctl
  2013-02-04 14:04 ` [RFC] [PATCH 4/7] drm/i915: add i915_get_reset_status_ioctl Mika Kuoppala
@ 2013-02-05 10:47   ` Mika Kuoppala
  0 siblings, 0 replies; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-05 10:47 UTC (permalink / raw)
  To: intel-gfx

This ioctl returns reset stats for specified context.

The struct returned contains context loss counters.
These are:

global_lost:   all resets across all contexts
total_lost:    all resets for this context
innocent_lost: contexts lost, guilty was some other context
guilty_lost:   contexts lost, guilty was this context

v2: get rid of state tracking completely and deliver only counts. Idea
    from Chris Wilson.

v3: fix commit message

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c |    1 +
 drivers/gpu/drm/i915/i915_drv.c |   42 +++++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h |    6 ++++++
 include/uapi/drm/i915_drm.h     |   19 ++++++++++++++++++
 4 files changed, 68 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 82732ee..00b6765 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1882,6 +1882,7 @@ struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_CREATE, i915_gem_context_create_ioctl, DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_DESTROY, i915_gem_context_destroy_ioctl, DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(I915_REG_READ, i915_reg_read_ioctl, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(I915_GET_RESET_STATS, i915_get_reset_stats_ioctl, DRM_UNLOCKED),
 };
 
 int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls);
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index d159d7a..67b023a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -823,6 +823,9 @@ int i915_reset(struct drm_device *dev)
 
 	i915_gem_reset(dev);
 
+	/* Count unsuccessful ones */
+	dev_priv->reset_count++;
+
 	ret = -ENODEV;
 	if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
 		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
@@ -1228,3 +1231,42 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 
 	return 0;
 }
+
+int i915_get_reset_stats_ioctl(struct drm_device *dev,
+			       void *data, struct drm_file *file)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_ring_buffer *ring;
+	struct drm_i915_file_private *file_priv = file->driver_priv;
+	struct drm_i915_reset_stats *args = data;
+	struct i915_reset_stats *rs = NULL;
+	int ret;
+
+	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	if (ret)
+		return ret;
+
+	if (args->ctx_id == 0) {
+		rs = &file_priv->reset_stats;
+		ret = 0;
+		goto out;
+	}
+
+	ring = &dev_priv->ring[RCS];
+
+	ret = i915_gem_context_get_reset_stats(ring,
+					       file,
+					       args->ctx_id,
+					       &rs);
+out:
+	if (rs && ret == 0) {
+		args->global_lost = dev_priv->reset_count;
+		args->total_lost = rs->total;
+		args->innocent_lost = rs->innocent;
+		args->guilty_lost = rs->guilty;
+	}
+
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret ? -EINVAL : 0;
+}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 42fcfb6..f43a482 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1040,6 +1040,10 @@ typedef struct drm_i915_private {
 	/* Old dri1 support infrastructure, beware the dragons ya fools entering
 	 * here! */
 	struct i915_dri1_state dri1;
+
+	/* get_reset_stats ioctl */
+	u32 reset_count;
+
 } drm_i915_private_t;
 
 /* Iterate over initialised rings */
@@ -1832,6 +1836,8 @@ extern int intel_enable_rc6(const struct drm_device *dev);
 extern bool i915_semaphore_is_enabled(struct drm_device *dev);
 int i915_reg_read_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file);
+int i915_get_reset_stats_ioctl(struct drm_device *dev, void *data,
+			       struct drm_file *file);
 
 /* overlay */
 #ifdef CONFIG_DEBUG_FS
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 07d5941..8f4f5e2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -198,6 +198,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_GEM_SET_CACHING	0x2f
 #define DRM_I915_GEM_GET_CACHING	0x30
 #define DRM_I915_REG_READ		0x31
+#define DRM_I915_GET_RESET_STATS	0x32
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -247,6 +248,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create)
 #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy)
 #define DRM_IOCTL_I915_REG_READ			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
+#define DRM_IOCTL_I915_GET_RESET_STATS		DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -980,4 +982,21 @@ struct drm_i915_reg_read {
 	__u64 offset;
 	__u64 val; /* Return value */
 };
+
+struct drm_i915_reset_stats {
+	__u32 ctx_id;
+	__u32 flags;
+
+	/* For all contexts */
+	__u32 global_lost;
+
+	/* For this context */
+	__u32 total_lost;
+	__u32 innocent_lost;
+	__u32 guilty_lost;
+
+	/* unknown_lost ==
+	 * total - (innocent + guilty) */
+};
+
 #endif /* _UAPI_I915_DRM_H_ */
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets
  2013-02-04 14:04 ` [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets Mika Kuoppala
@ 2013-02-07 14:11   ` Ville Syrjälä
  2013-02-15 14:12     ` Mika Kuoppala
  0 siblings, 1 reply; 18+ messages in thread
From: Ville Syrjälä @ 2013-02-07 14:11 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Mon, Feb 04, 2013 at 04:04:43PM +0200, Mika Kuoppala wrote:
> After hang check timer has declared gpu to be hang,
> rings are reset. In ring reset, when clearing
> request list, do post mortem analysis to find out
> the guilty batch buffer.
> 
> Select requests for further analysis by inspecting
> the completed sequence number which has been updated
> into the HWS page. If request was completed, it can't
> be related to the hang.
> 
> For completed requests mark the batch as guilty
      ^^^^^^^^^

That's a typo, right?

> if the ring was not waiting and the ring head was
> stuck inside the buffer object or in the flush region
> right after the batch. For everything else, mark
> them as innocents.
> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c |   91 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 91 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b304b06..db0f3e3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2092,9 +2092,97 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>  
> +static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj)
> +{
> +	if (acthd >= obj->gtt_offset &&
> +	    acthd < obj->gtt_offset + obj->base.size)
> +		return true;
> +
> +	return false;
> +}
> +
> +static bool i915_head_inside_request(u32 acthd, u32 rs, u32 re)
> +{
> +	if (rs < re) {
> +		if (acthd >= rs && acthd < re)
> +			return true;
> +	} else if (rs > re) {
> +		if (acthd >= rs || acthd < re)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static bool i915_request_guilty(struct drm_i915_gem_request *request,
> +				const u32 acthd, bool *inside)
> +{
> +	if (request->batch_obj) {
> +		if (i915_head_inside_object(acthd, request->batch_obj)) {
> +			*inside = true;
> +			return true;
> +		}
> +	}
> +
> +	if (i915_head_inside_request(acthd, request->head, request->tail)) {
> +		*inside = false;
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static void i915_set_reset_status(struct intel_ring_buffer *ring,
> +				  struct drm_i915_gem_request *request,
> +				  u32 acthd)
> +{
> +	bool inside;
> +	struct i915_reset_stats *rs = NULL;
> +	bool guilty;
> +
> +	/* Innocent until proven guilty */
> +	guilty = false;
> +
> +	if (!ring->hangcheck_waiting &&
> +	    i915_request_guilty(request, acthd, &inside)) {
> +		DRM_ERROR("%s hung %s bo (0x%x ctx %d) at 0x%x\n",
> +			  ring->name,
> +			  inside ? "inside" : "flushing",
> +			  request->batch_obj ?
> +			  request->batch_obj->gtt_offset : 0,
> +			  request->ctx ? request->ctx->id : 0,
> +			  acthd);
> +
> +		guilty = true;
> +	}
> +
> +	/* If contexts are disabled or this is the default context, use
> +	 * file_priv->reset_stats
> +	 */
> +	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
> +		rs = &request->ctx->reset_stats;
> +	else if (request->file_priv)
> +		rs = &request->file_priv->reset_stats;
> +
> +	if (rs) {
> +		rs->total++;
> +
> +		if (guilty)
> +			rs->guilty++;
> +		else
> +			rs->innocent++;
> +	}
> +}
> +
>  static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
>  				      struct intel_ring_buffer *ring)
>  {
> +	u32 completed_seqno;
> +	u32 acthd;
> +
> +	acthd = intel_ring_get_active_head(ring);
> +	completed_seqno = ring->get_seqno(ring, false);
> +
>  	while (!list_empty(&ring->request_list)) {
>  		struct drm_i915_gem_request *request;
>  
> @@ -2102,6 +2190,9 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
>  					   struct drm_i915_gem_request,
>  					   list);
>  
> +		if (request->seqno > completed_seqno)

i915_seqno_passed()?

> +			i915_set_reset_status(ring, request, acthd);
> +
>  		list_del(&request->list);
>  		i915_gem_request_remove_from_client(request);
>  
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 0/7] arb robustness enablers
  2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
                   ` (6 preceding siblings ...)
  2013-02-04 14:04 ` [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets Mika Kuoppala
@ 2013-02-08 10:35 ` Daniel Vetter
  7 siblings, 0 replies; 18+ messages in thread
From: Daniel Vetter @ 2013-02-08 10:35 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Mon, Feb 04, 2013 at 04:04:36PM +0200, Mika Kuoppala wrote:
> Hi,
> 
> This patchset adds ioctl and related changes to allow
> userspace to query about the context loss status of the
> specified context. The aim is to provide enabler
> for the ARB_Robustness GL extension.
> 
> I post this as RFC to get ack/nack on the interface.
> The interface is in 0004-drm-i915-add-i915_get_reset_status_ioctl.patch
> 
> Also any feedback on the other parts of the patchset is most
> welcomed.
> 
> The i-g-t testcase can be found here:
> https://github.com/mkuoppal/intel-gpu-tools/commits/arb-robustness

Ok, I've only quickly scrolled through this but it looks rather sane, and
the interface seems to be a fit for arb robustness. So tentative ack on
the ioctl ;-) A few comments:

The first patch imo makes sense conceptually, even though it's not
required to implement the ioctl (only the test). This needs critical
review from Chris though since he's really picky about the hangcheck
misfiring or not detecting hangs. Might be that we also have some races
left which have been papered over by the acthd checks ... Best to split
this out for maximum scrunity.

For the remaining stuff, I see three pieces:
- refcounts for hw contexts: Imo this is rather independent and should go
  in early. If it doesn't apply without the other I think we should move
  it ahead in a rebase.
- Infrastructure to figure out whom to blame for a hang. One thing I'd
  really love to see on top of your patches is adjusting the "hanging too
  fast" detection to only disable command execution for the offending fd
  and not kill the entire gpu unnecessarily. This should help a lot
  already today, without any userspace support. I think it'd be also good
  to have an i-g-t test for this which tries to hang the gpu with the
  hangman on e.g.  the blit constantly until the gpu dies. And then open a
  new fd to check whether things still work.
- Putting that blame to the right hw contexts and exposing it through the
  ioctl. Usual interface rules applies: We need to have userspace support
  more or less ready before merging, i.e. a rough cut at the robustness
  extension patches for mesa. Probably best if you can do this in
  collaboration with one of the gl guys in Helsinki.

Cheers, Daniel

> 
> Thanks,
> --Mika
> 
> Mika Kuoppala (7):
>   drm/i915: detect infinite loops in hang check
>   drm/i915: add struct i915_reset_stats
>   drm/i915: add reset_status for hw_contexts
>   drm/i915: add i915_get_reset_status_ioctl
>   drm/i915: add batch object and context to i915_add_request()
>   drm/i915: reference count for i915_hw_contexts
>   drm/i915: find guilty batch buffer on ring resets
> 
>  drivers/gpu/drm/i915/i915_dma.c            |    3 +-
>  drivers/gpu/drm/i915/i915_drv.c            |   42 +++++++++
>  drivers/gpu/drm/i915/i915_drv.h            |   41 +++++++--
>  drivers/gpu/drm/i915/i915_gem.c            |  119 ++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/i915_gem_context.c    |   66 +++++++++++++--
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   11 ++-
>  drivers/gpu/drm/i915/i915_irq.c            |  127 ++++++++++++++--------------
>  drivers/gpu/drm/i915/intel_overlay.c       |    5 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |    4 +
>  include/uapi/drm/i915_drm.h                |   19 +++++
>  11 files changed, 351 insertions(+), 88 deletions(-)
> 
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 6/7] drm/i915: reference count for i915_hw_contexts
  2013-02-04 14:04 ` [RFC] [PATCH 6/7] drm/i915: reference count for i915_hw_contexts Mika Kuoppala
@ 2013-02-15  5:55   ` Ben Widawsky
  0 siblings, 0 replies; 18+ messages in thread
From: Ben Widawsky @ 2013-02-15  5:55 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Mon, Feb 04, 2013 at 04:04:42PM +0200, Mika Kuoppala wrote:
> When using i915_hw_context pointer in request struct,
> we need to use reference counts as context might be
> released before request processing takes place.
> 
> v2: track i915_hw_context pointers instead of using ctx_ids
>     (from Chris Wilson)
> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>

It's satisfying to me to see this given that Daniel killed my version of
this patch so long ago.

http://lists.freedesktop.org/archives/intel-gfx/2012-March/015519.html

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check
  2013-02-04 14:04 ` [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check Mika Kuoppala
@ 2013-02-15  6:14   ` Ben Widawsky
  2013-02-15  9:49     ` Daniel Vetter
  2013-02-15 14:48     ` Mika Kuoppala
  0 siblings, 2 replies; 18+ messages in thread
From: Ben Widawsky @ 2013-02-15  6:14 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

Mostly just me talking outloud...

On Mon, Feb 04, 2013 at 04:04:37PM +0200, Mika Kuoppala wrote:
> If there was a batch chaining loop or infinite loop in the batchbuffer,
> we didn't detect it as acthd and instdone kept changing in those cases
> and hang was never declared.

I don't think we can infinite loop on any platforms which exists today.
Can we? I agree chaining issues can occur, but it seems like it'd be
extremely rare. Furthermore if this did indeed occur, almost certainly
we'd fill up the ring and fail in some other way anyway.

> 
> To detect ring hangs, including infinite loops, keep track of ring
> seqno progression.

I'm having a lot of trouble reviewing this patch, which leads me to
believe it should probably be split up a bit more, or I'm stupid
(both?).

It sounds like a good idea but there are a lot of pitfalls involved.

> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |    3 -
>  drivers/gpu/drm/i915/i915_irq.c         |  127 +++++++++++++++----------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    4 +
>  3 files changed, 67 insertions(+), 67 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 984523d..4daab74 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -791,9 +791,6 @@ struct i915_gpu_error {
>  #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
>  #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
>  	struct timer_list hangcheck_timer;
> -	int hangcheck_count;
> -	uint32_t last_acthd[I915_NUM_RINGS];
> -	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
>  
>  	/* For reset and error_state handling. */
>  	spinlock_t lock;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 5648d84..d0a9e21 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -356,7 +356,7 @@ static void notify_ring(struct drm_device *dev,
>  
>  	wake_up_all(&ring->irq_queue);
>  	if (i915_enable_hangcheck) {
> -		dev_priv->gpu_error.hangcheck_count = 0;
> +		ring->hangcheck_count = 0;
>  		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>  			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
>  	}
> @@ -1727,11 +1727,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
>  			  struct drm_i915_gem_request, list)->seqno;
>  }
>  
> -static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, bool *err)
> +static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
> +				     u32 ring_seqno, bool *err)
>  {
>  	if (list_empty(&ring->request_list) ||
> -	    i915_seqno_passed(ring->get_seqno(ring, false),
> -			      ring_last_seqno(ring))) {
> +	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
>  		/* Issue a wake-up to catch stuck h/w. */
>  		if (waitqueue_active(&ring->irq_queue)) {
>  			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> @@ -1749,39 +1749,32 @@ static bool kick_ring(struct intel_ring_buffer *ring)
>  	struct drm_device *dev = ring->dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	u32 tmp = I915_READ_CTL(ring);
> +
> +	ring->hangcheck_waiting = false;
> +
>  	if (tmp & RING_WAIT) {
>  		DRM_ERROR("Kicking stuck wait on %s\n",
>  			  ring->name);
>  		I915_WRITE_CTL(ring, tmp);
> -		return true;
> +		ring->hangcheck_waiting = true;
>  	}
> -	return false;
> -}
> -
> -static bool i915_hangcheck_hung(struct drm_device *dev)
> -{
> -	drm_i915_private_t *dev_priv = dev->dev_private;
>  
> -	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
> -		bool hung = true;
> +	if ((INTEL_INFO(dev)->gen >= 6) && (tmp & RING_WAIT_SEMAPHORE))
> +		ring->hangcheck_waiting = true;
>  
> -		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
> -		i915_handle_error(dev, true);
> -
> -		if (!IS_GEN2(dev)) {
> -			struct intel_ring_buffer *ring;
> -			int i;
> -
> -			/* Is the chip hanging on a WAIT_FOR_EVENT?
> -			 * If so we can simply poke the RB_WAIT bit
> -			 * and break the hang. This should work on
> -			 * all but the second generation chipsets.
> -			 */
> -			for_each_ring(ring, dev_priv, i)
> -				hung &= !kick_ring(ring);
> -		}
> +	return ring->hangcheck_waiting;
> +}
>  
> -		return hung;
> +static bool i915_hangcheck_ring_hung(struct drm_device *dev,
> +				     struct intel_ring_buffer *ring)
> +{
> +	if (!IS_GEN2(dev)) {
> +		/* Is the chip hanging on a WAIT_FOR_EVENT?
> +		 * If so we can simply poke the RB_WAIT bit
> +		 * and break the hang. This should work on
> +		 * all but the second generation chipsets.
> +		 */
> +		return !kick_ring(ring);
>  	}
>  
>  	return false;
> @@ -1789,62 +1782,68 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
>  
>  /**
>   * This is called when the chip hasn't reported back with completed
> - * batchbuffers in a long time. The first time this is called we simply record
> - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
> - * again, we assume the chip is wedged and try to fix it.
> + * batchbuffers in a long time. We record current seqno for each count and
> + * in subsequent calls we check if requests have been processed by each ring.
> + * If there is no progress on specific ring, we declare it as hung.
>   */
>  void i915_hangcheck_elapsed(unsigned long data)
>  {
>  	struct drm_device *dev = (struct drm_device *)data;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
> -	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
>  	struct intel_ring_buffer *ring;
> -	bool err = false, idle;
> +	bool ring_hung;
>  	int i;
> +	int busy_count = 0;
>  
>  	if (!i915_enable_hangcheck)
>  		return;
>  
> -	memset(acthd, 0, sizeof(acthd));
> -	idle = true;
>  	for_each_ring(ring, dev_priv, i) {
> -	    idle &= i915_hangcheck_ring_idle(ring, &err);
> -	    acthd[i] = intel_ring_get_active_head(ring);
> -	}
> +		bool err = false, idle;
> +		u32 seqno;
>  
> -	/* If all work is done then ACTHD clearly hasn't advanced. */
> -	if (idle) {
> -		if (err) {
> -			if (i915_hangcheck_hung(dev))
> -				return;
> +		seqno = ring->get_seqno(ring, false);
> +		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
> +
> +		if (idle) {
> +			if (err)
> +				ring->hangcheck_count++;
> +			else
> +				ring->hangcheck_count = 0;
> +		} else {
> +			busy_count++;
>  
> -			goto repeat;
> +			if (ring->hangcheck_seqno == seqno) {
> +				ring->hangcheck_count++;
> +
> +				/* If the ring is not waiting, raise the
> +				 * hung score */
> +				if (i915_hangcheck_ring_hung(dev, ring))
> +					ring->hangcheck_count++;
> +			} else {
> +				ring->hangcheck_count = 0;
> +			}
>  		}
>  
> -		dev_priv->gpu_error.hangcheck_count = 0;
> -		return;
> +		ring->hangcheck_seqno = seqno;
>  	}
>  
> -	i915_get_extra_instdone(dev, instdone);
> -	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
> -		   sizeof(acthd)) == 0 &&
> -	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
> -		   sizeof(instdone)) == 0) {
> -		if (i915_hangcheck_hung(dev))
> -			return;
> -	} else {
> -		dev_priv->gpu_error.hangcheck_count = 0;
> -
> -		memcpy(dev_priv->gpu_error.last_acthd, acthd,
> -		       sizeof(acthd));
> -		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
> -		       sizeof(instdone));
> +	ring_hung = false;
> +	for_each_ring(ring, dev_priv, i) {
> +		if (ring->hangcheck_count > 2) {
> +			ring_hung = true;
> +			DRM_ERROR("%s seems hung\n", ring->name);
> +		}
>  	}
>  
> -repeat:
> +	if (ring_hung)
> +		return i915_handle_error(dev, true);
> +
>  	/* Reset timer case chip hangs without another request being added */
> -	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> +	if  (busy_count)
> +		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> +			  round_jiffies_up(jiffies +
> +					   DRM_I915_HANGCHECK_JIFFIES));
>  }
>  
>  /* drm_dma.h hooks
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index d66208c..7257252 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -137,6 +137,10 @@ struct  intel_ring_buffer {
>  	struct i915_hw_context *default_context;
>  	struct drm_i915_gem_object *last_context_obj;
>  
> +	int hangcheck_count;
> +	u32 hangcheck_seqno;
> +	bool hangcheck_waiting;
> +
>  	void *private;
>  };
>  

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 2/7] drm/i915: add struct i915_reset_stats
  2013-02-04 14:04 ` [RFC] [PATCH 2/7] drm/i915: add struct i915_reset_stats Mika Kuoppala
@ 2013-02-15  6:21   ` Ben Widawsky
  0 siblings, 0 replies; 18+ messages in thread
From: Ben Widawsky @ 2013-02-15  6:21 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Mon, Feb 04, 2013 at 04:04:38PM +0200, Mika Kuoppala wrote:
> To count context losses, add struct i915_reset stats for
> both i915_hw_context and drm_i915_file_private.
> drm_i915_file_private is used when there is no context.
> 
> Use kzalloc when allocating drm_i915_file_private to
> initialize stats
> 

I don't see anything functionally wrong with this idea. I'd envisioned
it a bit differently though. I was thinking file_priv can keep a list of
contexts, and each context holds how many resets it was responsible for.
When you query the ioctl, you walk the list and tally up the stats.

> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |    2 +-
>  drivers/gpu/drm/i915/i915_drv.h |    8 ++++++++
>  2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index cf06103..82732ee 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1773,7 +1773,7 @@ int i915_driver_open(struct drm_device *dev, struct drm_file *file)
>  	struct drm_i915_file_private *file_priv;
>  
>  	DRM_DEBUG_DRIVER("\n");
> -	file_priv = kmalloc(sizeof(*file_priv), GFP_KERNEL);
> +	file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
>  	if (!file_priv)
>  		return -ENOMEM;
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4daab74..b87240f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -429,6 +429,11 @@ struct i915_hw_ppgtt {
>  	void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
>  };
>  
> +struct i915_reset_stats {
> +	u32 total;
> +	u32 innocent;
> +	u32 guilty;
> +};
>  
>  /* This must match up with the value previously used for execbuf2.rsvd1. */
>  #define DEFAULT_CONTEXT_ID 0
> @@ -438,6 +443,7 @@ struct i915_hw_context {
>  	struct drm_i915_file_private *file_priv;
>  	struct intel_ring_buffer *ring;
>  	struct drm_i915_gem_object *obj;
> +	struct i915_reset_stats reset_stats;
>  };
>  
>  enum no_fbc_reason {
> @@ -1251,6 +1257,8 @@ struct drm_i915_file_private {
>  		struct list_head request_list;
>  	} mm;
>  	struct idr context_idr;
> +
> +	struct i915_reset_stats reset_stats;
>  };
>  
>  #define INTEL_INFO(dev)	(((struct drm_i915_private *) (dev)->dev_private)->info)
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check
  2013-02-15  6:14   ` Ben Widawsky
@ 2013-02-15  9:49     ` Daniel Vetter
  2013-02-15 14:48     ` Mika Kuoppala
  1 sibling, 0 replies; 18+ messages in thread
From: Daniel Vetter @ 2013-02-15  9:49 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Thu, Feb 14, 2013 at 10:14:56PM -0800, Ben Widawsky wrote:
> Mostly just me talking outloud...
> 
> On Mon, Feb 04, 2013 at 04:04:37PM +0200, Mika Kuoppala wrote:
> > If there was a batch chaining loop or infinite loop in the batchbuffer,
> > we didn't detect it as acthd and instdone kept changing in those cases
> > and hang was never declared.
> 
> I don't think we can infinite loop on any platforms which exists today.
> Can we? I agree chaining issues can occur, but it seems like it'd be
> extremely rare. Furthermore if this did indeed occur, almost certainly
> we'd fill up the ring and fail in some other way anyway.

Infinitely looping chained batches was the simplest trick we could come up
with to get a hang in a batch. And we need such a thing since the current
i-g-t hangman always hangs in the ring. Better ideas highly welcome for
how this could be reliably tested without increased risks of killing the
gpu for good ...

Hm, that reminds me that this _will_ kill the gpu on gen2/3 where we don't
have reset support.
-Daniel

> 
> > 
> > To detect ring hangs, including infinite loops, keep track of ring
> > seqno progression.
> 
> I'm having a lot of trouble reviewing this patch, which leads me to
> believe it should probably be split up a bit more, or I'm stupid
> (both?).
> 
> It sounds like a good idea but there are a lot of pitfalls involved.
> 
> > 
> > Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         |    3 -
> >  drivers/gpu/drm/i915/i915_irq.c         |  127 +++++++++++++++----------------
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |    4 +
> >  3 files changed, 67 insertions(+), 67 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 984523d..4daab74 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -791,9 +791,6 @@ struct i915_gpu_error {
> >  #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
> >  #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
> >  	struct timer_list hangcheck_timer;
> > -	int hangcheck_count;
> > -	uint32_t last_acthd[I915_NUM_RINGS];
> > -	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
> >  
> >  	/* For reset and error_state handling. */
> >  	spinlock_t lock;
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index 5648d84..d0a9e21 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -356,7 +356,7 @@ static void notify_ring(struct drm_device *dev,
> >  
> >  	wake_up_all(&ring->irq_queue);
> >  	if (i915_enable_hangcheck) {
> > -		dev_priv->gpu_error.hangcheck_count = 0;
> > +		ring->hangcheck_count = 0;
> >  		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> >  			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> >  	}
> > @@ -1727,11 +1727,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
> >  			  struct drm_i915_gem_request, list)->seqno;
> >  }
> >  
> > -static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, bool *err)
> > +static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
> > +				     u32 ring_seqno, bool *err)
> >  {
> >  	if (list_empty(&ring->request_list) ||
> > -	    i915_seqno_passed(ring->get_seqno(ring, false),
> > -			      ring_last_seqno(ring))) {
> > +	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
> >  		/* Issue a wake-up to catch stuck h/w. */
> >  		if (waitqueue_active(&ring->irq_queue)) {
> >  			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> > @@ -1749,39 +1749,32 @@ static bool kick_ring(struct intel_ring_buffer *ring)
> >  	struct drm_device *dev = ring->dev;
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	u32 tmp = I915_READ_CTL(ring);
> > +
> > +	ring->hangcheck_waiting = false;
> > +
> >  	if (tmp & RING_WAIT) {
> >  		DRM_ERROR("Kicking stuck wait on %s\n",
> >  			  ring->name);
> >  		I915_WRITE_CTL(ring, tmp);
> > -		return true;
> > +		ring->hangcheck_waiting = true;
> >  	}
> > -	return false;
> > -}
> > -
> > -static bool i915_hangcheck_hung(struct drm_device *dev)
> > -{
> > -	drm_i915_private_t *dev_priv = dev->dev_private;
> >  
> > -	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
> > -		bool hung = true;
> > +	if ((INTEL_INFO(dev)->gen >= 6) && (tmp & RING_WAIT_SEMAPHORE))
> > +		ring->hangcheck_waiting = true;
> >  
> > -		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
> > -		i915_handle_error(dev, true);
> > -
> > -		if (!IS_GEN2(dev)) {
> > -			struct intel_ring_buffer *ring;
> > -			int i;
> > -
> > -			/* Is the chip hanging on a WAIT_FOR_EVENT?
> > -			 * If so we can simply poke the RB_WAIT bit
> > -			 * and break the hang. This should work on
> > -			 * all but the second generation chipsets.
> > -			 */
> > -			for_each_ring(ring, dev_priv, i)
> > -				hung &= !kick_ring(ring);
> > -		}
> > +	return ring->hangcheck_waiting;
> > +}
> >  
> > -		return hung;
> > +static bool i915_hangcheck_ring_hung(struct drm_device *dev,
> > +				     struct intel_ring_buffer *ring)
> > +{
> > +	if (!IS_GEN2(dev)) {
> > +		/* Is the chip hanging on a WAIT_FOR_EVENT?
> > +		 * If so we can simply poke the RB_WAIT bit
> > +		 * and break the hang. This should work on
> > +		 * all but the second generation chipsets.
> > +		 */
> > +		return !kick_ring(ring);
> >  	}
> >  
> >  	return false;
> > @@ -1789,62 +1782,68 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
> >  
> >  /**
> >   * This is called when the chip hasn't reported back with completed
> > - * batchbuffers in a long time. The first time this is called we simply record
> > - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
> > - * again, we assume the chip is wedged and try to fix it.
> > + * batchbuffers in a long time. We record current seqno for each count and
> > + * in subsequent calls we check if requests have been processed by each ring.
> > + * If there is no progress on specific ring, we declare it as hung.
> >   */
> >  void i915_hangcheck_elapsed(unsigned long data)
> >  {
> >  	struct drm_device *dev = (struct drm_device *)data;
> >  	drm_i915_private_t *dev_priv = dev->dev_private;
> > -	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
> >  	struct intel_ring_buffer *ring;
> > -	bool err = false, idle;
> > +	bool ring_hung;
> >  	int i;
> > +	int busy_count = 0;
> >  
> >  	if (!i915_enable_hangcheck)
> >  		return;
> >  
> > -	memset(acthd, 0, sizeof(acthd));
> > -	idle = true;
> >  	for_each_ring(ring, dev_priv, i) {
> > -	    idle &= i915_hangcheck_ring_idle(ring, &err);
> > -	    acthd[i] = intel_ring_get_active_head(ring);
> > -	}
> > +		bool err = false, idle;
> > +		u32 seqno;
> >  
> > -	/* If all work is done then ACTHD clearly hasn't advanced. */
> > -	if (idle) {
> > -		if (err) {
> > -			if (i915_hangcheck_hung(dev))
> > -				return;
> > +		seqno = ring->get_seqno(ring, false);
> > +		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
> > +
> > +		if (idle) {
> > +			if (err)
> > +				ring->hangcheck_count++;
> > +			else
> > +				ring->hangcheck_count = 0;
> > +		} else {
> > +			busy_count++;
> >  
> > -			goto repeat;
> > +			if (ring->hangcheck_seqno == seqno) {
> > +				ring->hangcheck_count++;
> > +
> > +				/* If the ring is not waiting, raise the
> > +				 * hung score */
> > +				if (i915_hangcheck_ring_hung(dev, ring))
> > +					ring->hangcheck_count++;
> > +			} else {
> > +				ring->hangcheck_count = 0;
> > +			}
> >  		}
> >  
> > -		dev_priv->gpu_error.hangcheck_count = 0;
> > -		return;
> > +		ring->hangcheck_seqno = seqno;
> >  	}
> >  
> > -	i915_get_extra_instdone(dev, instdone);
> > -	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
> > -		   sizeof(acthd)) == 0 &&
> > -	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
> > -		   sizeof(instdone)) == 0) {
> > -		if (i915_hangcheck_hung(dev))
> > -			return;
> > -	} else {
> > -		dev_priv->gpu_error.hangcheck_count = 0;
> > -
> > -		memcpy(dev_priv->gpu_error.last_acthd, acthd,
> > -		       sizeof(acthd));
> > -		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
> > -		       sizeof(instdone));
> > +	ring_hung = false;
> > +	for_each_ring(ring, dev_priv, i) {
> > +		if (ring->hangcheck_count > 2) {
> > +			ring_hung = true;
> > +			DRM_ERROR("%s seems hung\n", ring->name);
> > +		}
> >  	}
> >  
> > -repeat:
> > +	if (ring_hung)
> > +		return i915_handle_error(dev, true);
> > +
> >  	/* Reset timer case chip hangs without another request being added */
> > -	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> > -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> > +	if  (busy_count)
> > +		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> > +			  round_jiffies_up(jiffies +
> > +					   DRM_I915_HANGCHECK_JIFFIES));
> >  }
> >  
> >  /* drm_dma.h hooks
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > index d66208c..7257252 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > @@ -137,6 +137,10 @@ struct  intel_ring_buffer {
> >  	struct i915_hw_context *default_context;
> >  	struct drm_i915_gem_object *last_context_obj;
> >  
> > +	int hangcheck_count;
> > +	u32 hangcheck_seqno;
> > +	bool hangcheck_waiting;
> > +
> >  	void *private;
> >  };
> >  
> 
> -- 
> Ben Widawsky, Intel Open Source Technology Center
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets
  2013-02-07 14:11   ` Ville Syrjälä
@ 2013-02-15 14:12     ` Mika Kuoppala
  2013-02-15 14:55       ` Ville Syrjälä
  0 siblings, 1 reply; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-15 14:12 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: intel-gfx

Ville Syrjälä <ville.syrjala@linux.intel.com> writes:

> On Mon, Feb 04, 2013 at 04:04:43PM +0200, Mika Kuoppala wrote:
>> After hang check timer has declared gpu to be hang,
>> rings are reset. In ring reset, when clearing
>> request list, do post mortem analysis to find out
>> the guilty batch buffer.
>> 
>> Select requests for further analysis by inspecting
>> the completed sequence number which has been updated
>> into the HWS page. If request was completed, it can't
>> be related to the hang.
>> 
>> For completed requests mark the batch as guilty
>       ^^^^^^^^^
>
> That's a typo, right?

It sure is. Will fix.

>> if the ring was not waiting and the ring head was
>> stuck inside the buffer object or in the flush region
>> right after the batch. For everything else, mark
>> them as innocents.
>> 
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_gem.c |   91 +++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 91 insertions(+)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index b304b06..db0f3e3 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2092,9 +2092,97 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>>  	spin_unlock(&file_priv->mm.lock);
>>  }
>>  
>> +static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj)
>> +{
>> +	if (acthd >= obj->gtt_offset &&
>> +	    acthd < obj->gtt_offset + obj->base.size)
>> +		return true;
>> +
>> +	return false;
>> +}
>> +
>> +static bool i915_head_inside_request(u32 acthd, u32 rs, u32 re)
>> +{
>> +	if (rs < re) {
>> +		if (acthd >= rs && acthd < re)
>> +			return true;
>> +	} else if (rs > re) {
>> +		if (acthd >= rs || acthd < re)
>> +			return true;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>> +static bool i915_request_guilty(struct drm_i915_gem_request *request,
>> +				const u32 acthd, bool *inside)
>> +{
>> +	if (request->batch_obj) {
>> +		if (i915_head_inside_object(acthd, request->batch_obj)) {
>> +			*inside = true;
>> +			return true;
>> +		}
>> +	}
>> +
>> +	if (i915_head_inside_request(acthd, request->head, request->tail)) {
>> +		*inside = false;
>> +		return true;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>> +static void i915_set_reset_status(struct intel_ring_buffer *ring,
>> +				  struct drm_i915_gem_request *request,
>> +				  u32 acthd)
>> +{
>> +	bool inside;
>> +	struct i915_reset_stats *rs = NULL;
>> +	bool guilty;
>> +
>> +	/* Innocent until proven guilty */
>> +	guilty = false;
>> +
>> +	if (!ring->hangcheck_waiting &&
>> +	    i915_request_guilty(request, acthd, &inside)) {
>> +		DRM_ERROR("%s hung %s bo (0x%x ctx %d) at 0x%x\n",
>> +			  ring->name,
>> +			  inside ? "inside" : "flushing",
>> +			  request->batch_obj ?
>> +			  request->batch_obj->gtt_offset : 0,
>> +			  request->ctx ? request->ctx->id : 0,
>> +			  acthd);
>> +
>> +		guilty = true;
>> +	}
>> +
>> +	/* If contexts are disabled or this is the default context, use
>> +	 * file_priv->reset_stats
>> +	 */
>> +	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
>> +		rs = &request->ctx->reset_stats;
>> +	else if (request->file_priv)
>> +		rs = &request->file_priv->reset_stats;
>> +
>> +	if (rs) {
>> +		rs->total++;
>> +
>> +		if (guilty)
>> +			rs->guilty++;
>> +		else
>> +			rs->innocent++;
>> +	}
>> +}
>> +
>>  static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
>>  				      struct intel_ring_buffer *ring)
>>  {
>> +	u32 completed_seqno;
>> +	u32 acthd;
>> +
>> +	acthd = intel_ring_get_active_head(ring);
>> +	completed_seqno = ring->get_seqno(ring, false);
>> +
>>  	while (!list_empty(&ring->request_list)) {
>>  		struct drm_i915_gem_request *request;
>>  
>> @@ -2102,6 +2190,9 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
>>  					   struct drm_i915_gem_request,
>>  					   list);
>>  
>> +		if (request->seqno > completed_seqno)
>
> i915_seqno_passed()?

For readability or for correctness?

When seqno wraps, the request queue will be cleaned up so
we can't have cross wrap boundary stuff in here.

Or did you have something else in mind that i have missed.

Thanks for review,
-Mika

>
>> +			i915_set_reset_status(ring, request, acthd);
>> +
>>  		list_del(&request->list);
>>  		i915_gem_request_remove_from_client(request);
>>  
>> -- 
>> 1.7.9.5
>> 
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> -- 
> Ville Syrjälä
> Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check
  2013-02-15  6:14   ` Ben Widawsky
  2013-02-15  9:49     ` Daniel Vetter
@ 2013-02-15 14:48     ` Mika Kuoppala
  1 sibling, 0 replies; 18+ messages in thread
From: Mika Kuoppala @ 2013-02-15 14:48 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

Ben Widawsky <ben@bwidawsk.net> writes:

> Mostly just me talking outloud...

Much appreciated!

> On Mon, Feb 04, 2013 at 04:04:37PM +0200, Mika Kuoppala wrote:
>> If there was a batch chaining loop or infinite loop in the batchbuffer,
>> we didn't detect it as acthd and instdone kept changing in those cases
>> and hang was never declared.
>
> I don't think we can infinite loop on any platforms which exists today.
> Can we? I agree chaining issues can occur, but it seems like it'd be
> extremely rare. Furthermore if this did indeed occur, almost certainly
> we'd fill up the ring and fail in some other way anyway.

I received a claim, which I haven't verified, that oglconform 
would have a test with vertex shader loop that manages to bring this
up.

Well I suspect that we have some other problems in recovery for that
test than the actual loop.

>> 
>> To detect ring hangs, including infinite loops, keep track of ring
>> seqno progression.
>
> I'm having a lot of trouble reviewing this patch, which leads me to
> believe it should probably be split up a bit more, or I'm stupid
> (both?).

It was poor judgement form my part to mail this patch. It is horrible.
It is not wise to throw garbage at people if you want to discuss.
I have split this up to smaller parts, will post after i get the
commit messages in shape.

> It sounds like a good idea but there are a lot of pitfalls involved.

Idea of tracking the ring progression came as the remedy to the verter
shader loop test problem in oglconform. As the loop keeps executing
the achtd kept changing and no hang was declared. Perhaps hang would
happened eventually if the ring would have filled up? But it would
happen only after lots of batch buffer submissions.

-Mika

>> 
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h         |    3 -
>>  drivers/gpu/drm/i915/i915_irq.c         |  127 +++++++++++++++----------------
>>  drivers/gpu/drm/i915/intel_ringbuffer.h |    4 +
>>  3 files changed, 67 insertions(+), 67 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 984523d..4daab74 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -791,9 +791,6 @@ struct i915_gpu_error {
>>  #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
>>  #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
>>  	struct timer_list hangcheck_timer;
>> -	int hangcheck_count;
>> -	uint32_t last_acthd[I915_NUM_RINGS];
>> -	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
>>  
>>  	/* For reset and error_state handling. */
>>  	spinlock_t lock;
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index 5648d84..d0a9e21 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -356,7 +356,7 @@ static void notify_ring(struct drm_device *dev,
>>  
>>  	wake_up_all(&ring->irq_queue);
>>  	if (i915_enable_hangcheck) {
>> -		dev_priv->gpu_error.hangcheck_count = 0;
>> +		ring->hangcheck_count = 0;
>>  		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>>  			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
>>  	}
>> @@ -1727,11 +1727,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
>>  			  struct drm_i915_gem_request, list)->seqno;
>>  }
>>  
>> -static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, bool *err)
>> +static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
>> +				     u32 ring_seqno, bool *err)
>>  {
>>  	if (list_empty(&ring->request_list) ||
>> -	    i915_seqno_passed(ring->get_seqno(ring, false),
>> -			      ring_last_seqno(ring))) {
>> +	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
>>  		/* Issue a wake-up to catch stuck h/w. */
>>  		if (waitqueue_active(&ring->irq_queue)) {
>>  			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
>> @@ -1749,39 +1749,32 @@ static bool kick_ring(struct intel_ring_buffer *ring)
>>  	struct drm_device *dev = ring->dev;
>>  	struct drm_i915_private *dev_priv = dev->dev_private;
>>  	u32 tmp = I915_READ_CTL(ring);
>> +
>> +	ring->hangcheck_waiting = false;
>> +
>>  	if (tmp & RING_WAIT) {
>>  		DRM_ERROR("Kicking stuck wait on %s\n",
>>  			  ring->name);
>>  		I915_WRITE_CTL(ring, tmp);
>> -		return true;
>> +		ring->hangcheck_waiting = true;
>>  	}
>> -	return false;
>> -}
>> -
>> -static bool i915_hangcheck_hung(struct drm_device *dev)
>> -{
>> -	drm_i915_private_t *dev_priv = dev->dev_private;
>>  
>> -	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
>> -		bool hung = true;
>> +	if ((INTEL_INFO(dev)->gen >= 6) && (tmp & RING_WAIT_SEMAPHORE))
>> +		ring->hangcheck_waiting = true;
>>  
>> -		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
>> -		i915_handle_error(dev, true);
>> -
>> -		if (!IS_GEN2(dev)) {
>> -			struct intel_ring_buffer *ring;
>> -			int i;
>> -
>> -			/* Is the chip hanging on a WAIT_FOR_EVENT?
>> -			 * If so we can simply poke the RB_WAIT bit
>> -			 * and break the hang. This should work on
>> -			 * all but the second generation chipsets.
>> -			 */
>> -			for_each_ring(ring, dev_priv, i)
>> -				hung &= !kick_ring(ring);
>> -		}
>> +	return ring->hangcheck_waiting;
>> +}
>>  
>> -		return hung;
>> +static bool i915_hangcheck_ring_hung(struct drm_device *dev,
>> +				     struct intel_ring_buffer *ring)
>> +{
>> +	if (!IS_GEN2(dev)) {
>> +		/* Is the chip hanging on a WAIT_FOR_EVENT?
>> +		 * If so we can simply poke the RB_WAIT bit
>> +		 * and break the hang. This should work on
>> +		 * all but the second generation chipsets.
>> +		 */
>> +		return !kick_ring(ring);
>>  	}
>>  
>>  	return false;
>> @@ -1789,62 +1782,68 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
>>  
>>  /**
>>   * This is called when the chip hasn't reported back with completed
>> - * batchbuffers in a long time. The first time this is called we simply record
>> - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
>> - * again, we assume the chip is wedged and try to fix it.
>> + * batchbuffers in a long time. We record current seqno for each count and
>> + * in subsequent calls we check if requests have been processed by each ring.
>> + * If there is no progress on specific ring, we declare it as hung.
>>   */
>>  void i915_hangcheck_elapsed(unsigned long data)
>>  {
>>  	struct drm_device *dev = (struct drm_device *)data;
>>  	drm_i915_private_t *dev_priv = dev->dev_private;
>> -	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
>>  	struct intel_ring_buffer *ring;
>> -	bool err = false, idle;
>> +	bool ring_hung;
>>  	int i;
>> +	int busy_count = 0;
>>  
>>  	if (!i915_enable_hangcheck)
>>  		return;
>>  
>> -	memset(acthd, 0, sizeof(acthd));
>> -	idle = true;
>>  	for_each_ring(ring, dev_priv, i) {
>> -	    idle &= i915_hangcheck_ring_idle(ring, &err);
>> -	    acthd[i] = intel_ring_get_active_head(ring);
>> -	}
>> +		bool err = false, idle;
>> +		u32 seqno;
>>  
>> -	/* If all work is done then ACTHD clearly hasn't advanced. */
>> -	if (idle) {
>> -		if (err) {
>> -			if (i915_hangcheck_hung(dev))
>> -				return;
>> +		seqno = ring->get_seqno(ring, false);
>> +		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
>> +
>> +		if (idle) {
>> +			if (err)
>> +				ring->hangcheck_count++;
>> +			else
>> +				ring->hangcheck_count = 0;
>> +		} else {
>> +			busy_count++;
>>  
>> -			goto repeat;
>> +			if (ring->hangcheck_seqno == seqno) {
>> +				ring->hangcheck_count++;
>> +
>> +				/* If the ring is not waiting, raise the
>> +				 * hung score */
>> +				if (i915_hangcheck_ring_hung(dev, ring))
>> +					ring->hangcheck_count++;
>> +			} else {
>> +				ring->hangcheck_count = 0;
>> +			}
>>  		}
>>  
>> -		dev_priv->gpu_error.hangcheck_count = 0;
>> -		return;
>> +		ring->hangcheck_seqno = seqno;
>>  	}
>>  
>> -	i915_get_extra_instdone(dev, instdone);
>> -	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
>> -		   sizeof(acthd)) == 0 &&
>> -	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
>> -		   sizeof(instdone)) == 0) {
>> -		if (i915_hangcheck_hung(dev))
>> -			return;
>> -	} else {
>> -		dev_priv->gpu_error.hangcheck_count = 0;
>> -
>> -		memcpy(dev_priv->gpu_error.last_acthd, acthd,
>> -		       sizeof(acthd));
>> -		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
>> -		       sizeof(instdone));
>> +	ring_hung = false;
>> +	for_each_ring(ring, dev_priv, i) {
>> +		if (ring->hangcheck_count > 2) {
>> +			ring_hung = true;
>> +			DRM_ERROR("%s seems hung\n", ring->name);
>> +		}
>>  	}
>>  
>> -repeat:
>> +	if (ring_hung)
>> +		return i915_handle_error(dev, true);
>> +
>>  	/* Reset timer case chip hangs without another request being added */
>> -	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
>> +	if  (busy_count)
>> +		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>> +			  round_jiffies_up(jiffies +
>> +					   DRM_I915_HANGCHECK_JIFFIES));
>>  }
>>  
>>  /* drm_dma.h hooks
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index d66208c..7257252 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -137,6 +137,10 @@ struct  intel_ring_buffer {
>>  	struct i915_hw_context *default_context;
>>  	struct drm_i915_gem_object *last_context_obj;
>>  
>> +	int hangcheck_count;
>> +	u32 hangcheck_seqno;
>> +	bool hangcheck_waiting;
>> +
>>  	void *private;
>>  };
>>  
>
> -- 
> Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets
  2013-02-15 14:12     ` Mika Kuoppala
@ 2013-02-15 14:55       ` Ville Syrjälä
  0 siblings, 0 replies; 18+ messages in thread
From: Ville Syrjälä @ 2013-02-15 14:55 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Fri, Feb 15, 2013 at 04:12:16PM +0200, Mika Kuoppala wrote:
> Ville Syrjälä <ville.syrjala@linux.intel.com> writes:
> 
> > On Mon, Feb 04, 2013 at 04:04:43PM +0200, Mika Kuoppala wrote:
> >> After hang check timer has declared gpu to be hang,
> >> rings are reset. In ring reset, when clearing
> >> request list, do post mortem analysis to find out
> >> the guilty batch buffer.
> >> 
> >> Select requests for further analysis by inspecting
> >> the completed sequence number which has been updated
> >> into the HWS page. If request was completed, it can't
> >> be related to the hang.
> >> 
> >> For completed requests mark the batch as guilty
> >       ^^^^^^^^^
> >
> > That's a typo, right?
> 
> It sure is. Will fix.
> 
> >> if the ring was not waiting and the ring head was
> >> stuck inside the buffer object or in the flush region
> >> right after the batch. For everything else, mark
> >> them as innocents.
> >> 
> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/i915_gem.c |   91 +++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 91 insertions(+)
> >> 
> >> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >> index b304b06..db0f3e3 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >> @@ -2092,9 +2092,97 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
> >>  	spin_unlock(&file_priv->mm.lock);
> >>  }
> >>  
> >> +static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj)
> >> +{
> >> +	if (acthd >= obj->gtt_offset &&
> >> +	    acthd < obj->gtt_offset + obj->base.size)
> >> +		return true;
> >> +
> >> +	return false;
> >> +}
> >> +
> >> +static bool i915_head_inside_request(u32 acthd, u32 rs, u32 re)
> >> +{
> >> +	if (rs < re) {
> >> +		if (acthd >= rs && acthd < re)
> >> +			return true;
> >> +	} else if (rs > re) {
> >> +		if (acthd >= rs || acthd < re)
> >> +			return true;
> >> +	}
> >> +
> >> +	return false;
> >> +}
> >> +
> >> +static bool i915_request_guilty(struct drm_i915_gem_request *request,
> >> +				const u32 acthd, bool *inside)
> >> +{
> >> +	if (request->batch_obj) {
> >> +		if (i915_head_inside_object(acthd, request->batch_obj)) {
> >> +			*inside = true;
> >> +			return true;
> >> +		}
> >> +	}
> >> +
> >> +	if (i915_head_inside_request(acthd, request->head, request->tail)) {
> >> +		*inside = false;
> >> +		return true;
> >> +	}
> >> +
> >> +	return false;
> >> +}
> >> +
> >> +static void i915_set_reset_status(struct intel_ring_buffer *ring,
> >> +				  struct drm_i915_gem_request *request,
> >> +				  u32 acthd)
> >> +{
> >> +	bool inside;
> >> +	struct i915_reset_stats *rs = NULL;
> >> +	bool guilty;
> >> +
> >> +	/* Innocent until proven guilty */
> >> +	guilty = false;
> >> +
> >> +	if (!ring->hangcheck_waiting &&
> >> +	    i915_request_guilty(request, acthd, &inside)) {
> >> +		DRM_ERROR("%s hung %s bo (0x%x ctx %d) at 0x%x\n",
> >> +			  ring->name,
> >> +			  inside ? "inside" : "flushing",
> >> +			  request->batch_obj ?
> >> +			  request->batch_obj->gtt_offset : 0,
> >> +			  request->ctx ? request->ctx->id : 0,
> >> +			  acthd);
> >> +
> >> +		guilty = true;
> >> +	}
> >> +
> >> +	/* If contexts are disabled or this is the default context, use
> >> +	 * file_priv->reset_stats
> >> +	 */
> >> +	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
> >> +		rs = &request->ctx->reset_stats;
> >> +	else if (request->file_priv)
> >> +		rs = &request->file_priv->reset_stats;
> >> +
> >> +	if (rs) {
> >> +		rs->total++;
> >> +
> >> +		if (guilty)
> >> +			rs->guilty++;
> >> +		else
> >> +			rs->innocent++;
> >> +	}
> >> +}
> >> +
> >>  static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
> >>  				      struct intel_ring_buffer *ring)
> >>  {
> >> +	u32 completed_seqno;
> >> +	u32 acthd;
> >> +
> >> +	acthd = intel_ring_get_active_head(ring);
> >> +	completed_seqno = ring->get_seqno(ring, false);
> >> +
> >>  	while (!list_empty(&ring->request_list)) {
> >>  		struct drm_i915_gem_request *request;
> >>  
> >> @@ -2102,6 +2190,9 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
> >>  					   struct drm_i915_gem_request,
> >>  					   list);
> >>  
> >> +		if (request->seqno > completed_seqno)
> >
> > i915_seqno_passed()?
> 
> For readability or for correctness?
> 
> When seqno wraps, the request queue will be cleaned up so
> we can't have cross wrap boundary stuff in here.
> 
> Or did you have something else in mind that i have missed.

Nah. It just seems suspicious to have a direct comparison with any
comment why i915_seqno_passed() isn't used.

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2013-02-15 14:55 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check Mika Kuoppala
2013-02-15  6:14   ` Ben Widawsky
2013-02-15  9:49     ` Daniel Vetter
2013-02-15 14:48     ` Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 2/7] drm/i915: add struct i915_reset_stats Mika Kuoppala
2013-02-15  6:21   ` Ben Widawsky
2013-02-04 14:04 ` [RFC] [PATCH 3/7] drm/i915: add reset_status for hw_contexts Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 4/7] drm/i915: add i915_get_reset_status_ioctl Mika Kuoppala
2013-02-05 10:47   ` [PATCH 4/7] drm/i915: add i915_get_reset_stats_ioctl Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 5/7] drm/i915: add batch object and context to i915_add_request() Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 6/7] drm/i915: reference count for i915_hw_contexts Mika Kuoppala
2013-02-15  5:55   ` Ben Widawsky
2013-02-04 14:04 ` [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets Mika Kuoppala
2013-02-07 14:11   ` Ville Syrjälä
2013-02-15 14:12     ` Mika Kuoppala
2013-02-15 14:55       ` Ville Syrjälä
2013-02-08 10:35 ` [RFC] [PATCH 0/7] arb robustness enablers Daniel Vetter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.